In [70]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd

# We are also going to do some basic viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import pickle

In [2]:
sns.set_context('notebook')
sns.set_style('whitegrid')

In [31]:
with open('data/df_full.pickle', 'rb') as f:
    df = pickle.load(f)

## Split data into train and test

In [27]:
len(df)

11826

In [11]:
import sklearn
from sklearn.model_selection import train_test_split

In [28]:
df_train, df_test = train_test_split(df, test_size=.20, random_state=42)

In [None]:
df_train.corr()

In [None]:
df_train.info()

In [73]:
X_train = df_train.iloc[:,4:9]
y_train = df_train['category']
X_test = df_test.iloc[:,4:9]
y_test = df_test['category']

In [None]:
X_train.head()

In [None]:
plt.hist(y_train)

In [14]:
len(y_train)

9460

In [None]:
#note that histogram of this indicates that there is an imbalanced problem

## Oversampling

In [110]:
import imblearn.over_sampling

# setup for the ratio argument of RandomOverSampler initialization
Obese = np.sum(y_train == 'Obese')
Average = np.sum(y_train == 'Average')
Fitness = np.sum(y_train == 'Fitness')
Athlete = np.sum(y_train == 'Athlete')
ratio = {'Obese': Obese, 'Average': Average*2, 'Fitness': Fitness*5, 'Athlete': Athlete*10} 


In [111]:
#Sample code for consideration for oversampling
# randomly oversample positive samples:
ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 
    
# X_tr_rs, y_tr_rs = ROS.fit_resample(df_train,y_train)

## Scoring

## Functions (Model Selection Pipeline)
Things to address in pipeline:

Class Imbalance (sampling metrics)\
Kfold cross validation (small dataset) \
Modeling (parameter tuning, class weights for those it applies to) \
Metrics selected above (F1) \
ROC Curve

# Modeling

In [79]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from operator import itemgetter
from sklearn import tree
from sklearn.utils import check_random_state
import numpy as np
from sklearn.model_selection import ParameterGrid

In [None]:
dum = DummyClassifier(strategy='prior')
dum_dum = dum.fit(X_train,y_train) 

In [None]:
dum_dum.predict_proba(X_test)

In [None]:
metrics.precision_score(y_train, dum_dum.predict(y_train), average = 'macro')
#note that micro will simply calculate true positives / (TP+FP), in multiclass this equals TP/all guesses = 74%
#macro will calculate TP/(TP+FP) for each class, then average across. In naive case, this is 74%*0.25 = 0.185 because obese is one of 4 classes 
#weighted this is 74%* 74% (precision for obese*obese percentage) + 0%*19% (percentage average) + 0*5%(percent fit) +0*2% (percent athlete) because obese is weighted

In [None]:
metrics.precision_score(y_train, dum_dum.predict(X_train), average = 'macro', zero_division = 0)

In [None]:
import graphviz

In [None]:
#Process
#For each algorithm - Decision Tree, KNeighbors, Logistic Regression, XGBoost -:
    
#    Find the best parameters for each algorithm

In [20]:
from sklearn.metrics import make_scorer

In [21]:
my_scorer = make_scorer(f1_score,average='macro',zero_division=0)

### Decision Tree

In [8]:
from sklearn.model_selection import StratifiedKFold

In [9]:
kf = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)


In [15]:
kf.split(X_train,y_train)

<generator object _BaseKFold.split at 0x7f975b79f0b0>

In [74]:
DTCparameters = {'max_depth':range(2,11)}

In [112]:
SamplingGridSearchCV(ROS, DecisionTreeClassifier(), X_train, y_train, DTCparameters, folds=5)



(DecisionTreeClassifier(max_depth=10),
 0.2573305586457212,
 {'max_depth': 10},
 array([0.2111274 , 0.2240519 , 0.23459968, 0.24601399, 0.27530689,
        0.30488288, 0.35595166, 0.40459926, 0.44712441]),
 array([0.21112739, 0.22175882, 0.22501979, 0.22865624, 0.23206115,
        0.24102499, 0.25215568, 0.2563504 , 0.25733056]))

In [91]:
#Create KFOLD GRIDSEARCH WITH SAMPLING TECHNIQUES.
#return best model based on F1 score, average = "macro"


#helper function
def model_scores(scores):
    '''
    Input: Scores within a list (of folds) containing lists (of scores by models with diff params)
    Output: (1) array with all scores, (2) array with average scores for each fold
    '''
    flat_list = []
    for fold in scores:
        for model in fold:
            flat_list.append(model)
    X = np.array(flat_list).reshape(len(scores),len(scores[0]))
    return X, X.mean(axis = 0)
    
#Main function    
def SamplingGridSearchCV(resampler, model, X_train, y_train, param_grid, folds=5):
        
    kf = StratifiedKFold(n_splits=folds, shuffle = True, random_state=42)

    # Lists to hold results
    parameters = []
    train_scores = []
    val_scores = []

    # Get indices for split
    X_train = np.array(X_train)
    y_train = np.array(y_train)

    #create folds with resampling
    for indices in kf.split(X_train, y_train):
        train_ind = indices[0]
        val_ind = indices[1]
        X_tr, y_tr = X_train[train_ind], y_train[train_ind]
        X_resampled_train, y_resampled_train = resampler.fit_sample(X_tr,y_tr)
        X_val, y_val = X_train[val_ind], y_train[val_ind]
        X_resampled_val, y_resampled_val = resampler.fit_sample(X_val, y_val)
        
        
        # initialize lists to hold results for each model/parameter
        fold_parameters = []
        fold_train_scores = []
        fold_val_scores = []
        
        #fit model with each parameter and append results to list for current fold
        for g in ParameterGrid(param_grid):
            model.set_params(**g)
            mod = model.fit(X_tr, y_tr)    
            train_score = f1_score(y_tr, model.predict(X_tr), average='macro', zero_division = 0)
            val_score = f1_score(y_val, model.predict(X_val), average='macro', zero_division = 0)
            
            #add results to list 
            fold_parameters.append(g)
            fold_train_scores.append(train_score)
            fold_val_scores.append(val_score)
        
        #append results from the fold into larger list
        parameters.append(fold_parameters)
        train_scores.append(fold_train_scores)
        val_scores.append(fold_val_scores)

    #aggregate all results and prepare outputs
    parameters = parameters[0]
    
    all_train_scores, model_train_scores = model_scores(train_scores)
    all_val_scores, model_val_scores = model_scores(val_scores)
    best_params = parameters[max(enumerate(model_val_scores), key=itemgetter(1))[0]]
    best_model = model.set_params(**best_params)
    best_model_score = max(model_val_scores)
   
    return best_model, best_model_score, best_params, model_train_scores, model_val_scores

In [None]:
def SamplingGridSearchCV(resampler, model, X_train, y_train, param_grid, folds=5)

In [None]:
#Run loop on multiple codes

#get values and create model v model plots


In [None]:
#function to obtain results and plots (based on val data) for selected model with parameters

In [40]:
DTC = DecisionTreeClassifier(class_weight= {'Obese':1,'Average':2,'Fitness':5,'Athlete':10})
parameters = {'max_depth':range(2,11)}
DTC_mod_CV = GridSearchCV(DTC, parameters,scoring=my_scorer)
DTC_fitted = DTC_mod_CV.fit(X_train,y_train)a


In [41]:
print(DTC_fitted.best_params_)
print(DTC_fitted.best_score_)

{'max_depth': 5}
0.3149262913774683


In [None]:
conf = metrics.confusion_matrix(y_train, DTC_fitted.predict(X_train),labels=['Athlete','Fitness','Average','Obese'])
conf = pd.DataFrame(conf, columns=['Athlete','Fitness','Average','Obese'], index = ['Athlete','Fitness','Average','Obese'])
conf

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
test_conf

In [None]:
sns.heatmap(test_conf, annot = True, cmap="Reds")

In [None]:
naive_conf = metrics.confusion_matrix(y_test, dum_dum.predict(X_test),labels=['Athlete','Average','Fitness','Obese'])

In [None]:
naive_conf

### K-nearest Neighbors

In [None]:
KNN = KNeighborsClassifier()
parameters = {'n_neighbors':range(2,11)}
KNN_mod_CV = GridSearchCV(KNN, parameters,scoring=my_scorer)
KNN_fitted = KNN_mod_CV.fit(X_train,y_train)

In [None]:
print(KNN_fitted.best_params_)
print(KNN_fitted.best_score_)

In [None]:
conf = metrics.confusion_matrix(y_train, KNN_fitted.predict(X_train),labels=['Athlete','Fitness','Average','Obese'])
conf = pd.DataFrame(conf, columns=['Athlete','Fitness','Average','Obese'], index = ['Athlete','Fitness','Average','Obese'])
conf

### Logistic Regression

In [None]:
LR = LogisticRegression(class_weight= {'Obese':1,'Average':2,'Fitness':5,'Athlete':10},solver='liblinear')
parameters = {'C':np.arange(.01,1.1,.02)}
LR_mod_CV = GridSearchCV(LR, parameters,scoring=my_scorer)
LR_fitted = LR_mod_CV.fit(X_train,y_train)

In [None]:
print(LR_fitted.best_params_)
print(LR_fitted.best_score_)

In [None]:
conf = metrics.confusion_matrix(y_train, LR_fitted.predict(X_train),labels=['Athlete','Fitness','Average','Obese'])
conf = pd.DataFrame(conf, columns=['Athlete','Fitness','Average','Obese'], index = ['Athlete','Fitness','Average','Obese'])
conf

### Random Forest

In [None]:
RF = RandomForestClassifier(class_weight= {'Obese':1,'Average':2,'Fitness':5,'Athlete':10})
parameters = {'n_estimators':range(20,100,10),'min_samples_split':[2],'criterion':["gini","entropy"]}
RF_mod_CV = GridSearchCV(RF,parameters,scoring=my_scorer)
RF_fitted = RF_mod_CV.fit(X_train,y_train)

In [None]:
print(RF_fitted.best_params_)
print(RF_fitted.best_score_)

In [None]:
conf = metrics.confusion_matrix(y_train, RF_fitted.predict(X_train),labels=['Athlete','Fitness','Average','Obese'])
conf = pd.DataFrame(conf, columns=['Athlete','Fitness','Average','Obese'], index = ['Athlete','Fitness','Average','Obese'])
conf

### Naive Bayes

In [None]:
NB = GaussianNB()
parameters = {'priors': [None,[0.25,0.25,0.25,0.25],[0.2,0.3,0.2,0.3]]}
NB_mod_CV = GridSearchCV(NB,parameters,scoring=my_scorer)
NB_fitted = NB_mod_CV.fit(X_train,y_train)

In [None]:
NB_fitted.best_estimator_.class_prior_

In [None]:
print(NB_fitted.best_params_)
print(NB_fitted.best_score_)

In [None]:
conf = metrics.confusion_matrix(y_train, NB_fitted.predict(X_train),labels=['Athlete','Fitness','Average','Obese'])
conf = pd.DataFrame(conf, columns=['Athlete','Fitness','Average','Obese'], index = ['Athlete','Fitness','Average','Obese'])
conf

## Scratch for personal scoring metrics

In [None]:
score_points = np.array([[1,0.2,0.8,0],[0.2,1,0.4,0.6],[0.8,0.4,1,.2],[0,.6,.2,1]])

In [None]:
score_points

In [None]:
DTC_prob

In [None]:
y_test

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
lb = LabelBinarizer()
lb.fit(['Athlete','Average','Fitness','Obese'])

In [None]:
y_test_bin = lb.transform(y_test)
y_test_bin

In [None]:
len(y_test_bin)

In [None]:
y_test_bin[0]

In [None]:
len(np.matmul(y_test_bin,score_points))

In [None]:
interstep = np.matmul(y_test_bin,score_points)

In [None]:
interstep[0]

In [None]:
DTC_prob[0]

In [None]:
fin = interstep*DTC_prob

In [None]:
len(fin)

In [None]:
fin_score = fin.sum()/len(fin)

In [None]:
fin_score