In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Import Data, Create Holdout

In [4]:
# Location of the target/feature table, named once so it is easy to repoint.
DATA_PATH = 'C:/Users/philb/Google Drive/Thinkful/Thinkful_repo/projects/supervised_capstone/target_features.csv'
# The first CSV column is consumed as the row index rather than as data.
df = pd.read_csv(DATA_PATH, index_col=0)

In [5]:
# Display the loaded table's (rows, columns) as a sanity check before splitting it.
df.shape

(7276, 136)

In [6]:
# Column 0 is taken as the prediction target; every remaining column is a feature.
target, features = df.iloc[:, 0], df.iloc[:, 1:]

In [7]:
# Report the dimensions of the target vector first, then of the feature matrix.
for part in (target, features):
    print(part.shape)

(7276,)
(7276, 135)


Create a holdout group of 20% that we can test our best models on at the end (in order to verify that our model generalizes).

In [8]:
# Set aside 20% of the rows as the final holdout.
# random_state pins the shuffle: without it a different set of rows lands in the
# holdout on every execution, so no score in this notebook can be reproduced.
X, X_holdout, y, y_holdout = train_test_split(features, target, test_size=.2, random_state=42)

Create a small validation set (a mini holdout) to predict on after cross-validation, in order to find the model that generalizes best before testing on the holdout. Ultimately, the chosen model will also be trained with this validation set included (and that version will be adopted if it has a better cross-validation score).

In [9]:
# Carve a 10% validation set out of the non-holdout rows.
# random_state pins the shuffle so the train/validation membership is identical
# on every run and cross-validation results can be compared between executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)

# Classification Modeling

**The Models:**
1. Naive Bayes: Multinomial, Bernoulli
2. Logistic Regression
3. K-Nearest Neighbors Classifier
4. Decision Tree Classifier
5. Random Forest Classifier
6. Support Vector Machines (SVM) Classifier
7. Gradient Boosting Classifier
8. XGBoost or Extreme Gradient Boosting
9. Extra-trees Classifier
10. Ensembles

In [10]:
# One instance of each scaling strategy that is compared below.
# The robust scaler has to be bound to `outlier_scaler`: that is the name used by
# the `scalers` list here and by the cell that transforms X_train. Binding it to
# `ol_scaler` raised a NameError on a fresh kernel.
outlier_scaler = RobustScaler()
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
scalers = [outlier_scaler, minmax_scaler, standard_scaler]

In [26]:
# RobustScaler rescales with outlier-resistant statistics; it does not drop any rows.
outlier_X_train = outlier_scaler.fit_transform(X_train)
# MinMax scale between 0 and 1 (positives only).
minmax_X_train = minmax_scaler.fit_transform(X_train)
# StandardScaler with mean 0 and scaled std.
standard_X_train = standard_scaler.fit_transform(X_train)
# Robust-scaled data squeezed into (0, 1). The input is `outlier_X_train` (the
# previous `ol_X_train` was never defined), and a dedicated scaler is used so that
# `minmax_scaler` stays fitted on the raw X_train instead of being silently refit.
minmax_outlier_scaler = MinMaxScaler()
minmax_outlier_X_train = minmax_outlier_scaler.fit_transform(outlier_X_train)

In [158]:
# Every representation of the training features to be compared, keyed by a short label.
train_datas = {
    'original': X_train,
    'outlier_removed': outlier_X_train,
    'minmaxed': minmax_X_train,
    'standardized': standard_X_train,
    'minmaxed_outlier_removed': minmax_outlier_X_train,
}
# label -> PCA estimator.
pca_obj = {}
# Reserved for the PCA-transformed version of each dataset. The original line was
# an incomplete assignment (`pca_data = `), which is a SyntaxError.
pca_data = {}

In [159]:
# Register a fresh, unfitted PCA (default settings) under each dataset label.
pca_obj.update({str(label): PCA() for label in train_datas})

In [165]:
# Fit each PCA on its matching training representation and keep the projected
# data per label. Previously the fit_transform result was computed and discarded.
pca_data = {
    label: pca.fit_transform(train_datas[label])
    for label, pca in pca_obj.items()
}

135

In [177]:
# Print, per dataset label, the cumulative share of variance explained as
# components are added.
# The loop variables must not reuse the name `pca_obj`: doing so rebinds the
# dict to the last (label, PCA) tuple, so the registry is lost and this cell
# fails the second time it is run.
for label, fitted_pca in pca_obj.items():
    print(label)
    print(fitted_pca.explained_variance_ratio_.cumsum())

original
[0.97850731 0.99464223 0.99912321 0.99945541 0.99970482 0.99974912
 0.99978137 0.99980815 0.99983215 0.99985508 0.99987467 0.99988524
 0.99989462 0.9999033  0.99991169 0.99991963 0.99992599 0.99993199
 0.9999363  0.99994016 0.99994397 0.99994734 0.99995052 0.99995334
 0.99995611 0.99995881 0.99996124 0.99996338 0.99996541 0.99996734
 0.99996918 0.99997099 0.99997268 0.99997434 0.99997596 0.99997755
 0.99997911 0.99998051 0.99998173 0.99998291 0.99998403 0.9999851
 0.99998608 0.99998702 0.99998794 0.99998879 0.99998954 0.99999027
 0.99999096 0.99999163 0.99999228 0.99999284 0.99999335 0.99999382
 0.99999428 0.99999472 0.9999951  0.99999547 0.99999582 0.99999614
 0.99999644 0.99999672 0.99999697 0.99999721 0.99999744 0.99999766
 0.99999788 0.99999809 0.9999983  0.99999849 0.99999868 0.9999988
 0.99999888 0.99999897 0.99999905 0.99999912 0.9999992  0.99999927
 0.99999932 0.99999938 0.99999944 0.99999949 0.99999954 0.99999958
 0.99999962 0.99999966 0.99999968 0.99999971 0.99999974

In [109]:
def train_data_testing(classifier, verbose=0):
    """Cross-validate one classifier on every training-data variant.

    Reads the notebook-level ``train_datas`` dict (label -> feature matrix) and
    ``y_train``, and scores each variant with 5-fold ``cross_val_score``.

    Parameters
    ----------
    classifier : estimator
        Passed unchanged to ``cross_val_score``.
    verbose : int or bool, default 0
        Falsy: print nothing. Truthy (including values above 1): print the
        fold scores for every variant and a message for every variant that fails.

    Returns
    -------
    tuple
        ``(classifier, best_cv_average, best_datas)`` where ``best_cv_average``
        is the highest mean fold score and ``best_datas`` lists the label(s) of
        the variant(s) that reached it.

    Raises
    ------
    ValueError
        If no variant could be scored.
    """
    datas = []
    means = []

    for name, data in train_datas.items():
        try:
            cvs = cross_val_score(classifier, data, y_train, cv=5, n_jobs=-1)
            cv_mean = cvs.mean()
            # Depending on the scikit-learn version, a failed fit is reported as
            # NaN scores instead of an exception. NaN would poison the max/equality
            # selection below, so route it through the same failure path.
            if np.isnan(cv_mean):
                raise ValueError('cross-validation produced NaN scores')
        except Exception as err:
            # Deliberately best-effort: some estimator/data pairings are expected
            # to fail, so the variant is skipped. `Exception` (not a bare except)
            # lets KeyboardInterrupt still stop a long run.
            if verbose:
                print(f'Something went wrong using "{name}" data in conjuncion with {classifier}')
                print(f'Error: {err!r}')
                print('-----')
            continue

        if verbose:
            print(f'Classifier: {classifier}')
            print(f"Train Data: {name}")
            print(f'CVs: {cvs}')
            print(f'CVs Average: {cv_mean}')
            print('-----')
        datas.append(name)
        means.append(cv_mean)

    if not means:
        raise ValueError(f'No training data variant could be cross-validated with {classifier}')

    best_cv_average = max(means)
    # Keep every variant that ties for the best mean score.
    best_datas = [name for name, mean in zip(datas, means) if mean == best_cv_average]

    return classifier, best_cv_average, best_datas

In [116]:
def classifier_testing(clfs_list, verbose=1):
    """Find the classifier(s) with the best cross-validation average.

    Each classifier is scored by ``train_data_testing`` and the one(s) with the
    highest best-average are reported.

    Parameters
    ----------
    clfs_list : iterable of estimators
        Candidates to evaluate, in order.
    verbose : int, default 1
        0: print nothing. 1: print per-classifier progress and a final summary.
        2 or more: additionally have ``train_data_testing`` print its
        per-dataset details.

    Returns
    -------
    tuple
        ``(best_classifiers, best_average, best_datas_for_best_classifiers)``:
        the classifier(s) tied for the top score, that score, and for each such
        classifier the list of data labels that produced it.

    Raises
    ------
    ValueError
        If ``clfs_list`` is empty. Exceptions raised by ``train_data_testing``
        propagate unchanged.
    """
    classifiers = []
    classifier_best_cv_averages = []
    classifier_best_datas = []

    # Per-dataset detail is only requested at the highest verbosity level.
    inner_verbose = 1 if verbose >= 2 else 0

    for classifier in clfs_list:
        if verbose:
            print('Testing: {}'.format(classifier))
            print('-----')
        classifier, best_cv_average, best_datas = train_data_testing(classifier, verbose=inner_verbose)
        if verbose:
            print('Finished: {}'.format(classifier))
            print('Best CV Score: {}'.format(best_cv_average))
            print('Best Data Transformation(s): {}'.format(best_datas))
            print('-----')
        classifiers.append(classifier)
        classifier_best_cv_averages.append(best_cv_average)
        classifier_best_datas.append(best_datas)

    if not classifiers:
        raise ValueError('clfs_list is empty: there is no classifier to evaluate')

    best_average = max(classifier_best_cv_averages)
    # Keep every classifier that ties for the best score.
    best_idxs = [i for i, score in enumerate(classifier_best_cv_averages) if score == best_average]
    best_classifiers = [classifiers[i] for i in best_idxs]
    best_datas_for_best_classifiers = [classifier_best_datas[i] for i in best_idxs]

    if verbose:
        print('Best Classifier(s):')
        print('-------------------')
        for clf, labels in zip(best_classifiers, best_datas_for_best_classifiers):
            print('Classifier: {}'.format(clf))
            print('Best Data Transformations for Classifier:')
            for i, data in enumerate(labels, 1):
                print('{}. {}: {}% accuracy'.format(i, data, round(best_average*100, 5)))

    return best_classifiers, best_average, best_datas_for_best_classifiers

In [129]:
# Candidate classifiers, each constructed with no arguments, in evaluation order.
clf_list = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    ExtraTreesClassifier(),
    SVC(),
    XGBClassifier(),
]

In [130]:
# Run the comparison at the default verbosity (1). The returned tuple
# (best classifiers, best CV average, best data labels) is the cell's output.
classifier_testing(clf_list)

Testing: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
-----
Finished: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Best CV Score: 0.7562042039414685
Best Data Transformation(s): ['minmaxed', 'minmaxed_outlier_removed']
-----
Testing: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
-----
Finished: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Best CV Score: 0.7596382248080666
Best Data Transformation(s): ['original']
-----
Testing: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
-----
Finished: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
  

([GradientBoostingClassifier(criterion='friedman_mse', init=None,
                             learning_rate=0.1, loss='deviance', max_depth=3,
                             max_features=None, max_leaf_nodes=None,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=1, min_samples_split=2,
                             min_weight_fraction_leaf=0.0, n_estimators=100,
                             n_iter_no_change=None, presort='auto',
                             random_state=None, subsample=1.0, tol=0.0001,
                             validation_fraction=0.1, verbose=0,
                             warm_start=False)],
 0.9356640565191714,
 [['original']])