In [1]:

# load df
import pandas as pd
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import RFE, SelectKBest, SelectFdr, SelectFpr
from sklearn.linear_model import ElasticNet, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, PowerTransformer
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV, LeaveOneOut
from xgboost import XGBClassifier, XGBRegressor


for path in [r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_0_std_0.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_0_std_1.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_1_std_0.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_1_std_1.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_0_std_0_ver2.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_0_std_1_ver2.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_1_std_0_ver2.xlsx",
             r"C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_1_std_1_ver2.xlsx"]:
    print(path)
    df = pd.read_excel(path)

    # impute zeros
    df = df.fillna(0)

    # split to holdouts and training
    sad_cutoff = 50
    print(f"\nSocial Anxiety LSAS cutoff is {sad_cutoff}\n")

    Y = df["LSAS"] >= sad_cutoff
    print(f"pos y = {sum(Y)}\nneg y = {(len(Y) - sum(Y))}")

    X = df.drop(["LSAS","Subject"], axis=1)
    c = ((len(Y) - sum(Y)) / sum(Y))
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y)


    pipe_svc = Pipeline(steps=[
                ('scaling', StandardScaler()),
                ('classifier', SVC())
    ])

    params_svc = {
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__C': [0.001, 0.01, 1, 10, 100, 1000],
            'classifier__gamma': [1, 0.1, 0.001, 0.0001]
        }


    pipe_rf = Pipeline(steps=[
            ('classifier', RandomForestClassifier())
    ])

    params_rf = {
            'classifier__n_estimators': [100, 250, 400, 700],
            'classifier__max_depth': [2, 3, 6, 10],
            'classifier__max_features': [1.0, 0.8, 0.5, 'auto'],
            'classifier__min_samples_split': [1.0, 0.8, 0.5]
    }

    pipe_bbc = Pipeline(steps=[
                ('classifier', BalancedBaggingClassifier())
        ])

    params_bbc = {
                'classifier': [BalancedRandomForestClassifier(), BalancedBaggingClassifier()],
                'classifier__n_estimators': [100, 250, 400, 700],
                'classifier__max_features': [1.0, 0.9, 0.8, 0.5],
    }

    pipe_xgb = Pipeline(steps=[
                ('classifier', XGBClassifier())
    ])

    params_xgb = {
                'classifier__n_estimators': [100, 300],
                'classifier__max_depth': [2, 3, 5],
                'classifier__learning_rate': [0.1, 0.05, 0.25],
                'classifier__gamma': [0, 0.5, 1],
                'classifier__min_child_weight': [1, 0.5, 2],
                'classifier__scale_pos_weight': [c, 2 * c, 0.5 * c, 1],
                'classifier__reg_alpha': [0, 1, 0.5],
                'classifier__reg_lambda': [0, 1, 0.5]
        }

    pipe_lr = Pipeline(steps=[
                ('scaling', StandardScaler()),
                ('classifier', LogisticRegression())
    ])

    params_lr = {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
            'classifier__C': [0.0001, 0.001, 0.01,0.05, 0.1,0.5, 1, 10, 100, 1000, 10000],
            'classifier__solver': ['liblinear', 'saga', 'warn']
    }

    pipe_lr2 = Pipeline(steps=[
                ('scaling', StandardScaler()),
                ('classifier', LogisticRegression(solver='liblinear', dual=True,penalty='l2'))
    ])

    params_lr2 = {
            'classifier__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
            'classifier__C': [0.0001, 0.001, 0.01,0.05, 0.1,0.5, 1, 10, 100, 1000, 10000],
    }

    pipe_en = Pipeline(steps=[
                ('scaling', StandardScaler()),
                ('classifier', SGDClassifier(penalty='elasticnet', loss='log'))
    ])
    params_en = {
            'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__tol': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
            'classifier__n_iter':[2,3,4,5,6,7,8,9,10],
            'classifier__l1_ratio': [0, 0.025, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1],
    }
    for loo in [StratifiedKFold(5)]:#:, StratifiedKFold(5)]:
        for score in ['accuracy']:#, 'f1']:
            print(f"checking generalization with {loo}\nscore function is {score}")
            gs_svc = GridSearchCV(estimator=pipe_svc,
                                  param_grid=params_svc,
                                  scoring=score,
                                  cv=loo)

            gs_rf = GridSearchCV(estimator=pipe_rf,
                                 param_grid=params_rf,
                                 scoring=score,
                                 cv=loo)

            gs_bbc = GridSearchCV(estimator=pipe_bbc,
                                  param_grid=params_bbc,
                                  scoring=score,
                                  cv=loo)

            gs_xgb = GridSearchCV(estimator=pipe_xgb,
                                  param_grid=params_xgb,
                                  scoring=score,
                                  cv=loo)

            gs_lr = GridSearchCV(estimator=pipe_lr,
                                 param_grid=params_lr,
                                 scoring=score,
                                 cv=loo)
            gs_lr2 = GridSearchCV(estimator=pipe_lr2,
                                 param_grid=params_lr2,
                                 scoring=score,
                                 cv=loo)
            gs_en = GridSearchCV(estimator=pipe_en,
                                 param_grid=params_en,
                                 scoring=score,
                                 cv=loo)


            # List of pipelines for ease of iteration
            grids = [gs_xgb]#[gs_en, gs_lr, gs_lr2, gs_xgb, gs_rf, gs_bbc, gs_svc, gs_lr]

            # Dictionary of pipelines and classifier types for ease of reference
            grid_dict = {0: 'xgb'}
            #grid_dict = {0: 'en', 1: 'lr', 2: 'lr dual', 3: 'xgb short', 4: 'rf', 5: 'bbc', 6: 'svc', 7: 'lr'}

            # Fit the grid search objects
            print('Performing model optimizations...')
            best_clf = 0
            best_acc = 0.0
            for idx, gs in enumerate(grids):
                print('\nEstimator: %s' % grid_dict[idx])
                # Fit grid search
                gs.fit(X_train, y_train)
                # Best params
                print('Best params: %s' % gs.best_params_)
                # Best training data accuracy
                print('Best training score: %.3f' % gs.best_score_)
                # Predict on test data with best params
                y_pred = gs.predict(X_test)
                # Test data accuracy of model with best params
                print('Test set score score for best params: %.3f ' % accuracy_score(y_test, y_pred))
                # Track best (highest test accuracy) model
                if accuracy_score(y_test, y_pred) > best_acc:
                    best_acc = accuracy_score(y_test, y_pred)
                    best_clf = idx
            print('\nClassifier with best test set score: %s' % grid_dict[best_clf])


C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_0_std_0.xlsx

Social Anxiety LSAS cutoff is 50

pos y = 70
neg y = 61
checking generalization with StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
score function is accuracy
Performing model optimizations...

Estimator: xgb


KeyboardInterrupt: 

In [3]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_0_std_1_ver2.xlsx

Estimator: en
Best params: {'classifier__alpha': 0.001, 'classifier__l1_ratio': 0.2, 'classifier__n_iter': 5, 'classifier__tol': 0.001}
Best training score: 0.654
Test set score score for best params: 0.667 

Estimator: en
Best params: {'classifier__alpha': 0.1, 'classifier__l1_ratio': 0.3, 'classifier__n_iter': 3, 'classifier__tol': 0.01}
Best training score: 0.710
Test set score score for best params: 0.667 

Estimator: xgb short
Best params: {'classifier__gamma': 0.5, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best training score: 0.654
Test set score score for best params: 0.667 

Estimator: svc
Best params: {'classifier__C': 10, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Best training score: 0.705
Test set score score for best params: 0.630 


SyntaxError: unexpected character after line continuation character (<ipython-input-3-b406bd087bdf>, line 1)

In [4]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\training_data_pupil_size_1_std_0_ver2.xlsx
Estimator: lr dual
Best params: {'classifier__C': 0.01, 'classifier__tol': 1e-05}
Best training score: 0.644
Test set score score for best params: 0.630 

Estimator: rf
Best params: {'classifier__max_depth': 2, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 0.5, 'classifier__n_estimators': 250}
Best training score: 0.644
Test set score score for best params: 0.630 


Estimator: en
Best params: {'classifier__alpha': 0.001, 'classifier__l1_ratio': 0.3, 'classifier__n_iter': 4, 'classifier__tol': 0.001}
Best training score: 0.720
Test set score score for best params: 0.630 

Estimator: lr
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga', 'classifier__tol': 0.001}
Best training score: 0.728
Test set score score for best params: 0.630 


SyntaxError: unexpected character after line continuation character (<ipython-input-4-e762ae5ade66>, line 1)

In [5]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_0_std_0.xlsx

Estimator: en
Best params: {'classifier__alpha': 0.0001, 'classifier__l1_ratio': 0.025, 'classifier__n_iter': 7, 'classifier__tol': 0.0001}
Best training score: 0.663
Test set score score for best params: 0.704 


SyntaxError: unexpected character after line continuation character (<ipython-input-5-e6e94331b544>, line 1)

In [6]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_0_std_1.xlsx


Estimator: en
Best params: {'classifier__alpha': 0.0001, 'classifier__l1_ratio': 0.2, 'classifier__n_iter': 3, 'classifier__tol': 0.0001}
Best training score: 0.740
Test set score score for best params: 0.630 


Estimator: en
Best params: {'classifier__alpha': 0.01, 'classifier__l1_ratio': 0, 'classifier__n_iter': 3, 'classifier__tol': 0.001}
Best training score: 0.761
Test set score score for best params: 0.630 

Estimator: en
Best params: {'classifier__alpha': 0.0001, 'classifier__l1_ratio': 0.7, 'classifier__n_iter': 7, 'classifier__tol': 1e-06}
Best training score: 0.740
Test set score score for best params: 0.667 


Estimator: svc
Best params: {'classifier__C': 1000, 'classifier__gamma': 0.001, 'classifier__kernel': 'sigmoid'}
Best training score: 0.702
Test set score score for best params: 0.667 

Estimator: xgb short
Best params: {'classifier__gamma': 0, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 2, 'classifier__n_estimators': 300}
Best training score: 0.678
Test set score score for best params: 0.630 


Estimator: xgb short
Best params: {'classifier__gamma': 0, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 5, 'classifier__n_estimators': 300}
Best training score: 0.655
Test set score score for best params: 0.667 


SyntaxError: unexpected character after line continuation character (<ipython-input-6-645d2ef72f1f>, line 1)

In [7]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_1_std_0.xlsx


Estimator: en
Best params: {'classifier__alpha': 0.1, 'classifier__l1_ratio': 0.6, 'classifier__n_iter': 10, 'classifier__tol': 1e-06}
Best training score: 0.740
Test set score score for best params: 0.593 

Estimator: svc
Best params: {'classifier__C': 1, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Best training score: 0.708
Test set score score for best params: 0.630 


SyntaxError: unexpected character after line continuation character (<ipython-input-7-2b62deb29ab6>, line 1)

In [8]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_1_std_1.xlsx


Estimator: en
Best params: {'classifier__alpha': 0.01, 'classifier__l1_ratio': 0.025,
'classifier__n_iter': 7, 'classifier__tol': 0.01}
Best training score: 0.813
Test set score score for best params: 0.704 


Estimator: lr
Best params: {'classifier__C': 1000, 'classifier__penalty': 'l1',
'classifier__solver': 'liblinear', 'classifier__tol': 0.0001}
Best training score: 0.762
Test set score score for best params: 0.630 

Estimator: lr dual
Best params: {'classifier__C': 100, 'classifier__tol': 0.0001}
Best training score: 0.741
Test set score score for best params: 0.667 


Estimator: svc
Best params: {'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'linear'}
Best training score: 0.768
Test set score score for best params: 0.630 

Estimator: lr
Best params: {'classifier__C': 10000, 'classifier__penalty': 'l1',
'classifier__solver': 'warn', 'classifier__tol': 1e-05}
Best training score: 0.756
Test set score score for best params: 0.667 


Best params: {'classifier__alpha': 0.01, 'classifier__l1_ratio': 0.3, 'classifier__n_iter': 6, 'classifier__tol': 0.001}
Best training score: 0.731
Test set score score for best params: 0.704 

Estimator: lr
Best params: {'classifier__C': 10, 'classifier__penalty': 'l2', 
'classifier__solver': 'liblinear', 'classifier__tol': 1e-05}
Best training score: 0.712
Test set score score for best params: 0.667 










Estimator: lr dual
Best params: {'classifier__C': 10, 'classifier__tol': 1e-05}
Best training score: 0.712
Test set score score for best params: 0.667 


Estimator: svc
Best params: {'classifier__C': 1000, 'classifier__gamma': 0.0001, 'classifier__kernel': 'sigmoid'}
Best training score: 0.712
Test set score score for best params: 0.704 

Estimator: lr
Best params: {'classifier__C': 10, 'classifier__penalty': 'l2',
'classifier__solver': 'liblinear', 'classifier__tol': 1e-05}
Best training score: 0.712
Test set score score for best params: 0.667 


Estimator: en
Best params: {'classifier__alpha': 0.01, 'classifier__l1_ratio': 0.1, 'classifier__n_iter': 7, 'classifier__tol': 0.01}
Best training score: 0.781
Test set score score for best params: 0.630 

Estimator: lr
Best params: {'classifier__C': 100, 'classifier__penalty': 'l1',
'classifier__solver': 'warn', 'classifier__tol': 0.1}
Best training score: 0.749
Test set score score for best params: 0.630 

Estimator: lr dual
Best params: {'classifier__C': 10, 'classifier__tol': 1e-05}
Best training score: 0.739
Test set score score for best params: 0.667 



Estimator: svc
Best params: {'classifier__C': 1000, 'classifier__gamma': 0.0001, 'classifier__kernel': 'sigmoid'}
Best training score: 0.739
Test set score score for best params: 0.704 

Estimator: lr
Best params: {'classifier__C': 10000, 'classifier__penalty': 'l1',
'classifier__solver': 'warn', 'classifier__tol': 0.1}
Best training score: 0.773
Test set score score for best params: 0.667 



SyntaxError: unexpected character after line continuation character (<ipython-input-8-a328f951f655>, line 1)

In [None]:
C:\‏‏PycharmProjects\SocialAnxietyClassifier\data\training_data\mean_analysis_trainingpupil_size_1_std_1.xlsx

Estimator: en
Best params: {'classifier__alpha': 0.0001, 'classifier__l1_ratio': 0.1,
'classifier__n_iter': 5, 'classifier__tol': 1e-05}
Best training score: 0.769
Test set score score for best params: 0.630 


Estimator: lr
Best params: {'classifier__C': 10, 'classifier__penalty': 'l1',
'classifier__solver': 'saga', 'classifier__tol': 1e-05}
Best training score: 0.731
Test set score score for best params: 0.667 

Estimator: lr dual
Best params: {'classifier__C': 1, 'classifier__tol': 1e-05}
Best training score: 0.721
Test set score score for best params: 0.630 


Estimator: svc
Best params: {'classifier__C': 1, 'classifier__gamma': 1, 
'classifier__kernel': 'linear'}
Best training score: 0.740
Test set score score for best params: 0.630 

