# Notes

In [3]:
# Base locations for the project data. The defaults are the original local
# paths; set MRI_CNN_DIR / MRI_TABLES_DIR to run the notebook on another machine.
import os

path_to_CNN = os.environ.get('MRI_CNN_DIR', '/Users/DrV/PyProj/MRI_personality/sub-0001/nibabel_CNN')
# Joining with '' guarantees a trailing separator, because later cells build
# file names with plain string concatenation (path_to_excel + 'file.xlsx').
path_to_excel = os.path.join(os.environ.get('MRI_TABLES_DIR', '/Users/DrV/code/lukevano/tables_2/'), '')

# Imports

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.inspection import permutation_importance
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import math
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture



In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Log Regression

## Description of loaded data

Three datasets are all merged in the following way:

Top 208 observations are ds002785, from 209 to 434 are ds002790, and from 435 to 1362 are ds003097

Columns have been concatenated so that the first columns are the participant details, then the targets, followed by the lt then rt cortical parcellations and lastly the volume segmentation of the subcortical structures.

Columns religious and raven have been deleted as not recorded in all datasets.

In [6]:
# Load the merged dataset from the shared tables directory (`path_to_excel`)
# instead of repeating the absolute path in this cell.
df = pd.read_excel(path_to_excel + 'master_combined.xlsx')
df

FileNotFoundError: [Errno 2] No such file or directory: '/Users/DrV/code/lukevano/tables_2/master_combined.xlsx'

In [None]:
# Pairwise correlations between the volume/thickness features (column 11 onwards),
# reshaped to one row per ordered feature pair and sorted from strongest positive down.
vol_thick_features = df.iloc[:, 11:]
corr = vol_thick_features.corr()
corr_df = (
    corr.unstack()
    .reset_index()
    .set_axis(['feature_1', 'feature_2', 'correlation'], axis=1)
    .sort_values(by='correlation', ascending=False)
    # a feature always correlates perfectly with itself, so drop those rows
    .loc[lambda pairs: pairs['feature_1'] != pairs['feature_2']]
)
corr_df

In [None]:
# Feature pairs that are effectively duplicates of each other (r > 0.999)
corr_df.loc[corr_df['correlation'] > 0.999]

In [None]:
# Columns to drop, and columns to consider dropping.
# `to_remove` holds identical (duplicated) columns and the hypointensity
# columns that have virtually no values. Each name appears exactly once.
to_remove = ['eTIV.1', 'EstimatedTotalIntraCranialVol', 'BrainSegVolNotVent.2',
    'BrainSegVolNotVent.1', 'BrainSegVolNotVentSurf', 'SupraTentorialVolNotVentVox',
    'lhCerebralWhiteMatterVol', 'rhCerebralWhiteMatterVol',
    'BrainSegVol', 'SupraTentorialVol', 'SupraTentorialVolNotVent',
    'BrainSegVol-to-eTIV', 'MaskVol', 'rhCortexVol', 'lhCortexVol', 'Left-WM-hypointensities',
    'Right-WM-hypointensities', 'non-WM-hypointensities', 'Left-non-WM-hypointensities',
    'Right-non-WM-hypointensities']
consider_removal = ['5th-Ventricle', 'SurfaceHoles', 'lhSurfaceHoles', 'rhSurfaceHoles']

In [None]:
# Repeat the pairwise-correlation table after dropping the redundant columns,
# then show the pairs that are still strongly correlated (r > 0.9).
vol_thick_features_less = vol_thick_features.drop(columns=to_remove)
corr_less = vol_thick_features_less.corr()
corr_less_df = (
    corr_less.unstack()
    .reset_index()
    .set_axis(['feature_1', 'feature_2', 'correlation'], axis=1)
    .sort_values(by='correlation', ascending=False)
    # drop each feature's correlation with itself
    .loc[lambda pairs: pairs['feature_1'] != pairs['feature_2']]
)
corr_less_df.loc[corr_less_df['correlation'] > 0.9]

In [None]:
# Full table (participant details, targets, features) without the redundant columns
df_drop = df.drop(to_remove, axis=1)

In [None]:
# Rank every numeric column by its correlation with NEO_N.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises instead.
df_drop.select_dtypes(include=['number', 'bool']).corr()['NEO_N'].sort_values(ascending=False)

In [None]:
# Rank every numeric column by its correlation with NEO_E.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises instead.
df_drop.select_dtypes(include=['number', 'bool']).corr()['NEO_E'].sort_values(ascending=False)

In [None]:
# Rank every numeric column by its correlation with NEO_O.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises instead.
df_drop.select_dtypes(include=['number', 'bool']).corr()['NEO_O'].sort_values(ascending=False)

In [None]:
# Rank every numeric column by its correlation with NEO_A.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises instead.
df_drop.select_dtypes(include=['number', 'bool']).corr()['NEO_A'].sort_values(ascending=False)

In [None]:
# Rank every numeric column by its correlation with NEO_C.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises instead.
df_drop.select_dtypes(include=['number', 'bool']).corr()['NEO_C'].sort_values(ascending=False)

## Preprocessing master_dup_removed v1

In [None]:
# Load the de-duplicated dataset from the shared tables directory
# (`path_to_excel`) instead of repeating the absolute path in this cell.
df_neo_nan = pd.read_excel(path_to_excel + 'master_dup_removed v1.xlsx')
df_neo_nan

In [None]:
# The eight columns with the most missing values
null_counts = df_neo_nan.isna().sum()
null_counts.sort_values(ascending=False).iloc[:8]

In [None]:
# Keep only participants with a recorded NEO_C score.
# The index is rebuilt so positions and labels agree again after the rows are removed.
# Note: this rebinds `df`, which earlier held the master_combined table.
df = df_neo_nan.dropna(subset=['NEO_C']).reset_index(drop=True)
df

In [None]:
# Binarise each of the five NEO targets at its own mean: 0 = at or below the mean, 1 = above.
# The outer bin edges sit one unit beyond the observed min/max so every value falls in a bin.
y_disc = df.iloc[:, 6:11]
trait_columns = [y_disc.iloc[:, pos] for pos in range(5)]
y = [
    pd.cut(x=scores,
           bins=[scores.min() - 1, scores.mean(), scores.max() + 1],
           labels=[0, 1])
    for scores in trait_columns
]
y = pd.DataFrame(y[0:5]).T
y.head()

In [None]:
# Features are every column from position 11 onwards (the columns before it
# are participant details and the NEO targets).
FIRST_FEATURE_COL = 11
X = df.iloc[:, FIRST_FEATURE_COL:]
X.head()

In [None]:
# Sanity check: total missing values across the features, then missing values per target
n_missing_features = X.isnull().sum().sum()
missing_per_target = y.isnull().sum()
print(n_missing_features)
print(missing_per_target)

## Building models and feature selection

Use statsmodels over sklearn as you can determine more about the features. When doing logistic regression you are sacrificing quality for explainability: you know what is happening behind the model.

The two approaches for building models are either make a model with each feature individually and then start adding the features together or put all the features together and start to remove those that are too highly correlated.

In [None]:
# statsmodels does not add an intercept on its own: prepend a `const` column
# so the logit is not forced through the origin.
X_int = sm.add_constant(X, prepend=True)
X_int.head()

Build the models and work out what features to keep for each. If you standardise then you lose some information but it allows you to compare coefficients: if a feature is on a larger scale, a smaller coefficient may have a larger impact on the model. The null hypothesis is that the coef is 0, so if p>0.05 we cannot reject that the coef is 0.

In [None]:
# Baseline: logit on the raw (unscaled) features plus intercept.
# The iteration cap is raised to 100 because, per the author's run, the unscaled
# fit needed about 41 iterations, more than statsmodels' default limit.
# Scaling changes how fast the fit converges, not the accuracy reached.
logit_NEO_N = sm.Logit(y['NEO_N'], X_int)
results_NEO_N = logit_NEO_N.fit(maxiter=100)
results_NEO_N.summary()

In [None]:
# Standardise every feature (zero mean, unit variance) so coefficients are comparable
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled

In [None]:
# Prepend the intercept column to the scaled array and restore the column labels
# (same layout as X_int: `const` first, then the features).
X_scaled_int = pd.DataFrame(sm.add_constant(X_scaled), columns=X_int.columns)
X_scaled_int

In [None]:
# Same NEO_N logit, now on the standardised features (default iteration limit)
logit_NEO_N_scaled = sm.Logit(y['NEO_N'], X_scaled_int)
results_NEO_N_scaled = logit_NEO_N_scaled.fit()
results_NEO_N_scaled.summary()

In [None]:
# Cortical-thickness columns hand-picked from the NEO_N model summary (p < 0.25).
# Column names follow the pattern '<hemisphere>_<region>_thickness'.
_lh_regions_25 = ['bankssts', 'lateralorbitofrontal', 'parahippocampal', 'parsorbitalis',
                  'postcentral', 'precentral', 'superiorfrontal', 'superiorparietal',
                  'supramarginal']
_rh_regions_25 = ['caudalmiddlefrontal', 'entorhinal', 'inferiorparietal', 'lateraloccipital',
                  'lateralorbitofrontal', 'middletemporal', 'parahippocampal', 'precentral',
                  'superiorfrontal', 'superiorparietal', 'superiortemporal', 'MeanThickness']
NEO_N_col_thick_25 = (
    ['lh_' + region + '_thickness' for region in _lh_regions_25]
    + ['rh_' + region + '_thickness' for region in _rh_regions_25]
)

In [None]:
# Reduced design matrix: the selected thickness columns plus the intercept.
# `const` must be kept explicitly; without it the reduced logit would be
# forced through the origin, unlike the full models it is compared against.
X_NEO_N_25 = X_scaled_int[['const'] + NEO_N_col_thick_25]
X_NEO_N_25.head()

In [None]:
# NEO_N logit restricted to the p < 0.25 thickness columns
# (these columns come from the standardised matrix X_scaled_int)
logit_NEO_N_25 = sm.Logit(y['NEO_N'], X_NEO_N_25)
results_NEO_N_25 = logit_NEO_N_25.fit()
results_NEO_N_25.summary()

In [None]:
# In-sample predicted probabilities for the three NEO_N models.
# Each model is scored on the same design matrix (including intercept) it was fitted on.
y_pred_proba_NEO_N = results_NEO_N.predict(X_int)
y_pred_proba_NEO_N_scaled = results_NEO_N_scaled.predict(X_scaled_int)
y_pred_proba_NEO_N_25 = results_NEO_N_25.predict(X_NEO_N_25)

# Class labels: predict "higher" when the probability exceeds 0.5
y_pred_NEO_N = y_pred_proba_NEO_N > 0.5
y_pred_NEO_N_scaled = y_pred_proba_NEO_N_scaled > 0.5
y_pred_NEO_N_25 = y_pred_proba_NEO_N_25 > 0.5

In [None]:
# Precision / recall / F1 for the raw, standardised and reduced NEO_N models, in that order
for predicted in (y_pred_NEO_N, y_pred_NEO_N_scaled, y_pred_NEO_N_25):
    print(classification_report(y['NEO_N'], predicted))

In [None]:
# ROC curve of the unscaled NEO_N model: one row per decision threshold
fpr, tpr, thresholds = roc_curve(y['NEO_N'], y_pred_proba_NEO_N)
scores = pd.DataFrame(dict(threshold=thresholds, fpr=fpr, tpr=tpr))
scores

In [None]:
# Plot the ROC curve (true-positive rate against false-positive rate)
fig, ax = plt.subplots()
ax.plot(scores['fpr'], scores['tpr'])
ax.set_ylabel('tpr')
ax.set_xlabel('fpr');

In [None]:
# Area under the ROC curve for the unscaled NEO_N model.
# AUC has to be computed from the predicted probabilities: passing the 0/1
# labels (thresholded at 0.5) collapses the curve to a single operating point.
auc_score = roc_auc_score(y['NEO_N'], y_pred_proba_NEO_N)
auc_score

## Building separate M and F models for each target

In [None]:
# Clean the sex column: drop rows with no recorded sex and recode to a single character.
print(df['sex'].unique())
print(df['sex'].isnull().sum())  # number of rows about to be dropped
# Take an explicit copy so the recode below edits sex_df itself. An in-place
# replace on a column of a filtered slice triggers SettingWithCopyWarning and
# has no effect under pandas copy-on-write.
sex_df = df[df['sex'].notnull()].copy()
sex_df['sex'] = sex_df['sex'].replace({'female': 'F', 'male': 'M'})
print(sex_df['sex'].unique())
sex_df.shape

In [None]:
# Split the cleaned table by sex
f_df = sex_df.loc[sex_df['sex'].eq('F')]
m_df = sex_df.loc[sex_df['sex'].eq('M')]

In [None]:
# Sex-specific feature matrices (columns 11 onwards), re-indexed from 0
X_f, X_m = (frame.iloc[:, 11:].reset_index(drop=True) for frame in (f_df, m_df))

In [None]:
# Sex-specific targets: each NEO score is binarised at the mean of its own
# group (0 = at or below the mean, 1 = above).
def _split_at_group_mean(trait_scores):
    """Return a frame with the five score columns of `trait_scores` cut into 0/1 at each column's mean."""
    halves = []
    for pos in range(5):
        column = trait_scores.iloc[:, pos]
        halves.append(pd.cut(x=column,
                             # outer edges sit one unit beyond min/max so every value is binned
                             bins=[column.min() - 1, column.mean(), column.max() + 1],
                             labels=[0, 1]))
    return pd.DataFrame(halves[0:5]).T.reset_index(drop=True)


y_f_disc = f_df.iloc[:, 6:11].reset_index(drop=True)
y_f = _split_at_group_mean(y_f_disc)

y_m_disc = m_df.iloc[:, 6:11].reset_index(drop=True)
y_m = _split_at_group_mean(y_m_disc)

In [None]:
# Standardise the features separately per sex; both scalers are kept so new
# observations can later be transformed with the matching statistics.
scaler_f = StandardScaler().fit(X_f)
X_f_scaled = scaler_f.transform(X_f)

scaler_m = StandardScaler().fit(X_m)
X_m_scaled = scaler_m.transform(X_m)

In [None]:
# Prepend the intercept column to each scaled array and restore column labels.
# Each frame is labelled from its own feature table (the male frame previously
# borrowed X_f's columns). has_constant='add' makes sure the intercept is always
# added, even if a scaled feature happens to be constant.
X_f_scaled_int = pd.DataFrame(sm.add_constant(X_f_scaled, has_constant='add'),
                              columns=X_f.columns.insert(0, 'const'))

X_m_scaled_int = pd.DataFrame(sm.add_constant(X_m_scaled, has_constant='add'),
                              columns=X_m.columns.insert(0, 'const'))

In a log reg the coef below show you how the model is made. The const coef means that the value of the const in the feat will be multiplied by the coef- this is why we add a 1 in the const column. The coef in the other columns is the formula for the logit equation.

NEO_N

In [None]:
# Female logit for NEO_N on the standardised features (iteration cap raised to 100)
logit_f_NEO_N = sm.Logit(y_f['NEO_N'], X_f_scaled_int)
results_f_NEO_N = logit_f_NEO_N.fit(maxiter=100)
results_f_NEO_N.summary()

In [None]:
# Male logit for NEO_N on the standardised features (iteration cap raised to 100)
logit_m_NEO_N = sm.Logit(y_m['NEO_N'], X_m_scaled_int)
results_m_NEO_N = logit_m_NEO_N.fit(maxiter=100)
results_m_NEO_N.summary()

In [None]:
# In-sample predicted probabilities for the sex-specific NEO_N models
y_pred_f_proba_NEO_N = results_f_NEO_N.predict(X_f_scaled_int)
y_pred_m_proba_NEO_N = results_m_NEO_N.predict(X_m_scaled_int)

# Class labels: predict "higher" when the probability exceeds 0.5
y_pred_f_NEO_N = y_pred_f_proba_NEO_N > 0.5
y_pred_m_NEO_N = y_pred_m_proba_NEO_N > 0.5

In [None]:
# Precision / recall / F1 for the female model, then the male model
for truth, predicted in ((y_f['NEO_N'], y_pred_f_NEO_N), (y_m['NEO_N'], y_pred_m_NEO_N)):
    print(classification_report(truth, predicted))

In [None]:
# ROC AUC for the sex-specific NEO_N models.
# AUC is computed from the predicted probabilities; thresholded 0/1 labels
# would collapse the ROC curve to a single operating point.
auc_f_NEO_N_score = roc_auc_score(y_f['NEO_N'], y_pred_f_proba_NEO_N)
auc_m_NEO_N_score = roc_auc_score(y_m['NEO_N'], y_pred_m_proba_NEO_N)
print(auc_f_NEO_N_score)
auc_m_NEO_N_score

NEO_E

In [None]:
# Female logit for NEO_E on the standardised features (iteration cap raised to 100)
logit_f_NEO_E = sm.Logit(y_f['NEO_E'], X_f_scaled_int)
results_f_NEO_E = logit_f_NEO_E.fit(maxiter=100)
results_f_NEO_E.summary()

In [None]:
# Male logit for NEO_E on the standardised features (iteration cap raised to 100)
logit_m_NEO_E = sm.Logit(y_m['NEO_E'], X_m_scaled_int)
results_m_NEO_E = logit_m_NEO_E.fit(maxiter=100)
results_m_NEO_E.summary()

In [None]:
# In-sample probabilities and 0.5-threshold labels for the sex-specific NEO_E models
y_pred_f_proba_NEO_E = results_f_NEO_E.predict(X_f_scaled_int)
y_pred_m_proba_NEO_E = results_m_NEO_E.predict(X_m_scaled_int)
y_pred_f_NEO_E = y_pred_f_proba_NEO_E > 0.5
y_pred_m_NEO_E = y_pred_m_proba_NEO_E > 0.5
# Print classification reports
print(classification_report(y_f['NEO_E'], y_pred_f_NEO_E))
print(classification_report(y_m['NEO_E'], y_pred_m_NEO_E))
# AUC is computed from the probabilities; thresholded 0/1 labels would
# collapse the ROC curve to a single operating point.
auc_f_NEO_E_score = roc_auc_score(y_f['NEO_E'], y_pred_f_proba_NEO_E)
auc_m_NEO_E_score = roc_auc_score(y_m['NEO_E'], y_pred_m_proba_NEO_E)
print(auc_f_NEO_E_score)
auc_m_NEO_E_score

NEO_O

In [None]:
# Female logit for NEO_O on the standardised features (iteration cap raised to 100)
logit_f_NEO_O = sm.Logit(y_f['NEO_O'], X_f_scaled_int)
results_f_NEO_O = logit_f_NEO_O.fit(maxiter=100)
results_f_NEO_O.summary()

In [None]:
# Male logit for NEO_O on the standardised features (iteration cap raised to 100)
logit_m_NEO_O = sm.Logit(y_m['NEO_O'], X_m_scaled_int)
results_m_NEO_O = logit_m_NEO_O.fit(maxiter=100)
results_m_NEO_O.summary()

In [None]:
# In-sample probabilities and 0.5-threshold labels for the sex-specific NEO_O models
y_pred_f_proba_NEO_O = results_f_NEO_O.predict(X_f_scaled_int)
y_pred_m_proba_NEO_O = results_m_NEO_O.predict(X_m_scaled_int)
y_pred_f_NEO_O = y_pred_f_proba_NEO_O > 0.5
y_pred_m_NEO_O = y_pred_m_proba_NEO_O > 0.5
# Print classification reports
print(classification_report(y_f['NEO_O'], y_pred_f_NEO_O))
print(classification_report(y_m['NEO_O'], y_pred_m_NEO_O))
# AUC is computed from the probabilities; thresholded 0/1 labels would
# collapse the ROC curve to a single operating point.
auc_f_NEO_O_score = roc_auc_score(y_f['NEO_O'], y_pred_f_proba_NEO_O)
auc_m_NEO_O_score = roc_auc_score(y_m['NEO_O'], y_pred_m_proba_NEO_O)
print(auc_f_NEO_O_score)
auc_m_NEO_O_score

NEO_A

In [None]:
# Female logit for NEO_A on the standardised features (iteration cap raised to 100)
logit_f_NEO_A = sm.Logit(y_f['NEO_A'], X_f_scaled_int)
results_f_NEO_A = logit_f_NEO_A.fit(maxiter=100)
results_f_NEO_A.summary()

In [None]:
# Male logit for NEO_A on the standardised features (iteration cap raised to 100)
logit_m_NEO_A = sm.Logit(y_m['NEO_A'], X_m_scaled_int)
results_m_NEO_A = logit_m_NEO_A.fit(maxiter=100)
results_m_NEO_A.summary()

In [None]:
# In-sample probabilities and 0.5-threshold labels for the sex-specific NEO_A models
y_pred_f_proba_NEO_A = results_f_NEO_A.predict(X_f_scaled_int)
y_pred_m_proba_NEO_A = results_m_NEO_A.predict(X_m_scaled_int)
y_pred_f_NEO_A = y_pred_f_proba_NEO_A > 0.5
y_pred_m_NEO_A = y_pred_m_proba_NEO_A > 0.5
# Print classification reports
print(classification_report(y_f['NEO_A'], y_pred_f_NEO_A))
print(classification_report(y_m['NEO_A'], y_pred_m_NEO_A))
# AUC is computed from the probabilities; thresholded 0/1 labels would
# collapse the ROC curve to a single operating point.
auc_f_NEO_A_score = roc_auc_score(y_f['NEO_A'], y_pred_f_proba_NEO_A)
auc_m_NEO_A_score = roc_auc_score(y_m['NEO_A'], y_pred_m_proba_NEO_A)
print(auc_f_NEO_A_score)
auc_m_NEO_A_score

NEO_C

In [None]:
# Female logit for NEO_C on the standardised features (iteration cap raised to 100)
logit_f_NEO_C = sm.Logit(y_f['NEO_C'], X_f_scaled_int)
results_f_NEO_C = logit_f_NEO_C.fit(maxiter=100)
results_f_NEO_C.summary()

In [None]:
# Male logit for NEO_C on the standardised features (iteration cap raised to 100)
logit_m_NEO_C = sm.Logit(y_m['NEO_C'], X_m_scaled_int)
results_m_NEO_C = logit_m_NEO_C.fit(maxiter=100)
results_m_NEO_C.summary()

In [None]:
# In-sample probabilities and 0.5-threshold labels for the sex-specific NEO_C models
y_pred_f_proba_NEO_C = results_f_NEO_C.predict(X_f_scaled_int)
y_pred_m_proba_NEO_C = results_m_NEO_C.predict(X_m_scaled_int)
y_pred_f_NEO_C = y_pred_f_proba_NEO_C > 0.5
y_pred_m_NEO_C = y_pred_m_proba_NEO_C > 0.5
# Print classification reports
print(classification_report(y_f['NEO_C'], y_pred_f_NEO_C))
print(classification_report(y_m['NEO_C'], y_pred_m_NEO_C))
# AUC is computed from the probabilities; thresholded 0/1 labels would
# collapse the ROC curve to a single operating point.
auc_f_NEO_C_score = roc_auc_score(y_f['NEO_C'], y_pred_f_proba_NEO_C)
auc_m_NEO_C_score = roc_auc_score(y_m['NEO_C'], y_pred_m_proba_NEO_C)
print(auc_f_NEO_C_score)
auc_m_NEO_C_score

## Feature selection

Show the most important features for each target

In [None]:
# Coefficient / p-value table for the female NEO_N model; show terms with p < 0.05, largest coef first
NEO_N_f_best_feats = pd.concat(
    [results_f_NEO_N.params.rename('coef'), results_f_NEO_N.pvalues.rename('p-value')],
    axis=1,
)
NEO_N_f_best_feats.loc[NEO_N_f_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the male NEO_N model; show terms with p < 0.05, largest coef first
NEO_N_m_best_feats = pd.concat(
    [results_m_NEO_N.params.rename('coef'), results_m_NEO_N.pvalues.rename('p-value')],
    axis=1,
)
NEO_N_m_best_feats.loc[NEO_N_m_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the female NEO_E model; show terms with p < 0.05, largest coef first
NEO_E_f_best_feats = pd.concat(
    [results_f_NEO_E.params.rename('coef'), results_f_NEO_E.pvalues.rename('p-value')],
    axis=1,
)
NEO_E_f_best_feats.loc[NEO_E_f_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the male NEO_E model; show terms with p < 0.05, largest coef first
NEO_E_m_best_feats = pd.concat(
    [results_m_NEO_E.params.rename('coef'), results_m_NEO_E.pvalues.rename('p-value')],
    axis=1,
)
NEO_E_m_best_feats.loc[NEO_E_m_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the female NEO_O model; show terms with p < 0.05, largest coef first
NEO_O_f_best_feats = pd.concat(
    [results_f_NEO_O.params.rename('coef'), results_f_NEO_O.pvalues.rename('p-value')],
    axis=1,
)
NEO_O_f_best_feats.loc[NEO_O_f_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the male NEO_O model; show terms with p < 0.05, largest coef first
NEO_O_m_best_feats = pd.concat(
    [results_m_NEO_O.params.rename('coef'), results_m_NEO_O.pvalues.rename('p-value')],
    axis=1,
)
NEO_O_m_best_feats.loc[NEO_O_m_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the female NEO_A model; show terms with p < 0.05, largest coef first
NEO_A_f_best_feats = pd.concat(
    [results_f_NEO_A.params.rename('coef'), results_f_NEO_A.pvalues.rename('p-value')],
    axis=1,
)
NEO_A_f_best_feats.loc[NEO_A_f_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the male NEO_A model; show terms with p < 0.05, largest coef first
NEO_A_m_best_feats = pd.concat(
    [results_m_NEO_A.params.rename('coef'), results_m_NEO_A.pvalues.rename('p-value')],
    axis=1,
)
NEO_A_m_best_feats.loc[NEO_A_m_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the female NEO_C model; show terms with p < 0.05, largest coef first
NEO_C_f_best_feats = pd.concat(
    [results_f_NEO_C.params.rename('coef'), results_f_NEO_C.pvalues.rename('p-value')],
    axis=1,
)
NEO_C_f_best_feats.loc[NEO_C_f_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient / p-value table for the male NEO_C model; show terms with p < 0.05, largest coef first
NEO_C_m_best_feats = pd.concat(
    [results_m_NEO_C.params.rename('coef'), results_m_NEO_C.pvalues.rename('p-value')],
    axis=1,
)
NEO_C_m_best_feats.loc[NEO_C_m_best_feats['p-value'] < 0.05].sort_values('coef', ascending=False)

In order to do feature permutation we need to make an sklearn model, as this function will not work with a statsmodels result.

NEO_N

In [None]:
# # Use this to get an idea how sklearn models are created
# # need to remove const from coef and put this in int- then use X without const
# sk_lr = LogisticRegression()
# sk_lr.fit(X_f_scaled_int, y_f['NEO_N'])
# sk_lr.coef_
# sk_lr.intercept_
# sk_lr.classes_

In [None]:
# Permutation importance for the female NEO_N model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_f_NEO_N.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_f_NEO_N.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_f_NEO_N_score = permutation_importance(
    sk_lr, X_f_scaled, y_f['NEO_N'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the female feature names; building from a dict keeps the scores numeric.
importance_f_NEO_N = pd.DataFrame({
    'feature': X_f.columns,
    'score decrease': permutation_f_NEO_N_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_f_NEO_N

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_f_NEO_N.iloc[:10]

In [None]:
# Permutation importance for the male NEO_N model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_m_NEO_N.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_m_NEO_N.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_m_NEO_N_score = permutation_importance(
    sk_lr, X_m_scaled, y_m['NEO_N'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the male feature names; building from a dict keeps the scores numeric.
importance_m_NEO_N = pd.DataFrame({
    'feature': X_m.columns,
    'score decrease': permutation_m_NEO_N_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_m_NEO_N

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_m_NEO_N.iloc[:10]

NEO_E

In [None]:
# Permutation importance for the female NEO_E model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_f_NEO_E.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_f_NEO_E.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_f_NEO_E_score = permutation_importance(
    sk_lr, X_f_scaled, y_f['NEO_E'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the female feature names; building from a dict keeps the scores numeric.
importance_f_NEO_E = pd.DataFrame({
    'feature': X_f.columns,
    'score decrease': permutation_f_NEO_E_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_f_NEO_E

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_f_NEO_E.iloc[:10]

In [None]:
# Permutation importance for the male NEO_E model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_m_NEO_E.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_m_NEO_E.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_m_NEO_E_score = permutation_importance(
    sk_lr, X_m_scaled, y_m['NEO_E'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the male feature names; building from a dict keeps the scores numeric.
importance_m_NEO_E = pd.DataFrame({
    'feature': X_m.columns,
    'score decrease': permutation_m_NEO_E_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_m_NEO_E

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_m_NEO_E.iloc[:10]

In [None]:
# Permutation importance for the female NEO_O model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_f_NEO_O.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_f_NEO_O.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_f_NEO_O_score = permutation_importance(
    sk_lr, X_f_scaled, y_f['NEO_O'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the female feature names; building from a dict keeps the scores numeric.
importance_f_NEO_O = pd.DataFrame({
    'feature': X_f.columns,
    'score decrease': permutation_f_NEO_O_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_f_NEO_O

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_f_NEO_O.iloc[:10]

In [None]:
# Permutation importance for the male NEO_O model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_m_NEO_O.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_m_NEO_O.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_m_NEO_O_score = permutation_importance(
    sk_lr, X_m_scaled, y_m['NEO_O'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the male feature names; building from a dict keeps the scores numeric.
importance_m_NEO_O = pd.DataFrame({
    'feature': X_m.columns,
    'score decrease': permutation_m_NEO_O_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_m_NEO_O

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_m_NEO_O.iloc[:10]

In [None]:
# Permutation importance for the female NEO_A model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_f_NEO_A.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_f_NEO_A.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_f_NEO_A_score = permutation_importance(
    sk_lr, X_f_scaled, y_f['NEO_A'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the female feature names; building from a dict keeps the scores numeric.
importance_f_NEO_A = pd.DataFrame({
    'feature': X_f.columns,
    'score decrease': permutation_f_NEO_A_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_f_NEO_A

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_f_NEO_A.iloc[:10]

In [None]:
# Permutation importance for the male NEO_A model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_m_NEO_A.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_m_NEO_A.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_m_NEO_A_score = permutation_importance(
    sk_lr, X_m_scaled, y_m['NEO_A'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the male feature names; building from a dict keeps the scores numeric.
importance_m_NEO_A = pd.DataFrame({
    'feature': X_m.columns,
    'score decrease': permutation_m_NEO_A_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_m_NEO_A

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_m_NEO_A.iloc[:10]

In [None]:
# Permutation importance for the female NEO_C model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_f_NEO_C.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_f_NEO_C.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_f_NEO_C_score = permutation_importance(
    sk_lr, X_f_scaled, y_f['NEO_C'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the female feature names; building from a dict keeps the scores numeric.
importance_f_NEO_C = pd.DataFrame({
    'feature': X_f.columns,
    'score decrease': permutation_f_NEO_C_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_f_NEO_C

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_f_NEO_C.iloc[:10]

In [None]:
# Permutation importance for the male NEO_C model.
# permutation_importance needs an sklearn estimator, so an unfitted
# LogisticRegression is populated by hand with the statsmodels coefficients.
# params is label-indexed ('const' first), hence the explicit positional .iloc.
sk_lr = LogisticRegression()
sk_lr.coef_ = results_m_NEO_C.params.iloc[1:].to_numpy().reshape(1, -1)
sk_lr.intercept_ = np.array([results_m_NEO_C.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Score on X without the constant column (the intercept is held in intercept_).
# random_state fixes the shuffles so the ranking is reproducible between runs.
permutation_m_NEO_C_score = permutation_importance(
    sk_lr, X_m_scaled, y_m['NEO_C'], scoring='accuracy', n_repeats=10, random_state=0)
# Label rows with the male feature names; building from a dict keeps the scores numeric.
importance_m_NEO_C = pd.DataFrame({
    'feature': X_m.columns,
    'score decrease': permutation_m_NEO_C_score.importances_mean,
}).sort_values(by='score decrease', ascending=False)
importance_m_NEO_C

In [None]:
# Ten features with the largest score decrease (author's note: the model is just using the size of the largest areas)
importance_m_NEO_C.iloc[:10]

# Linear Regression

Female model

In [None]:
# Ordinary least squares on the continuous (undiscretised) female NEO_N score.
# OLS is solved directly rather than iteratively, so the `maxiter` argument
# copied over from the Logit cells had no effect and is dropped.
results_lin_f_NEO_N = sm.OLS(y_f_disc['NEO_N'], X_f_scaled_int).fit()
results_lin_f_NEO_N.summary()

In [None]:
# In-sample predictions of the continuous NEO_N score for the female group
y_pred_lin_f_NEO_N = results_lin_f_NEO_N.predict(exog=X_f_scaled_int)

In [None]:
# In-sample regression metrics for the female NEO_N linear model
neo_n_true_f = y_f_disc['NEO_N']
mse_f = mean_squared_error(neo_n_true_f, y_pred_lin_f_NEO_N)
rmse_f = math.sqrt(mse_f)
mae_f = mean_absolute_error(neo_n_true_f, y_pred_lin_f_NEO_N)
rsquared_f = r2_score(neo_n_true_f, y_pred_lin_f_NEO_N)
max_error_f = max_error(neo_n_true_f, y_pred_lin_f_NEO_N)
# Report each metric rounded to two decimals
for metric_label, metric_value in (('MSE =', mse_f),
                                   ('RMSE =', rmse_f),
                                   ('MAE =', mae_f),
                                   ('R2 =', rsquared_f),
                                   ('Max Error =', max_error_f)):
    print(metric_label, round(metric_value, 2))

# SVM Classification

In [None]:
# # equivalent but with SGD solver
# from sklearn.linear_model import SGDClassifier
# svc_bis = SGDClassifier(loss='hinge', penalty='l2', alpha=1/10)
# # Plot your instantiated classifier 
# from utils.plots import plot_decision_regions
# plot_decision_regions(X, y, classifier=svm_10) # svm_10 is the model
# # Do train/test split (train_test_split returns X_train, X_test, y_train, y_test)
# X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(X_f,y_f,test_size=0.3)
# X_m_train, X_m_test, y_m_train, y_m_test = train_test_split(X_m,y_m,test_size=0.3)

In [None]:
# Support-vector classifier with an RBF (non-linear) kernel and regularisation strength C = 1
svc_1 = SVC(C=1, kernel='rbf')

In [None]:
# 5-fold cross-validation of the SVC on the female NEO_N target.
# Scaling is done inside the pipeline so each fold's scaler is fitted on that
# fold's training rows only; passing the pre-scaled matrix would leak
# validation-row statistics into training. The raw features are used, so the
# statsmodels-only `const` column is no longer fed to the SVC.
svc_pipeline = make_pipeline(StandardScaler(), svc_1)
cv_results = cross_validate(svc_pipeline, X_f, y_f['NEO_N'], cv=5,
                            scoring=['accuracy', 'precision', 'recall', 'f1'])

In [None]:
# Per-fold results returned by cross_validate for the requested scorers
cv_results

# Logistic Regression with created targets

Use KMeans and Gaussian clustering to generate new targets for the model- This will take into account the overlap of personality types.

In [None]:
# Cluster participants on their five standardised NEO scores with KMeans.
# (Other cells call these labels `knn`; the algorithm used here is KMeans.)
y_sex = sex_df.iloc[:, 6:11]
# n_init is pinned because its default differs between scikit-learn releases,
# and the hand-assigned cluster names below depend on a stable solution.
# NOTE(review): 10 restarts assumed to match the original run - confirm.
clusters = KMeans(n_clusters=5, n_init=10, random_state=0)
# NOTE: this rebinds `scaler`; the Gaussian-mixture cell relies on it to
# convert centres back to the original score scale.
scaler = StandardScaler()
y_scaled = pd.DataFrame(scaler.fit_transform(y_sex), columns=y_sex.columns)
clusters.fit(y_scaled)

# Names are assigned to cluster indices 0-4 by hand; re-check them against the
# printed centres whenever the data or the clustering settings change.
cluster_names = ['Leader', 'Task Orientated', 'Maverick', 'Anxious', 'Workaholic']

centers = pd.DataFrame(scaler.inverse_transform(clusters.cluster_centers_),
                       columns=y_sex.columns, index=cluster_names)
print('KMeans centers')
print('')
print(centers) # The centers may be different from what others use- They often change position too
print('')
print('KMeans normalised')
centers2 = pd.DataFrame(clusters.cluster_centers_, columns=y_sex.columns, index=cluster_names)
centers2

Leader (Calm extrovert open agreeable organised)\
Task Orientated (Calm slightly extrovert close-minded organised)\
Maverick (Slightly neurotic slightly extrovert open-ish disagreeable disorganised)\
Anxious (Anxious introvert open-ish agreeable disorganised)\
Workaholic (Introvert closeminded)

In [None]:
# Luke's five NEO scores as a one-row frame
# NOTE(review): values look like standardised (z) scores - confirm
luke = pd.DataFrame(
    [[-1.28, 1.3, -0.62, 0.65, 0.62]],
    columns=['NEO_N', 'NEO_E', 'NEO_O', 'NEO_A', 'NEO_C'],
    index=['Luke'],
)
luke

In [None]:
# Index of the cluster whose centre is nearest to Luke's scores
# (the clusterer was fitted on standardised scores, so `luke` must be on that scale)
clusters.predict(luke)

In [None]:
# Maria's five NEO scores as a one-row frame
# NOTE(review): values look like standardised (z) scores - confirm
maria = pd.DataFrame(
    [[-1.04, 0.63, 0.65, 1.86, 1.54]],
    columns=['NEO_N', 'NEO_E', 'NEO_O', 'NEO_A', 'NEO_C'],
    index=['Maria'],
)
maria

In [None]:
# Index of the cluster whose centre is nearest to Maria's scores
# (the clusterer was fitted on standardised scores, so `maria` must be on that scale)
clusters.predict(maria)

In [None]:
# Attach the cluster labels to a *copy* of the cleaned table.
# A plain `target_df = sex_df` would add the label column to sex_df as well,
# so re-running the feature cells (columns 11 onwards) would pick the labels
# up as features.
target_df = sex_df.copy()
knn_label = clusters.labels_  # one label per row, in the row order used for fitting
target_df['knn_label'] = knn_label
target_df.knn_label.value_counts()

In [None]:
# Gaussian-mixture clustering of the same standardised NEO scores (5 components)
gm = GaussianMixture(n_components=5, random_state=0)
gaussian_clusters = gm  # both names refer to the same estimator
gaussian_clusters.fit(y_scaled)
gaussian_label = gaussian_clusters.predict(y_scaled)
# Component means converted back to the original score scale
gaussian_centers = pd.DataFrame(scaler.inverse_transform(gaussian_clusters.means_),
                                columns=y_disc.columns)
target_df['gaussian_label'] = gaussian_label
target_df['gaussian_label'].value_counts()

In [None]:
# Show the mixture means on the original scale, then on the standardised scale
# The centers may be different from what others use- They often change position too
print('Gaussian centers', '', gaussian_centers, '', 'Gaussian normalised', sep='\n')
centers2 = pd.DataFrame(gaussian_clusters.means_, columns=y_disc.columns)
centers2

0: Leadership (Calm extrovert slightly agreeable organised)\
1: Isolatedness (Introvert close-minded disagreeable slightly-disorganised)\
2: Efficiency (Agreeable organised)\
3: Anxiety (Anxious introvert open-ish agreeable disorganised)\
4: Criminality (Slightly neurotic slightly extrovert open-ish disagreeable disorganised)

In [None]:
# Cleaned table with the `knn_label` and `gaussian_label` columns attached
target_df

In [None]:
# Split the labelled table by sex and pull out the cluster labels used as targets
is_female = sex_df['sex'] == 'F'
is_male = sex_df['sex'] == 'M'
f_target_df = target_df.loc[is_female].reset_index(drop=True)
m_target_df = target_df.loc[is_male].reset_index(drop=True)
y_f_knn, y_m_knn = f_target_df['knn_label'], m_target_df['knn_label']
y_f_gaussian, y_m_gaussian = f_target_df['gaussian_label'], m_target_df['gaussian_label']

In [None]:
# Share of female participants in each cluster
y_f_knn.value_counts(normalize=True)

In [None]:
# Share of male participants in each cluster
y_m_knn.value_counts(normalize=True)

### KMeans clustering (variables keep the `knn` prefix)

In [None]:
# One-vs-rest logit: is a female participant in cluster 0?
in_cluster_f_0 = y_f_knn.eq(0)
results_f_knn_0 = sm.Logit(in_cluster_f_0, X_f_scaled_int).fit(maxiter=100)
results_f_knn_0.summary()

In [None]:
# One-vs-rest logit: is a female participant in cluster 1?
in_cluster_f_1 = y_f_knn.eq(1)
results_f_knn_1 = sm.Logit(in_cluster_f_1, X_f_scaled_int).fit(maxiter=100)
results_f_knn_1.summary()

In [None]:
# One-vs-rest logit: is a female participant in cluster 2?
in_cluster_f_2 = y_f_knn.eq(2)
results_f_knn_2 = sm.Logit(in_cluster_f_2, X_f_scaled_int).fit(maxiter=100)
results_f_knn_2.summary()

In [None]:
# One-vs-rest logit: is a female participant in cluster 3?
in_cluster_f_3 = y_f_knn.eq(3)
results_f_knn_3 = sm.Logit(in_cluster_f_3, X_f_scaled_int).fit(maxiter=100)
results_f_knn_3.summary()

In [None]:
# One-vs-rest logit: is a female participant in cluster 4?
in_cluster_f_4 = y_f_knn.eq(4)
results_f_knn_4 = sm.Logit(in_cluster_f_4, X_f_scaled_int).fit(maxiter=100)
results_f_knn_4.summary()

In [None]:
# One-vs-rest logit: is a male participant in cluster 0?
in_cluster_m_0 = y_m_knn.eq(0)
results_m_knn_0 = sm.Logit(in_cluster_m_0, X_m_scaled_int).fit(maxiter=100)
results_m_knn_0.summary()

In [None]:
# One-vs-rest logit: is a male participant in cluster 1?
in_cluster_m_1 = y_m_knn.eq(1)
results_m_knn_1 = sm.Logit(in_cluster_m_1, X_m_scaled_int).fit(maxiter=100)
results_m_knn_1.summary()

In [None]:
# One-vs-rest logit: is a male participant in cluster 2?
in_cluster_m_2 = y_m_knn.eq(2)
results_m_knn_2 = sm.Logit(in_cluster_m_2, X_m_scaled_int).fit(maxiter=100)
results_m_knn_2.summary()

In [None]:
# One-vs-rest logit: is a male participant in cluster 3?
in_cluster_m_3 = y_m_knn.eq(3)
results_m_knn_3 = sm.Logit(in_cluster_m_3, X_m_scaled_int).fit(maxiter=100)
results_m_knn_3.summary()

In [None]:
# One-vs-rest logit: is a male participant in cluster 4?
in_cluster_m_4 = y_m_knn.eq(4)
results_m_knn_4 = sm.Logit(in_cluster_m_4, X_m_scaled_int).fit(maxiter=100)
results_m_knn_4.summary()

In [None]:
# In-sample membership probability for each cluster, from the one-vs-rest models.
# The design matrices already contain the intercept column.

# Female models
y_pred_f_knn_0_proba = results_f_knn_0.predict(exog=X_f_scaled_int)
y_pred_f_knn_1_proba = results_f_knn_1.predict(exog=X_f_scaled_int)
y_pred_f_knn_2_proba = results_f_knn_2.predict(exog=X_f_scaled_int)
y_pred_f_knn_3_proba = results_f_knn_3.predict(exog=X_f_scaled_int)
y_pred_f_knn_4_proba = results_f_knn_4.predict(exog=X_f_scaled_int)

# Male models
y_pred_m_knn_0_proba = results_m_knn_0.predict(exog=X_m_scaled_int)
y_pred_m_knn_1_proba = results_m_knn_1.predict(exog=X_m_scaled_int)
y_pred_m_knn_2_proba = results_m_knn_2.predict(exog=X_m_scaled_int)
y_pred_m_knn_3_proba = results_m_knn_3.predict(exog=X_m_scaled_int)
y_pred_m_knn_4_proba = results_m_knn_4.predict(exog=X_m_scaled_int)

In [None]:
# Female participants: one column of membership probability per cluster, in cluster order 0-4
f_proba_columns = ['Leader', 'Task orientatied', 'Maverick', 'Anxious', 'Workaholic']
f_proba_values = [y_pred_f_knn_0_proba, y_pred_f_knn_1_proba, y_pred_f_knn_2_proba,
                  y_pred_f_knn_3_proba, y_pred_f_knn_4_proba]
f_knn_proba = pd.DataFrame(dict(zip(f_proba_columns, f_proba_values)))
f_knn_proba

In [None]:
# Male participants: one column of membership probability per cluster.
# Columns are labelled by cluster index here, unlike the named columns of the female table.
m_proba_values = [y_pred_m_knn_0_proba, y_pred_m_knn_1_proba, y_pred_m_knn_2_proba,
                  y_pred_m_knn_3_proba, y_pred_m_knn_4_proba]
m_knn_proba = pd.DataFrame(dict(zip([0, 1, 2, 3, 4], m_proba_values)))
m_knn_proba

KNN OvM 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Hard-coded value per cluster label.
# NOTE(review): the source/meaning of these numbers is not shown in this cell - confirm.
f_dict = dict([
    ('Leader', 0.57),
    ('Task oriented', 0.53),
    ('Maverick', 0.54),
    ('Anxious', 0.51),
    ('Workaholic', 0.54),
])
# m_dict has the same entries as f_dict except for 'Leader'.
m_dict = {**f_dict, 'Leader': 0.64}

In [None]:
# Predicted cluster = column index of the largest one-vs-rest probability in each row.
print(classification_report(y_f_knn, f_knn_proba.to_numpy().argmax(axis=1)))
print(classification_report(y_m_knn, m_knn_proba.to_numpy().argmax(axis=1)))

## Maria and Luke

In [None]:
mllv = pd.read_excel(path_to_excel+'ml_lv_stats.xlsx')

In [None]:
ml = pd.DataFrame(mllv.loc[1,:][1:]).T
ml

In [None]:
# Normalise features

# scaler_f is presumably the scaler fitted on the female cohort - confirm.
X_ml_scaled = scaler_f.transform(ml.to_numpy())
# Flatten and prepend the intercept value 1 at position 0.
X_ml_scaled_int = np.insert(X_ml_scaled.ravel(), 0, 1)
# Single-row frame labelled 'const' followed by the X_f feature names.
X_ml_df = pd.DataFrame([X_ml_scaled_int], columns=pd.Index(['const']).append(X_f.columns))
X_ml_df

In [None]:
y_pred_ml_knn_0_proba = results_f_knn_0.predict(X_ml_df) # add constant to feat
y_pred_ml_knn_1_proba = results_f_knn_1.predict(X_ml_df) # add constant to feat
y_pred_ml_knn_2_proba = results_f_knn_2.predict(X_ml_df) # add constant to feat
y_pred_ml_knn_3_proba = results_f_knn_3.predict(X_ml_df) # add constant to feat
y_pred_ml_knn_4_proba = results_f_knn_4.predict(X_ml_df) # add constant to feat

In [None]:
ml_knn_proba = pd.DataFrame({'Leader': y_pred_ml_knn_0_proba,
                            'Task orientatied': y_pred_ml_knn_1_proba,
                            'Maverick': y_pred_ml_knn_2_proba,
                            'Anxious': y_pred_ml_knn_3_proba,
                            'Workaholic': y_pred_ml_knn_4_proba})
ml_knn_proba

In [None]:
lv = pd.DataFrame(mllv.loc[0,:][1:]).T
lv

In [None]:
# Normalise features

X_lv_scaled = scaler_m.transform(lv.values)
X_lv_scaled_int = np.insert(X_lv_scaled, 0, 1)
X_lv_df = pd.DataFrame(pd.DataFrame(X_lv_scaled_int)).T
X_lv_df.columns = pd.Index(['const']).append(X_f.columns)
X_lv_df

In [None]:
y_pred_lv_knn_0_proba = results_m_knn_0.predict(X_lv_df) # add constant to feat
y_pred_lv_knn_1_proba = results_m_knn_1.predict(X_lv_df) # add constant to feat
y_pred_lv_knn_2_proba = results_m_knn_2.predict(X_lv_df) # add constant to feat
y_pred_lv_knn_3_proba = results_m_knn_3.predict(X_lv_df) # add constant to feat
y_pred_lv_knn_4_proba = results_m_knn_4.predict(X_lv_df) # add constant to feat

In [None]:
lv_knn_proba = pd.DataFrame({'Leader': y_pred_lv_knn_0_proba,
                            'Task orientatied': y_pred_lv_knn_1_proba,
                            'Maverick': y_pred_lv_knn_2_proba,
                            'Anxious': y_pred_lv_knn_3_proba,
                            'Workaholic': y_pred_lv_knn_4_proba})
lv_knn_proba

### Gaussian clustering

In [None]:
results_f_gaussian_0 = sm.Logit(y_f_gaussian==0, X_f_scaled_int).fit(maxiter=100)
results_f_gaussian_0.summary()

In [None]:
results_m_gaussian_0 = sm.Logit(y_m_gaussian==0, X_m_scaled_int).fit(maxiter=100)
results_m_gaussian_0.summary()

In [None]:
results_f_gaussian_1 = sm.Logit(y_f_gaussian==1, X_f_scaled_int).fit(maxiter=100)
results_f_gaussian_1.summary()

In [None]:
results_m_gaussian_1 = sm.Logit(y_m_gaussian==1, X_m_scaled_int).fit(maxiter=100)
results_m_gaussian_1.summary()

In [None]:
results_f_gaussian_2 = sm.Logit(y_f_gaussian==2, X_f_scaled_int).fit(maxiter=100)
results_f_gaussian_2.summary()

In [None]:
results_m_gaussian_2 = sm.Logit(y_m_gaussian==2, X_m_scaled_int).fit(maxiter=100)
results_m_gaussian_2.summary()

In [None]:
results_f_gaussian_3 = sm.Logit(y_f_gaussian==3, X_f_scaled_int).fit(maxiter=100)
results_f_gaussian_3.summary()

In [None]:
results_m_gaussian_3 = sm.Logit(y_m_gaussian==3, X_m_scaled_int).fit(maxiter=100)
results_m_gaussian_3.summary()

In [None]:
results_f_gaussian_4 = sm.Logit(y_f_gaussian==4, X_f_scaled_int).fit(maxiter=100)
results_f_gaussian_4.summary()

In [None]:
results_m_gaussian_4 = sm.Logit(y_m_gaussian==4, X_m_scaled_int).fit(maxiter=100)
results_m_gaussian_4.summary()

In [None]:
# Make y pred proba for the models

# In-sample predicted probability of each female Gaussian one-vs-rest logit.
(y_pred_f_gaussian_0_proba,
 y_pred_f_gaussian_1_proba,
 y_pred_f_gaussian_2_proba,
 y_pred_f_gaussian_3_proba,
 y_pred_f_gaussian_4_proba) = (
    fitted.predict(X_f_scaled_int)
    for fitted in (results_f_gaussian_0, results_f_gaussian_1, results_f_gaussian_2,
                   results_f_gaussian_3, results_f_gaussian_4)
)

# Same for the male models.
(y_pred_m_gaussian_0_proba,
 y_pred_m_gaussian_1_proba,
 y_pred_m_gaussian_2_proba,
 y_pred_m_gaussian_3_proba,
 y_pred_m_gaussian_4_proba) = (
    fitted.predict(X_m_scaled_int)
    for fitted in (results_m_gaussian_0, results_m_gaussian_1, results_m_gaussian_2,
                   results_m_gaussian_3, results_m_gaussian_4)
)

In [None]:
f_gaussian_proba = pd.DataFrame({0: y_pred_f_gaussian_0_proba,
                                 1: y_pred_f_gaussian_1_proba,
                                 2: y_pred_f_gaussian_2_proba,
                                 3: y_pred_f_gaussian_3_proba,
                                 4: y_pred_f_gaussian_4_proba})
f_gaussian_proba

In [None]:
m_gaussian_proba = pd.DataFrame({0: y_pred_m_gaussian_0_proba,
                                 1: y_pred_m_gaussian_1_proba,
                                 2: y_pred_m_gaussian_2_proba,
                                 3: y_pred_m_gaussian_3_proba,
                                 4: y_pred_m_gaussian_4_proba})
m_gaussian_proba

In [None]:
# Predicted cluster = column index of the largest one-vs-rest probability per row.
print('Female models',
      '',
      classification_report(y_f_gaussian, f_gaussian_proba.to_numpy().argmax(axis=1)),
      '',
      sep='\n')
print('Male models',
      '',
      classification_report(y_m_gaussian, m_gaussian_proba.to_numpy().argmax(axis=1)),
      sep='\n')

## Feature selection created targets

In [None]:
KNN_0_f_best_feats = pd.DataFrame([results_f_knn_0.params, results_f_knn_0.pvalues], index=['coef', 'p-value']).T
KNN_0_f_best_feats[KNN_0_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_1_f_best_feats = pd.DataFrame([results_f_knn_1.params, results_f_knn_1.pvalues], index=['coef', 'p-value']).T
KNN_1_f_best_feats[KNN_1_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_2_f_best_feats = pd.DataFrame([results_f_knn_2.params, results_f_knn_2.pvalues], index=['coef', 'p-value']).T
KNN_2_f_best_feats[KNN_2_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_3_f_best_feats = pd.DataFrame([results_f_knn_3.params, results_f_knn_3.pvalues], index=['coef', 'p-value']).T
KNN_3_f_best_feats[KNN_3_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
# Coefficient and p-value of every term in the female KNN cluster-4 logit.
KNN_4_f_best_feats = pd.DataFrame([results_f_knn_4.params, results_f_knn_4.pvalues], index=['coef', 'p-value']).T
# Terms with p < 0.05, largest coefficient first. The mask must come from this
# model's own p-values (it previously used KNN_1_f_best_feats by mistake).
KNN_4_f_best_feats[KNN_4_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_0_m_best_feats = pd.DataFrame([results_m_knn_0.params, results_m_knn_0.pvalues], index=['coef', 'p-value']).T
KNN_0_m_best_feats[KNN_0_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_1_m_best_feats = pd.DataFrame([results_m_knn_1.params, results_m_knn_1.pvalues], index=['coef', 'p-value']).T
KNN_1_m_best_feats[KNN_1_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_2_m_best_feats = pd.DataFrame([results_m_knn_2.params, results_m_knn_2.pvalues], index=['coef', 'p-value']).T
KNN_2_m_best_feats[KNN_2_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_3_m_best_feats = pd.DataFrame([results_m_knn_3.params, results_m_knn_3.pvalues], index=['coef', 'p-value']).T
KNN_3_m_best_feats[KNN_3_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
KNN_4_m_best_feats = pd.DataFrame([results_m_knn_4.params, results_m_knn_4.pvalues], index=['coef', 'p-value']).T
KNN_4_m_best_feats[KNN_4_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_0_f_best_feats = pd.DataFrame([results_f_gaussian_0.params, results_f_gaussian_0.pvalues], index=['coef', 'p-value']).T
gaussian_0_f_best_feats[gaussian_0_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_1_f_best_feats = pd.DataFrame([results_f_gaussian_1.params, results_f_gaussian_1.pvalues], index=['coef', 'p-value']).T
gaussian_1_f_best_feats[gaussian_1_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_2_f_best_feats = pd.DataFrame([results_f_gaussian_2.params, results_f_gaussian_2.pvalues], index=['coef', 'p-value']).T
gaussian_2_f_best_feats[gaussian_2_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_3_f_best_feats = pd.DataFrame([results_f_gaussian_3.params, results_f_gaussian_3.pvalues], index=['coef', 'p-value']).T
gaussian_3_f_best_feats[gaussian_3_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_4_f_best_feats = pd.DataFrame([results_f_gaussian_4.params, results_f_gaussian_4.pvalues], index=['coef', 'p-value']).T
gaussian_4_f_best_feats[gaussian_4_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_0_m_best_feats = pd.DataFrame([results_m_gaussian_0.params, results_m_gaussian_0.pvalues], index=['coef', 'p-value']).T
gaussian_0_m_best_feats[gaussian_0_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_1_m_best_feats = pd.DataFrame([results_m_gaussian_1.params, results_m_gaussian_1.pvalues], index=['coef', 'p-value']).T
gaussian_1_m_best_feats[gaussian_1_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_2_m_best_feats = pd.DataFrame([results_m_gaussian_2.params, results_m_gaussian_2.pvalues], index=['coef', 'p-value']).T
gaussian_2_m_best_feats[gaussian_2_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_3_m_best_feats = pd.DataFrame([results_m_gaussian_3.params, results_m_gaussian_3.pvalues], index=['coef', 'p-value']).T
gaussian_3_m_best_feats[gaussian_3_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_4_m_best_feats = pd.DataFrame([results_m_gaussian_4.params, results_m_gaussian_4.pvalues], index=['coef', 'p-value']).T
gaussian_4_m_best_feats[gaussian_4_m_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
# Make the sklearn logreg model using results from them sm model
sk_lr = LogisticRegression()
sk_lr.coef_ = np.array([results_f_knn_0.params[1:]])
sk_lr.intercept_ = np.array([results_f_knn_0.params[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
permutation_f_knn_0_score = permutation_importance(sk_lr, X_f_scaled, y_f_knn==0, scoring='accuracy', n_repeats=10) # Perform Permutation
importance_f_knn_0 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_knn_0_score.importances_mean)).T) # Unstack results
importance_f_knn_0.columns=['feature','score decrease']
importance_f_knn_0.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_knn_0.head(20)

In [None]:
# Make the sklearn logreg model using results from them sm model
sk_lr = LogisticRegression()
sk_lr.coef_ = np.array([results_f_knn_1.params[1:]])
sk_lr.intercept_ = np.array([results_f_knn_1.params[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
permutation_f_knn_1_score = permutation_importance(sk_lr, X_f_scaled, y_f_knn==1, scoring='accuracy', n_repeats=10) # Perform Permutation
importance_f_knn_1 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_knn_1_score.importances_mean)).T) # Unstack results
importance_f_knn_1.columns=['feature','score decrease']
importance_f_knn_1.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_knn_1.head(20)

In [None]:
# Make the sklearn logreg model using results from them sm model
sk_lr = LogisticRegression()
sk_lr.coef_ = np.array([results_f_knn_2.params[1:]])
sk_lr.intercept_ = np.array([results_f_knn_2.params[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
permutation_f_knn_2_score = permutation_importance(sk_lr, X_f_scaled, y_f_knn==2, scoring='accuracy', n_repeats=10) # Perform Permutation
importance_f_knn_2 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_knn_2_score.importances_mean)).T) # Unstack results
importance_f_knn_2.columns=['feature','score decrease']
importance_f_knn_2.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_knn_2.head(20)

In [None]:
# Make the sklearn logreg model using results from them sm model
sk_lr = LogisticRegression()
sk_lr.coef_ = np.array([results_f_knn_3.params[1:]])
sk_lr.intercept_ = np.array([results_f_knn_3.params[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
permutation_f_knn_3_score = permutation_importance(sk_lr, X_f_scaled, y_f_knn==3, scoring='accuracy', n_repeats=10) # Perform Permutation
importance_f_knn_3 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_knn_3_score.importances_mean)).T) # Unstack results
importance_f_knn_3.columns=['feature','score decrease']
importance_f_knn_3.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_knn_3.head(20)

In [None]:
# Make the sklearn logreg model using results from them sm model
sk_lr = LogisticRegression()
sk_lr.coef_ = np.array([results_f_knn_4.params[1:]])
sk_lr.intercept_ = np.array([results_f_knn_4.params[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
permutation_f_knn_4_score = permutation_importance(sk_lr, X_f_scaled, y_f_knn==4, scoring='accuracy', n_repeats=10) # Perform Permutation
importance_f_knn_4 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_knn_4_score.importances_mean)).T) # Unstack results
importance_f_knn_4.columns=['feature','score decrease']
importance_f_knn_4.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_knn_4.head(20)

In [None]:
# Make the sklearn logreg model using results from the statsmodels model
# (an unfitted LogisticRegression whose coefficients are set by hand so that
# permutation_importance can score it).
sk_lr = LogisticRegression()
# Take intercept/slopes by position with .iloc; plain params[0] on a
# label-indexed Series relies on a deprecated positional fallback.
sk_lr.coef_ = np.array([results_f_gaussian_0.params.iloc[1:]])
sk_lr.intercept_ = np.array([results_f_gaussian_0.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
# Score against the Gaussian cluster labels this model was fitted on
# (the KNN labels y_f_knn were used here by mistake).
permutation_f_gaussian_0_score = permutation_importance(sk_lr, X_f_scaled, y_f_gaussian==0, scoring='accuracy', n_repeats=10) # Perform Permutation
# NOTE(review): feature names are taken from X; assumes X has the same columns,
# in the same order, as X_f_scaled - confirm.
importance_f_gaussian_0 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_gaussian_0_score.importances_mean)).T) # Unstack results
importance_f_gaussian_0.columns=['feature','score decrease']
importance_f_gaussian_0.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_gaussian_0

In [None]:
# Make the sklearn logreg model using results from the statsmodels model
# (an unfitted LogisticRegression whose coefficients are set by hand so that
# permutation_importance can score it).
sk_lr = LogisticRegression()
# Take intercept/slopes by position with .iloc; plain params[0] on a
# label-indexed Series relies on a deprecated positional fallback.
sk_lr.coef_ = np.array([results_f_gaussian_1.params.iloc[1:]])
sk_lr.intercept_ = np.array([results_f_gaussian_1.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
# Score against the Gaussian cluster labels this model was fitted on
# (the KNN labels y_f_knn were used here by mistake).
permutation_f_gaussian_1_score = permutation_importance(sk_lr, X_f_scaled, y_f_gaussian==1, scoring='accuracy', n_repeats=10) # Perform Permutation
# NOTE(review): feature names are taken from X; assumes X has the same columns,
# in the same order, as X_f_scaled - confirm.
importance_f_gaussian_1 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_gaussian_1_score.importances_mean)).T) # Unstack results
importance_f_gaussian_1.columns=['feature','score decrease']
importance_f_gaussian_1.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_gaussian_1

In [None]:
# Make the sklearn logreg model using results from the statsmodels model
# (an unfitted LogisticRegression whose coefficients are set by hand so that
# permutation_importance can score it).
sk_lr = LogisticRegression()
# Take intercept/slopes by position with .iloc; plain params[0] on a
# label-indexed Series relies on a deprecated positional fallback.
sk_lr.coef_ = np.array([results_f_gaussian_2.params.iloc[1:]])
sk_lr.intercept_ = np.array([results_f_gaussian_2.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
# Score against the Gaussian cluster labels this model was fitted on
# (the KNN labels y_f_knn were used here by mistake).
permutation_f_gaussian_2_score = permutation_importance(sk_lr, X_f_scaled, y_f_gaussian==2, scoring='accuracy', n_repeats=10) # Perform Permutation
# NOTE(review): feature names are taken from X; assumes X has the same columns,
# in the same order, as X_f_scaled - confirm.
importance_f_gaussian_2 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_gaussian_2_score.importances_mean)).T) # Unstack results
importance_f_gaussian_2.columns=['feature','score decrease']
importance_f_gaussian_2.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_gaussian_2

In [None]:
# Make the sklearn logreg model using results from the statsmodels model
# (an unfitted LogisticRegression whose coefficients are set by hand so that
# permutation_importance can score it).
sk_lr = LogisticRegression()
# Take intercept/slopes by position with .iloc; plain params[0] on a
# label-indexed Series relies on a deprecated positional fallback.
sk_lr.coef_ = np.array([results_f_gaussian_3.params.iloc[1:]])
sk_lr.intercept_ = np.array([results_f_gaussian_3.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
# Score against the Gaussian cluster labels this model was fitted on
# (the KNN labels y_f_knn were used here by mistake).
permutation_f_gaussian_3_score = permutation_importance(sk_lr, X_f_scaled, y_f_gaussian==3, scoring='accuracy', n_repeats=10) # Perform Permutation
# NOTE(review): feature names are taken from X; assumes X has the same columns,
# in the same order, as X_f_scaled - confirm.
importance_f_gaussian_3 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_gaussian_3_score.importances_mean)).T) # Unstack results
importance_f_gaussian_3.columns=['feature','score decrease']
importance_f_gaussian_3.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_gaussian_3

In [None]:
# Make the sklearn logreg model using results from the statsmodels model
# (an unfitted LogisticRegression whose coefficients are set by hand so that
# permutation_importance can score it).
sk_lr = LogisticRegression()
# Take intercept/slopes by position with .iloc; plain params[0] on a
# label-indexed Series relies on a deprecated positional fallback.
sk_lr.coef_ = np.array([results_f_gaussian_4.params.iloc[1:]])
sk_lr.intercept_ = np.array([results_f_gaussian_4.params.iloc[0]])
sk_lr.classes_ = np.array([0, 1])
# Selection through multivariate analysis- showing the absolute value that feat contributes to score
# Make sure to use X without the constant
# Score against the Gaussian cluster labels this model was fitted on
# (the KNN labels y_f_knn were used here by mistake).
permutation_f_gaussian_4_score = permutation_importance(sk_lr, X_f_scaled, y_f_gaussian==4, scoring='accuracy', n_repeats=10) # Perform Permutation
# NOTE(review): feature names are taken from X; assumes X has the same columns,
# in the same order, as X_f_scaled - confirm.
importance_f_gaussian_4 = pd.DataFrame(np.vstack((X.columns,\
        permutation_f_gaussian_4_score.importances_mean)).T) # Unstack results
importance_f_gaussian_4.columns=['feature','score decrease']
importance_f_gaussian_4.sort_values(by="score decrease", ascending = False, inplace=True) # Order by importance
importance_f_gaussian_4

## Logistic regression with thickness and volume data only

Thickness data

In [None]:
lt_feat = list(X_m_scaled_int.columns)[:36]
rt_feat = list(X_m_scaled_int.columns)[38:73]

In [None]:
X_f_scaled_int[lt_feat + rt_feat]

In [None]:
results_thick_f_gaussian_0 = sm.Logit(y_f_gaussian==0, X_f_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_f_gaussian_1 = sm.Logit(y_f_gaussian==1, X_f_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_f_gaussian_2 = sm.Logit(y_f_gaussian==2, X_f_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_f_gaussian_3 = sm.Logit(y_f_gaussian==3, X_f_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_f_gaussian_4 = sm.Logit(y_f_gaussian==4, X_f_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)

In [None]:
results_thick_m_gaussian_0 = sm.Logit(y_m_gaussian==0, X_m_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_m_gaussian_1 = sm.Logit(y_m_gaussian==1, X_m_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_m_gaussian_2 = sm.Logit(y_m_gaussian==2, X_m_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_m_gaussian_3 = sm.Logit(y_m_gaussian==3, X_m_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)
results_thick_m_gaussian_4 = sm.Logit(y_m_gaussian==4, X_m_scaled_int[lt_feat + rt_feat]).fit(maxiter=100)

In [None]:
# Make y pred proba for the models

# In-sample probabilities of the thickness-only female models.
(y_pred_thick_f_gaussian_0_proba,
 y_pred_thick_f_gaussian_1_proba,
 y_pred_thick_f_gaussian_2_proba,
 y_pred_thick_f_gaussian_3_proba,
 y_pred_thick_f_gaussian_4_proba) = (
    fitted.predict(X_f_scaled_int[lt_feat + rt_feat])
    for fitted in (results_thick_f_gaussian_0, results_thick_f_gaussian_1,
                   results_thick_f_gaussian_2, results_thick_f_gaussian_3,
                   results_thick_f_gaussian_4)
)

# In-sample probabilities of the thickness-only male models.
(y_pred_thick_m_gaussian_0_proba,
 y_pred_thick_m_gaussian_1_proba,
 y_pred_thick_m_gaussian_2_proba,
 y_pred_thick_m_gaussian_3_proba,
 y_pred_thick_m_gaussian_4_proba) = (
    fitted.predict(X_m_scaled_int[lt_feat + rt_feat])
    for fitted in (results_thick_m_gaussian_0, results_thick_m_gaussian_1,
                   results_thick_m_gaussian_2, results_thick_m_gaussian_3,
                   results_thick_m_gaussian_4)
)

In [None]:
# Thickness-only probabilities, one column per Gaussian cluster index 0..4.
f_thick_gaussian_proba = pd.DataFrame(dict(enumerate([
    y_pred_thick_f_gaussian_0_proba,
    y_pred_thick_f_gaussian_1_proba,
    y_pred_thick_f_gaussian_2_proba,
    y_pred_thick_f_gaussian_3_proba,
    y_pred_thick_f_gaussian_4_proba,
])))

m_thick_gaussian_proba = pd.DataFrame(dict(enumerate([
    y_pred_thick_m_gaussian_0_proba,
    y_pred_thick_m_gaussian_1_proba,
    y_pred_thick_m_gaussian_2_proba,
    y_pred_thick_m_gaussian_3_proba,
    y_pred_thick_m_gaussian_4_proba,
])))

In [None]:
# Predicted cluster = column index of the largest thickness-only probability per row.
print(classification_report(y_f_gaussian, f_thick_gaussian_proba.to_numpy().argmax(axis=1)))
print(classification_report(y_m_gaussian, m_thick_gaussian_proba.to_numpy().argmax(axis=1)))

Volume data

In [None]:
# Column names picked by position in the female design matrix: position 0,
# positions 36..37, and everything from position 73 onwards.
vol_feat_1 = X_f_scaled_int.columns[0]
vol_feat_2 = X_f_scaled_int.columns[36:38].tolist()
vol_feat_3 = X_f_scaled_int.columns[73:].tolist()
X_f_scaled_int.loc[:, [vol_feat_1, *vol_feat_2, *vol_feat_3]]

In [None]:
results_vol_f_gaussian_0 = sm.Logit(y_f_gaussian==0, X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_f_gaussian_1 = sm.Logit(y_f_gaussian==1, X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_f_gaussian_2 = sm.Logit(y_f_gaussian==2, X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_f_gaussian_3 = sm.Logit(y_f_gaussian==3, X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_f_gaussian_4 = sm.Logit(y_f_gaussian==4, X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)

In [None]:
results_vol_m_gaussian_0 = sm.Logit(y_m_gaussian==0, X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_m_gaussian_1 = sm.Logit(y_m_gaussian==1, X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_m_gaussian_2 = sm.Logit(y_m_gaussian==2, X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_m_gaussian_3 = sm.Logit(y_m_gaussian==3, X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)
results_vol_m_gaussian_4 = sm.Logit(y_m_gaussian==4, X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3]).fit(maxiter=100)

In [None]:
# In-sample probabilities of the volume-only female models.
(y_pred_vol_f_gaussian_0_proba,
 y_pred_vol_f_gaussian_1_proba,
 y_pred_vol_f_gaussian_2_proba,
 y_pred_vol_f_gaussian_3_proba,
 y_pred_vol_f_gaussian_4_proba) = (
    fitted.predict(X_f_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3])
    for fitted in (results_vol_f_gaussian_0, results_vol_f_gaussian_1,
                   results_vol_f_gaussian_2, results_vol_f_gaussian_3,
                   results_vol_f_gaussian_4)
)

# In-sample probabilities of the volume-only male models.
(y_pred_vol_m_gaussian_0_proba,
 y_pred_vol_m_gaussian_1_proba,
 y_pred_vol_m_gaussian_2_proba,
 y_pred_vol_m_gaussian_3_proba,
 y_pred_vol_m_gaussian_4_proba) = (
    fitted.predict(X_m_scaled_int[[vol_feat_1] + vol_feat_2 + vol_feat_3])
    for fitted in (results_vol_m_gaussian_0, results_vol_m_gaussian_1,
                   results_vol_m_gaussian_2, results_vol_m_gaussian_3,
                   results_vol_m_gaussian_4)
)

In [None]:
# Volume-only probabilities, one column per Gaussian cluster index 0..4.
# Each column must hold the prediction of its own cluster's model; previously all
# five columns reused the cluster-0 prediction, so the row-wise argmax was always 0.
f_vol_gaussian_proba = pd.DataFrame({0: y_pred_vol_f_gaussian_0_proba,
                                     1: y_pred_vol_f_gaussian_1_proba,
                                     2: y_pred_vol_f_gaussian_2_proba,
                                     3: y_pred_vol_f_gaussian_3_proba,
                                     4: y_pred_vol_f_gaussian_4_proba})

m_vol_gaussian_proba = pd.DataFrame({0: y_pred_vol_m_gaussian_0_proba,
                                     1: y_pred_vol_m_gaussian_1_proba,
                                     2: y_pred_vol_m_gaussian_2_proba,
                                     3: y_pred_vol_m_gaussian_3_proba,
                                     4: y_pred_vol_m_gaussian_4_proba})

In [None]:
# Predicted cluster = column index of the largest volume-only probability per row.
print(classification_report(y_f_gaussian, f_vol_gaussian_proba.to_numpy().argmax(axis=1)))
print(classification_report(y_m_gaussian, m_vol_gaussian_proba.to_numpy().argmax(axis=1)))

In [None]:
gaussian_0_thick_f_best_feats = pd.DataFrame([results_thick_f_gaussian_0.params, results_thick_f_gaussian_0.pvalues], index=['coef', 'p-value']).T
gaussian_0_thick_f_best_feats[gaussian_0_thick_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_1_thick_f_best_feats = pd.DataFrame([results_thick_f_gaussian_1.params, results_thick_f_gaussian_1.pvalues], index=['coef', 'p-value']).T
gaussian_1_thick_f_best_feats[gaussian_1_thick_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_2_thick_f_best_feats = pd.DataFrame([results_thick_f_gaussian_2.params, results_thick_f_gaussian_2.pvalues], index=['coef', 'p-value']).T
gaussian_2_thick_f_best_feats[gaussian_2_thick_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_3_thick_f_best_feats = pd.DataFrame([results_thick_f_gaussian_3.params, results_thick_f_gaussian_3.pvalues], index=['coef', 'p-value']).T
gaussian_3_thick_f_best_feats[gaussian_3_thick_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

In [None]:
gaussian_4_thick_f_best_feats = pd.DataFrame([results_thick_f_gaussian_4.params, results_thick_f_gaussian_4.pvalues], index=['coef', 'p-value']).T
gaussian_4_thick_f_best_feats[gaussian_4_thick_f_best_feats['p-value']<0.05].sort_values('coef', ascending=False)

## Testing quartile targets

In [None]:
from sklearn.preprocessing import QuantileTransformer
qt=QuantileTransformer(n_quantiles=1350)

In [None]:
y_disc

In [None]:
qt.fit_transform(y_disc)

In [None]:
quantile_df=pd.DataFrame(qt.fit_transform(y_disc),columns=y_disc.columns)
quantile_df

In [None]:
quant_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
quantile_df=pd.DataFrame(quant_imputer.fit_transform(quantile_df),columns=quantile_df.columns)

In [None]:
quantile_kmeans=KMeans(n_clusters=5)

In [None]:
quantile_kmeans.fit_predict(quantile_df)

In [None]:
quantile_kmeans.labels_

In [None]:
quantile_kmeans.cluster_centers_

In [None]:
quant_target=quantile_kmeans.labels_
df['quant_target']=quant_target
df

In [None]:
y_quant=df['quant_target']
y_quant

In [None]:
df

In [None]:
# Recode the sex column to 'F'/'M' and split the frame by sex.
# sex_df2 is an alias of df (not a copy), so the recoding is visible through df too.
sex_df2 = df
# Assign the recoded column back explicitly: chained `df.sex.replace(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to update the frame under
# pandas Copy-on-Write.
# NOTE(review): missing sex values are recoded as 'F', as in the original code -
# confirm this imputation is intended.
sex_df2['sex'] = sex_df2['sex'].replace({'female': 'F', 'male': 'M'}).fillna('F')
f_df2 = sex_df2[sex_df2['sex'] == 'F']
m_df2 = sex_df2[sex_df2['sex'] == 'M']
sex_df2['sex'].unique()

In [None]:
# Feature creation
X_f2 = f_df2.iloc[:,11:].reset_index(drop=True).drop(columns='quant_target')
X_m2 = m_df2.iloc[:,11:].reset_index(drop=True).drop(columns='quant_target')
y_f2 = f_df2.quant_target.reset_index(drop=True)
y_m2 = m_df2.quant_target.reset_index(drop=True)

In [None]:
# Add a constant as a feature to X
# NOTE(review): X_f2 / X_m2 come straight from df via iloc; no scaler is applied
# to them in this section - confirm whether scaling was intended.
X_f_2_int = pd.DataFrame(
    sm.add_constant(X_f2),
    columns=pd.Index(['const']).append(X_f2.columns),
)

# The male frame is labelled with the female column names, as in the original.
X_m_2_int = pd.DataFrame(
    sm.add_constant(X_m2),
    columns=pd.Index(['const']).append(X_f2.columns),
)

In [None]:
# One-vs-rest logit, female cohort: quantile-KMeans cluster 0 against the rest.
results_f_q_0 = sm.Logit(endog=(y_f2 == 0), exog=X_f_2_int).fit(maxiter=1000)
results_f_q_0.summary()

In [None]:
# results_f_q_1 = sm.Logit(y_f2==1, X_f_2_int).fit(maxiter=1000)
# results_f_q_2 = sm.Logit(y_f2==2, X_f_2_int).fit(maxiter=1000)
# results_f_q_3 = sm.Logit(y_f2==3, X_f_2_int).fit(maxiter=1000)
# results_f_q_4 = sm.Logit(y_f2==4, X_f_2_int).fit(maxiter=1000)

# results_m_q_0 = sm.Logit(y_m2==0, X_m_2_int).fit(maxiter=1000)
# results_m_q_1 = sm.Logit(y_m2==1, X_m_2_int).fit(maxiter=1000)
# results_m_q_2 = sm.Logit(y_m2==2, X_m_2_int).fit(maxiter=1000)
# results_m_q_3 = sm.Logit(y_m2==3, X_m_2_int).fit(maxiter=1000)
# results_m_q_4 = sm.Logit(y_m2==4, X_m_2_int).fit(maxiter=1000)

In [None]:
# y_pred_f_q_0_proba = results_f_q_0.predict(X_f_2_int) # add constant to feat
# y_pred_f_q_1_proba = results_f_q_1.predict(X_f_2_int)
# y_pred_f_q_2_proba = results_f_q_2.predict(X_f_2_int) # add constant to feat
# y_pred_f_q_3_proba = results_f_q_3.predict(X_f_2_int)
# y_pred_f_q_4_proba = results_f_q_4.predict(X_f_2_int) # add constant to feat
# y_pred_m_q_0_proba = results_m_q_0.predict(X_m_2_int)
# y_pred_m_q_1_proba = results_m_q_1.predict(X_m_2_int) # add constant to feat
# y_pred_m_q_2_proba = results_m_q_2.predict(X_m_2_int)
# y_pred_m_q_3_proba = results_m_q_3.predict(X_m_2_int) # add constant to feat
# y_pred_m_q_4_proba = results_m_q_4.predict(X_m_2_int)

In [None]:
# f_g_proba = pd.DataFrame({0: y_pred_f_q_0_proba,
#                                  1: y_pred_f_q_1_proba,
#                                  2: y_pred_f_q_2_proba,
#                                  3: y_pred_f_q_3_proba,
#                                  4: y_pred_f_q_4_proba})
# f_g_proba

In [None]:
# m_g_proba = pd.DataFrame({0: y_pred_m_q_0_proba,
#                                  1: y_pred_m_q_1_proba,
#                                  2: y_pred_m_q_2_proba,
#                                  3: y_pred_m_q_3_proba,
#                                  4: y_pred_m_q_4_proba})
# m_g_proba

In [None]:
# print(classification_report(y_f2, np.argmax(f_g_proba.values, axis = 1)))
# print(classification_report(y_m2, np.argmax(m_g_proba.values, axis = 1)))

# Notes

In [None]:
# Do permutation and coef with sex. Cars exercise.
# permutation is taking out one col at a time and working out the drop in score of the sklearn model- R2 in regression, acc in classification
# VIF- only good for linear models
# Work out different models for sex
# Work out for different characteristics