<a href="https://colab.research.google.com/github/nghitct/AlgorithmSupportedInductionPersonality/blob/main/TeamPersonality_Analysis_Compacted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preparing ...

In [8]:
#import basic libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings(action='once')
import itertools
import re

#import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.preprocessing import StandardScaler 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Mount Google Drive (Colab only) so the CSV files under /content/gdrive
# that the cells below read and write are reachable.
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Load Data

In [9]:
# Read the two team-level data sets from the mounted Drive.
dat1 = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-Data1-full.csv")
dat2 = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-Data2-full.csv")

# Remove the columns that are not used below; the team identifier column is
# called 'GroupID' in the first file and 'teamid' in the second.
dat1 = dat1.drop(columns=['gender_all_sd','gender_exc_sd','size','size_all','GroupID'])
dat2 = dat2.drop(columns=['gender_all_sd','gender_exc_sd','size','size_all','teamid'])

# Replace 'neur' with 'emos' in dat2's column names so that they match the
# 'emos_*' names listed in the feature lists below.
dat2_names = [re.sub('neur','emos',col) for col in dat2.columns]
dat2.columns = dat2_names

# Feature set built from the '*_exc_*' aggregates plus the leader columns.
features_exc = [
    'emos_exc_mean', 'extr_exc_mean', 'open_exc_mean', 'agree_exc_mean', 'cons_exc_mean',
    'emos_exc_sd', 'extr_exc_sd', 'open_exc_sd', 'agree_exc_sd', 'cons_exc_sd',
    'emos_exc_min', 'extr_exc_min', 'open_exc_min', 'agree_exc_min', 'cons_exc_min',
    'emos_exc_max', 'extr_exc_max', 'open_exc_max', 'agree_exc_max', 'cons_exc_max',
    'gender_exc_mean', 'gender_leader',
    'emos_leader', 'extr_leader', 'open_leader', 'agree_leader', 'cons_leader',
]

# Feature set built from the '*_all_*' aggregates plus the leader columns.
features_all = [
    'emos_all_mean', 'extr_all_mean', 'open_all_mean', 'agree_all_mean', 'cons_all_mean',
    'emos_all_sd', 'extr_all_sd', 'open_all_sd', 'agree_all_sd', 'cons_all_sd',
    'emos_all_min', 'extr_all_min', 'open_all_min', 'agree_all_min', 'cons_all_min',
    'emos_all_max', 'extr_all_max', 'open_all_max', 'agree_all_max', 'cons_all_max',
    'gender_all_mean', 'gender_leader',
    'emos_leader', 'extr_leader', 'open_leader', 'agree_leader', 'cons_leader',
]

# Define functions

In [14]:
def _high_level_vars(data):
  """Return a copy of `data` extended with pairwise products and squares.

  For every unordered pair of original columns (a, b) a column named "a*b"
  holding their element-wise product is appended; afterwards one column per
  original column, named with a trailing "2" (e.g. "a2"), holds its square.

  Parameters
  ----------
  data : pandas.DataFrame
    Feature table. It is left untouched.

  Returns
  -------
  pandas.DataFrame
    New frame: original columns, then the interaction columns, then the
    squared columns.
  """
  # Build on a copy: writing the new columns into the argument would silently
  # widen the caller's frame (and any frame it is a slice of).
  expanded = data.copy()
  features = list(data.columns)
  # combinations() yields pairs in the same (i < j) order as a nested index loop.
  for var1, var2 in itertools.combinations(features, 2):
    name = var1 + "*" + var2
    expanded[name] = data[var1] * data[var2]
  for var in features:
    name = var + "2"
    expanded[name] = data[var] * data[var]
  return expanded

def _AIC_linearmodel(X, y, features):
  """Fit an OLS regression of `y` on the columns `features` of `X`.

  The design matrix is exactly `X[list(features)]`; callers that want an
  intercept list a constant column (e.g. 'const') among `features`.

  Returns a dict with the fitted results object under 'model' and its
  `aic` attribute under 'AIC'.
  """
  design = X[list(features)]
  fitted = sm.OLS(y, design).fit()
  return {'model': fitted, 'AIC': fitted.aic}

def _forward(X, y, predictors):
  """Forward step: try adding each unused column of `X` to `predictors`.

  Every column of `X` other than 'const' that is not already in `predictors`
  is added in turn (together with 'const'), the model is fitted, and the row
  ('model', 'AIC') of the candidate with the smallest AIC is returned.
  """
  candidates = [col for col in X.columns.difference(['const']) if col not in predictors]
  fits = [_AIC_linearmodel(X, y, features=predictors + [col] + ['const'])
          for col in candidates]
  table = pd.DataFrame(fits)
  winner = table['AIC'].argmin()
  return table.loc[winner]

def _backward(X,y,predictors):
  """Backward step: try dropping each single entry of `predictors`.

  Each subset of `predictors` with one element removed is fitted together
  with 'const'; the row ('model', 'AIC') of the subset with the smallest AIC
  is returned.
  """
  keep = len(predictors) - 1
  fits = [_AIC_linearmodel(X=X, y=y, features=list(subset) + ['const'])
          for subset in itertools.combinations(predictors, keep)]
  table = pd.DataFrame(fits)
  winner = table['AIC'].argmin()
  return table.loc[winner]

def _stepwise_model(X,y):
  """Stepwise (forward with backward check) OLS selection driven by AIC.

  Starting from the intercept-only model, each round adds the predictor that
  gives the lowest AIC (`_forward`) and then checks whether dropping one
  predictor from that model lowers the AIC further (`_backward`). The search
  stops as soon as a round fails to improve on the best AIC seen so far.

  Parameters
  ----------
  X : pandas.DataFrame
    Candidate predictors; must contain a constant column named 'const'.
  y : array-like
    Response.

  Returns
  -------
  The fitted statsmodels results object of the lowest-AIC model found
  (the intercept-only model if no predictor improves on it).
  """
  # Fixes over the previous version:
  #  * the `return` was inside the loop body, so the search always ended
  #    after the first forward step (and returned None on the `break` path);
  #  * the model that worsened the AIC is no longer the one returned;
  #  * a backward step no longer overwrites the reference AIC before the
  #    stopping test, which made that test impossible to trigger.
  n_candidates = len(X.columns.difference(['const']))
  best = _AIC_linearmodel(X, y, ['const'])
  predictors = []
  for _ in range(n_candidates):
    step = _forward(X=X, y=y, predictors=predictors)  # 'const' is added inside
    step_predictors = [k for k in step['model'].model.exog_names if k != 'const']
    # Check whether removing one predictor beats the model just built.
    reduced = _backward(X=X, y=y, predictors=step_predictors)
    if reduced['AIC'] < step['AIC']:
      step = reduced
      step_predictors = [k for k in step['model'].model.exog_names if k != 'const']
    # Require a strict improvement; this also prevents add/drop cycling.
    if step['AIC'] >= best['AIC']:
      break
    best = {'model': step['model'], 'AIC': step['AIC']}
    predictors = step_predictors
  return best['model']

def _fs_swAIC(X,y):
  """Select features via `_stepwise_model` and a p-value filter.

  Returns the names of the terms of the stepwise model whose p-value is
  at most 0.05, with 'const' left out.
  """
  pvalues = _stepwise_model(X, y).pvalues
  significant = pvalues.index[pvalues <= 0.05]
  return [name for name in significant if name != 'const']

def _fs_randomforest(X,y,params,cv,n_iter,n_vars,randomstate):
  """Rank features with a tuned random forest and return the top `n_vars`.

  Parameters
  ----------
  X : pandas.DataFrame
    Predictors; the column names are what is returned.
  y : array-like
    Response.
  params : dict
    Parameter distributions for `RandomizedSearchCV`.
  cv : int or CV splitter
    Cross-validation setting for the search.
  n_iter : int
    Number of parameter settings sampled by the search.
  n_vars : int
    Number of feature names to return.
  randomstate : int
    Seed for the search and for the forests themselves.

  Returns
  -------
  list
    Column names of `X` ordered by decreasing feature importance,
    truncated to `n_vars`.
  """
  # Seed the estimator as well as the search: previously only the candidate
  # sampling was seeded, so the forests (and the resulting ranking) changed
  # from run to run.
  rf = RandomForestRegressor(random_state=randomstate)
  # Search across parameter combinations using all available cores.
  rf_random = RandomizedSearchCV(estimator = rf,
                                 param_distributions = params,
                                 n_iter = n_iter, cv = cv,
                                 verbose=2,
                                 random_state=randomstate,
                                 n_jobs = -1)
  rf_random.fit(X,y)
  # Refit on all rows with the best setting; an explicit 'random_state' in the
  # searched parameters takes precedence over `randomstate`.
  final_kwargs = {'random_state': randomstate, **rf_random.best_params_}
  rf_final = RandomForestRegressor(**final_kwargs)
  rf_final.fit(X, y)
  ranked = sorted(zip(X.columns, rf_final.feature_importances_),
                  key = lambda pair : pair[1], reverse=True)
  return [name for name, _ in ranked[:n_vars]]

def _fs_gbr(X,y,params,cv,n_job,n_vars,randomstate):
  """Rank features with a tuned gradient-boosting model; return the top `n_vars`.

  Parameters
  ----------
  X : pandas.DataFrame
    Predictors; the column names are what is returned.
  y : array-like
    Response.
  params : dict
    Parameter distributions for `RandomizedSearchCV`.
  cv : int or CV splitter
    Cross-validation setting for the search.
  n_job : int
    Number of parameter settings sampled by the search. The name is kept for
    backward compatibility; the callers in this file pass `n_iter` here.
  n_vars : int
    Number of feature names to return.
  randomstate : int
    Seed for the search and for the boosting models.

  Returns
  -------
  list
    Column names of `X` ordered by decreasing feature importance,
    truncated to `n_vars`.
  """
  # The body used to read a *global* `n_iter` and ignore this parameter, so
  # the function failed (or used a stale value) unless that global existed.
  n_iter = n_job
  gb = GradientBoostingRegressor(random_state=randomstate)
  gb_random = RandomizedSearchCV(estimator = gb,
                                 param_distributions = params,
                                 scoring = 'neg_mean_absolute_error',
                                 n_iter = n_iter,
                                 cv = cv,
                                 refit = True,
                                 return_train_score = True,
                                 random_state = randomstate)
  gb_random.fit(X,y)
  # Refit on all rows with the best setting, seeded for reproducibility; an
  # explicit 'random_state' in the searched parameters takes precedence.
  final_kwargs = {'random_state': randomstate, **gb_random.best_params_}
  gb_final = GradientBoostingRegressor(**final_kwargs)
  gb_final.fit(X,y)
  ranked = sorted(zip(X.columns, gb_final.feature_importances_),
                  key = lambda pair : pair[1], reverse=True)
  return [name for name, _ in ranked[:n_vars]]

def _fs_lasso(X,y,cvparams,alphas,n_vars):
  """Rank features by absolute Lasso coefficient; return the top `n_vars`.

  Parameters
  ----------
  X : pandas.DataFrame
    Predictors; the column names are what is returned.
  y : array-like
    Response.
  cvparams : dict
    Keyword arguments for `RepeatedKFold` (n_splits, n_repeats, random_state).
  alphas : array-like
    Candidate regularisation strengths for `LassoCV`.
  n_vars : int
    Number of feature names to return.

  Returns
  -------
  list
    Column names of `X` ordered by decreasing |coefficient|, truncated to
    `n_vars`. Columns with a zero coefficient are not filtered out, so they
    can appear when fewer than `n_vars` coefficients are non-zero.
  """
  # Honour the caller's CV settings; they used to be ignored in favour of
  # hard-coded values (10 splits, 3 repeats, seed 123456), unlike
  # _fs_ElasticNet, which already used **cvparams.
  cv = RepeatedKFold(**cvparams)
  # Choose alpha by cross-validation ...
  ls = LassoCV(alphas=alphas, cv=cv, n_jobs=-1)
  ls.fit(X,y)
  # ... then refit a plain Lasso at that alpha on all rows.
  ls_final = Lasso(alpha=ls.alpha_)
  ls_final.fit(X,y)
  ranked = sorted(zip(X.columns, abs(ls_final.coef_)),
                  key = lambda pair : pair[1], reverse=True)
  return [name for name, _ in ranked[:n_vars]]

def _fs_ElasticNet(X,y,cvparams,ratios,alphas,n_vars):
  """Rank features by absolute Elastic Net coefficient; return the top `n_vars`.

  `cvparams` are the keyword arguments for `RepeatedKFold`; `ratios` and
  `alphas` are the l1_ratio and alpha candidates handed to `ElasticNetCV`.
  The (alpha, l1_ratio) pair picked by cross-validation is refitted with a
  plain `ElasticNet` on all rows, and the column names of `X` are returned in
  order of decreasing |coefficient|, truncated to `n_vars`.
  """
  splitter = RepeatedKFold(**cvparams)
  search = ElasticNetCV(l1_ratio=ratios, alphas=alphas, cv=splitter, n_jobs=-1)
  search.fit(X,y)
  final = ElasticNet(alpha=search.alpha_, l1_ratio=search.l1_ratio_)
  final.fit(X,y)
  ranked = sorted(zip(X.columns, abs(final.coef_)),
                  key = lambda pair : pair[1], reverse=True)
  return [name for name, _ in ranked[:n_vars]]

def _test_hypothesis(X,y,predictors):
  """Fit OLS of `y` on `predictors` (plus intercept) and tabulate the estimates.

  Parameters
  ----------
  X : pandas.DataFrame
    Table containing at least the columns named in `predictors`.
  y : array-like
    Response.
  predictors : list
    Column names to include in the regression.

  Returns
  -------
  pandas.DataFrame
    One row per predictor with columns 'Coef', 'p-value' and 'vars' (the
    predictor name), indexed 0..k-1. The intercept row is left out.
  """
  X_min = sm.add_constant(X[predictors])
  model = sm.OLS(y, X_min).fit()
  # Drop the intercept by name. The former positional slice [1:14] silently
  # truncated to 13 predictors (and then failed on the index assignment), and
  # assumed the constant was always the first parameter.
  coefs = model.params.drop('const', errors='ignore')
  pvals = model.pvalues.drop('const', errors='ignore')
  results = pd.DataFrame({'Coef': coefs, 'p-value': pvals})
  results['vars'] = results.index
  return results.reset_index(drop=True)




# Round 1: Random Forest + Lasso + 5 vars

## Define parameters

In [10]:
# params for splitting: every seed is combined with every test fraction
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=5

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [11]:
# Round 1 simulation: random forest -> lasso -> OLS test, repeated over every
# (random seed, test fraction) split of dat1.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split: top raw features, then top terms among
    # those features, their pairwise products and their squares.
    rf_features=_fs_randomforest(X1_train_exc,Y1_train,params,cv,n_iter,n_vars,123456)
    X1_train_exc_poly=_high_level_vars(X1_train_exc[rf_features])
    ls_predictors=_fs_lasso(X1_train_exc_poly,Y1_train,cvparams,ls_alphas,n_vars)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc[rf_features])
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-lasso-5-sim1221-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each

# Round 2: Random Forest + Elastic Net + 5 vars

## Define parameters

In [8]:
# params for splitting: every seed is combined with every test fraction
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=5

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing



In [9]:
# Round 2 simulation: random forest -> Elastic Net -> OLS test, repeated over
# every (random seed, test fraction) split of dat1.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split: top raw features, then top terms among
    # those features, their pairwise products and their squares.
    rf_features=_fs_randomforest(X1_train_exc,Y1_train,params,cv,n_iter,n_vars,123456)
    X1_train_exc_poly=_high_level_vars(X1_train_exc[rf_features])
    ls_predictors=_fs_ElasticNet(X1_train_exc_poly,Y1_train,cvparams,en_ratios,en_alphas,n_vars)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc[rf_features])
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-en-5-sim1221-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each

# Round 3: Gradient Boosting + Lasso + 5 vars

## Define parameters

In [4]:
# Split grid: each seed below is paired with each test fraction.
randomstate_list = [1, 10, 100, 200, 300]
testsize = [0.5, 0.4, 0.3, 0.2]

# How many features each selection stage keeps.
n_vars = 5

# Search space handed to the randomized search over gradient-boosting models.
params = {
    'n_estimators': list(range(10, 300, 10)),
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'learning_rate': list(np.arange(0.1, 1, 0.1)),
    'criterion': ['friedman_mse', 'squared_error'],
}
cv = 10
n_iter = 200

# Keyword arguments for the repeated k-fold splitter.
cvparams = dict(n_splits=10, n_repeats=3, random_state=123456)

# Candidate alphas for the lasso.
ls_alphas = np.arange(0, 2, 0.1)

# Candidate l1 ratios and alphas for the Elastic Net.
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [5]:
# Round 3 simulation: gradient boosting -> lasso -> OLS test, repeated over
# every (random seed, test fraction) split of dat1.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split. The name rf_features is kept from the
    # random-forest rounds; here it holds the gradient-boosting selection.
    rf_features=_fs_gbr(X1_train_exc,Y1_train,params,cv,n_iter,n_vars,123456)
    X1_train_exc_poly=_high_level_vars(X1_train_exc[rf_features])
    ls_predictors=_fs_lasso(X1_train_exc_poly,Y1_train,cvparams,ls_alphas,n_vars)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc[rf_features])
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-gbr-lasso-5-sim1221-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)

# Round 4: Random Forest + Lasso + 10 vars

## Define parameters

In [16]:
# params for splitting: every seed is combined with every test fraction
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=10

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [17]:
# Round 4 simulation: random forest -> lasso -> OLS test (n_vars as set in
# the parameter cell), repeated over every (seed, test fraction) split of dat1.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split: top raw features, then top terms among
    # those features, their pairwise products and their squares.
    rf_features=_fs_randomforest(X1_train_exc,Y1_train,params,cv,n_iter,n_vars,123456)
    X1_train_exc_poly=_high_level_vars(X1_train_exc[rf_features])
    ls_predictors=_fs_lasso(X1_train_exc_poly,Y1_train,cvparams,ls_alphas,n_vars)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc[rf_features])
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-lasso-10-sim1223-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Fitting 10 folds for each

# Round 5: Random Forest + Lasso + 5 vars + X2 as test

## Define parameters

In [None]:
# params for splitting (the cell that follows trains on dat1 and tests on
# dat2 directly, without using these two lists)
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=5

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [18]:
# Round 5: induce on the whole of dat1, test on the second data set dat2.
train, test = dat1, dat2

X1_train_exc, Y1_train = train[features_exc], train['performance']
X1_test_exc, Y1_test = test[features_exc], test['performance']

# Induction on dat1: random-forest ranking, then lasso over the selected
# features together with their pairwise products and squares.
rf_features = _fs_randomforest(X1_train_exc, Y1_train, params, cv, n_iter, n_vars, 123456)
X1_train_exc_poly = _high_level_vars(X1_train_exc[rf_features])
ls_predictors = _fs_lasso(X1_train_exc_poly, Y1_train, cvparams, ls_alphas, n_vars)

# Testing on dat2: OLS on the selected terms.
X1_test_exc_poly = _high_level_vars(X1_test_exc[rf_features])
results = _test_hypothesis(X1_test_exc_poly, Y1_test, ls_predictors)

path = "/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-lasso-5-Dat2AsTest.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


# Round 6: Random Forest + Lasso + 10 vars + X2 as test

## Define parameters

In [19]:
# params for splitting (the cell that follows trains on dat1 and tests on
# dat2 directly, without using these two lists)
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=10

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [20]:
# Round 6: induce on the whole of dat1, test on the second data set dat2
# (n_vars as set in the parameter cell).
train, test = dat1, dat2

X1_train_exc, Y1_train = train[features_exc], train['performance']
X1_test_exc, Y1_test = test[features_exc], test['performance']

# Induction on dat1: random-forest ranking, then lasso over the selected
# features together with their pairwise products and squares.
rf_features = _fs_randomforest(X1_train_exc, Y1_train, params, cv, n_iter, n_vars, 123456)
X1_train_exc_poly = _high_level_vars(X1_train_exc[rf_features])
ls_predictors = _fs_lasso(X1_train_exc_poly, Y1_train, cvparams, ls_alphas, n_vars)

# Testing on dat2: OLS on the selected terms.
X1_test_exc_poly = _high_level_vars(X1_test_exc[rf_features])
results = _test_hypothesis(X1_test_exc_poly, Y1_test, ls_predictors)

path = "/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-lasso-10-Dat2AsTest.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


# Round 7: Random Forest + Elastic Net + 5 vars + X2 as test

## Define parameters

In [21]:
# params for splitting (the cell that follows trains on dat1 and tests on
# dat2 directly, without using these two lists)
randomstate_list=[1,10,100,200,300]
testsize=[0.5,0.4,0.3,0.2]

# number of features kept by each selection stage
n_vars=5

# params for random forest (search space of the randomized search)
# max_features: 1.0 means "all features". It replaces the legacy 'auto'
# alias, which meant the same for regressors but is rejected by
# scikit-learn >= 1.3, making every candidate that sampled it fail to fit.
params = {'n_estimators': [int(x) for x in np.arange(10,300,10)],
               'max_features': [1.0, 'sqrt'],
               'max_depth': [3,5,10],
               'min_samples_split': [2, 4, 6, 8, 10],
               'min_samples_leaf': [1,2,4,6,8,10],
               'bootstrap': [True, False]}
cv=10       # folds used by the randomized search
n_iter=200  # parameter settings sampled by the randomized search

# params for cv (keyword arguments of RepeatedKFold)
cvparams={'n_splits':10,
          'n_repeats': 3,
          'random_state': 123456}

# params for lasso:
ls_alphas=np.arange(0, 2, 0.1)

# params for Elastic Net:
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [22]:
# Round 7: induce on the whole of dat1, test on the second data set dat2,
# with Elastic Net as the second selection stage.
train, test = dat1, dat2

X1_train_exc, Y1_train = train[features_exc], train['performance']
X1_test_exc, Y1_test = test[features_exc], test['performance']

# Induction on dat1: random-forest ranking, then Elastic Net over the selected
# features together with their pairwise products and squares.
rf_features = _fs_randomforest(X1_train_exc, Y1_train, params, cv, n_iter, n_vars, 123456)
X1_train_exc_poly = _high_level_vars(X1_train_exc[rf_features])
ls_predictors = _fs_ElasticNet(X1_train_exc_poly, Y1_train, cvparams, en_ratios, en_alphas, n_vars)

# Testing on dat2: OLS on the selected terms.
X1_test_exc_poly = _high_level_vars(X1_test_exc[rf_features])
results = _test_hypothesis(X1_test_exc_poly, Y1_test, ls_predictors)

path = "/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-rf-en-5-Dat2AsTest.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results.to_csv(f)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


# Round 8: Gradient Boosting + Lasso + 5 vars + X2 as test

## Define parameters

In [23]:
# Split grid (the cell that follows trains on dat1 and tests on dat2
# directly, without using these two lists).
randomstate_list = [1, 10, 100, 200, 300]
testsize = [0.5, 0.4, 0.3, 0.2]

# How many features each selection stage keeps.
n_vars = 5

# Search space handed to the randomized search over gradient-boosting models.
params = {
    'n_estimators': list(range(10, 300, 10)),
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'learning_rate': list(np.arange(0.1, 1, 0.1)),
    'criterion': ['friedman_mse', 'squared_error'],
}
cv = 10
n_iter = 200

# Keyword arguments for the repeated k-fold splitter.
cvparams = dict(n_splits=10, n_repeats=3, random_state=123456)

# Candidate alphas for the lasso.
ls_alphas = np.arange(0, 2, 0.1)

# Candidate l1 ratios and alphas for the Elastic Net.
en_ratios = np.arange(0, 1, 0.01)
en_alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

## Induction and Testing

In [24]:
# Round 8: induce on the whole of dat1, test on the second data set dat2,
# with gradient boosting as the first selection stage.
train, test = dat1, dat2

X1_train_exc, Y1_train = train[features_exc], train['performance']
X1_test_exc, Y1_test = test[features_exc], test['performance']

# Induction on dat1. The name rf_features is kept from the random-forest
# rounds; here it holds the gradient-boosting selection.
rf_features = _fs_gbr(X1_train_exc, Y1_train, params, cv, n_iter, n_vars, 123456)
X1_train_exc_poly = _high_level_vars(X1_train_exc[rf_features])
ls_predictors = _fs_lasso(X1_train_exc_poly, Y1_train, cvparams, ls_alphas, n_vars)

# Testing on dat2: OLS on the selected terms.
X1_test_exc_poly = _high_level_vars(X1_test_exc[rf_features])
results = _test_hypothesis(X1_test_exc_poly, Y1_test, ls_predictors)

path = "/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-gbr-lasso-5-Dat2AsTest.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results.to_csv(f)

# Side Rounds: Select poly-vars directly

## Define parameters

In [11]:
# Split grid: each seed below is paired with each test fraction.
randomstate_list = [1, 10, 100, 200, 300]
testsize = [0.5, 0.4, 0.3, 0.2]

# How many terms the lasso stage keeps.
n_vars = 10

# Keyword arguments for the repeated k-fold splitter.
cvparams = dict(n_splits=10, n_repeats=3, random_state=123456)

# Candidate alphas for the lasso.
ls_alphas = np.arange(0, 2, 0.1)

## Lasso model

In [7]:
# Side round (lasso): select directly among all features in features_exc,
# their pairwise products and squares, over every (seed, test fraction) split.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split (no tree-based pre-selection here).
    X1_train_exc_poly=_high_level_vars(X1_train_exc)
    ls_predictors=_fs_lasso(X1_train_exc_poly,Y1_train,cvparams,ls_alphas,n_vars)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc)
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-direct-lasso-sim1222-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)

## Stepwise model

In [15]:
# Side round (stepwise): select directly among all features in features_exc,
# their pairwise products and squares with AIC-driven stepwise OLS, over
# every (seed, test fraction) split.
result_columns = ['Coef','p-value','vars','randomstate','testsize']
split_results = []  # one frame per split; concatenated once after the loop
for rs in randomstate_list:
  for ts in testsize:
    train,test=train_test_split(dat1,test_size=ts,random_state=rs)

    X1_train_exc=train[features_exc]
    Y1_train=train['performance']
    X1_test_exc=test[features_exc]
    Y1_test=test['performance']

    # Induction on the training split; the stepwise helpers expect an
    # explicit intercept column named 'const'.
    X1_train_exc_poly=_high_level_vars(X1_train_exc)
    X1_train_exc_poly['const']=1
    ls_predictors=_fs_swAIC(X1_train_exc_poly,Y1_train)

    # Testing on the held-out split: OLS on the selected terms.
    X1_test_exc_poly=_high_level_vars(X1_test_exc)
    results=_test_hypothesis(X1_test_exc_poly,Y1_test,ls_predictors)

    results['randomstate']=rs
    results['testsize']=ts
    split_results.append(results)

# Concatenate once: growing a frame with pd.concat inside the loop copies all
# previous rows each time and relied on concatenating with an empty frame.
if split_results:
  results_all = pd.concat(split_results,ignore_index=True)
else:
  results_all = pd.DataFrame(columns=result_columns)

path="/content/gdrive/MyDrive/Colab Notebooks/TeamPersonality-direct-sw-sim1222-1.csv"
with open(path, 'w', encoding = 'utf-8-sig') as f:
  results_all.to_csv(f)