In [54]:
#Import necessary packages 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [55]:
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [56]:
# load the dataset
df = pd.read_csv('df2.csv', index_col=0)

In [57]:
df.drop(axis='columns' , columns=['close', 'close_plus6', 'spx_close', 'spx_plus6', 'return', 'SP_return' ], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232 entries, 10 to 539
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  232 non-null    int64  
 1   Product_type          232 non-null    object 
 2   Ticker                232 non-null    object 
 3   market_cap_cur        232 non-null    float64
 4   shares_out            232 non-null    float64
 5   year_inc              232 non-null    float64
 6   Previous_SBs          232 non-null    int64  
 7   Yearly_Ad_Count       232 non-null    int64  
 8   New                   232 non-null    int64  
 9   overperform           232 non-null    int64  
 10  Ave_inflation_rate    232 non-null    float64
 11  CPI                   232 non-null    float64
 12  USD_per_euro          232 non-null    float64
 13  Annual_change_GDP     232 non-null    float64
 14  VIX                   232 non-null    float64
 15  change_in_businesses  

In [58]:
df.groupby(['Product_type','New'])['overperform'].mean()

Product_type  New
Alcohol       0      0.538462
Car           0      0.282051
Clothing      0      0.333333
Film          0      0.416667
Food          0      0.560000
              1      0.625000
Gaming        0      1.000000
Goods         0      0.437500
              1      0.000000
Service       0      0.555556
              1      0.600000
Soft drink    0      0.600000
              1      1.000000
TV            0      0.714286
Technology    0      0.400000
              1      0.000000
Website       0      0.363636
              1      0.000000
Wireless      0      0.333333
Name: overperform, dtype: float64

In [59]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [60]:
# Split the data
X = df.drop('overperform',axis=1)
y = df['overperform']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=714)

In [61]:
# pick transformers
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [62]:
# split columns into numeric and categorical
numeric_features = ['Year', 'market_cap_cur', 'shares_out', 'year_inc', 'Previous_SBs', 'Yearly_Ad_Count','New', 'Ave_inflation_rate', 'CPI','USD_per_euro', 'Annual_change_GDP','VIX','change_in_businesses']
categorical_features = ['Product_type', 'Ticker']
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

### Logistic Regression

In [63]:
C_params = [0.001,0.01,0.1,1,10,100,1000, 1000000]

In [64]:
# loop through C_params
j = 0

table = pd.DataFrame(columns = ['C','Recall','Precision','ROC-AOC'])
table['C'] = C_params
for C in C_params:
    
    # Apply logistic regression model to training data
    pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',LogisticRegression(C=C,random_state=42))
           ])
    lr_model = pipeline.fit(X_train, y_train) 
    predictions = lr_model.predict(X_test)
    probs = pipeline.predict_proba(X_test)[:, 1]
    # Predict using model
    #print('C is ',C)
    #print ('Recall: ',metrics.recall_score(y_test, predictions) )
    #print ('Precision: ',metrics.precision_score(y_test, predictions) )
    print (confusion_matrix(y_test, predictions))
    #print ('AOC: ',metrics.roc_auc_score(y, probs))
    table.iloc[j,1] = metrics.recall_score(y_test, predictions)
    table.iloc[j,2] = metrics.precision_score(y_test, predictions)
    table.iloc[j,3] = metrics.roc_auc_score(y_test, probs)
    j+=1
table



[[27  0]
 [20  0]]
[[24  3]
 [17  3]]
[[22  5]
 [16  4]]
[[22  5]
 [13  7]]
[[22  5]
 [14  6]]
[[22  5]
 [15  5]]
[[23  4]
 [15  5]]
[[23  4]
 [15  5]]


Unnamed: 0,C,Recall,Precision,ROC-AOC
0,0.001,0.0,0.0,0.514815
1,0.01,0.15,0.5,0.507407
2,0.1,0.2,0.444444,0.555556
3,1.0,0.35,0.583333,0.583333
4,10.0,0.3,0.545455,0.596296
5,100.0,0.25,0.5,0.605556
6,1000.0,0.25,0.555556,0.574074
7,1000000.0,0.25,0.555556,0.588889


Looks like decreasing penalties improves the model. If this ends up near the top this should come out in GridSearch

In [84]:
cv_scores_test= cross_val_score(lr_model,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(lr_model,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_lr_test= cv_scores_test.mean()
cv_scores_lr_train= cv_scores_train.mean()
cv_scores_std_lr= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lr_test))
print ('Mean cross validation train score: ' +str(cv_scores_lr_train))
print ('Standard deviation in cv test scores: ' +str(cv_scores_std_test_lr))

[0.34285714 0.65714286 0.51428571 0.53333333]
Mean cross validation test score: 0.5119047619047619
Mean cross validation train score: 0.6091575091575092
Standard deviation in cv test scores: 0.11195540900484524


Not really looking that good with the large difference between training and test

## KNN

In [66]:
# Reset the pipeline and try KNN
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', KNeighborsClassifier(p=2,weights='distance',n_neighbors=6))
           ])

knn = pipeline.fit(X_train,y_train)

# Predict using model:

y_predict_knn = pipeline.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_knn)
print(cnf_matrix)

[[22  5]
 [12  8]]


In [67]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_knn_test= cv_scores_test.mean()
cv_scores_knn_train= cv_scores_train.mean()
cv_scores_std_knn= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_knn_test))
print ('Mean cross validation train score: ' +str(cv_scores_knn_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_knn))

[0.77142857 0.62857143 0.45714286 0.83333333]
Mean cross validation test score: 0.6726190476190476
Mean cross validation train score: 0.5104510073260073
Standard deviation in cv scores: 0.14488149039302228


### Something seems strange here with an outlier model

Good choice to gridsearch?

## SVM

In [68]:
from sklearn.svm import SVC
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', SVC(kernel='linear'))
           ])
svm = pipeline.fit(X_train, y_train)

# Predict using model:

y_predict_svm = pipeline.predict(X_test)
#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_svm)
print(cnf_matrix)


[[22  5]
 [14  6]]


In [69]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=5,scoring='roc_auc')
print(cv_scores_test)
cv_scores_svm_test= cv_scores_test.mean()
cv_scores_svm_train= cv_scores_train.mean()
cv_scores_std_svm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_svm_test))
print ('Mean cross validation train score: ' +str(cv_scores_svm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_svm))

[0.75       0.41666667 0.55       0.6        0.8       ]
Mean cross validation test score: 0.6233333333333334
Mean cross validation train score: 0.5753991596638655
Standard deviation in cv scores: 0.1384838538522733


Again there seem to be cuts that perform badly

## Random Forest

In [70]:
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', RandomForestClassifier(bootstrap=True,n_estimators=40,criterion='entropy', max_depth=4))
           ])
rf = pipeline.fit(X_train, y_train)


#Predict using the model:

y_predict_rf = rf.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)


[[24  3]
 [18  2]]
0.5531914893617021


In [71]:
rf['regressor'].feature_importances_

array([2.85097641e-02, 2.73376464e-02, 6.09375813e-02, 5.26322237e-02,
       6.09329372e-02, 2.17861245e-02, 4.73590252e-03, 4.28723034e-02,
       2.87664590e-02, 3.85025830e-02, 4.96523840e-02, 4.58802446e-02,
       7.47038437e-02, 6.17166048e-03, 2.29966915e-02, 0.00000000e+00,
       1.25270855e-02, 1.50488805e-02, 0.00000000e+00, 1.80444004e-03,
       1.74513136e-02, 1.72840155e-02, 7.45580979e-03, 1.79881786e-02,
       1.80695228e-02, 1.47898935e-02, 2.06023112e-03, 0.00000000e+00,
       2.05655351e-02, 0.00000000e+00, 2.98045502e-03, 0.00000000e+00,
       1.98971472e-02, 5.99168705e-03, 0.00000000e+00, 0.00000000e+00,
       1.01133587e-02, 1.85322751e-03, 0.00000000e+00, 0.00000000e+00,
       6.34018094e-04, 1.23192819e-02, 1.70303801e-03, 3.62160402e-03,
       5.91061066e-03, 1.45479689e-03, 6.50197915e-03, 1.56997831e-02,
       1.26324860e-02, 0.00000000e+00, 1.15213451e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.06111446e-03,
      

In [72]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.6        0.51428571 0.54285714 0.73333333]
Mean cross validation test score: 0.5976190476190477
Mean cross validation train score: 0.5707188644688646
Standard deviation in cv scores: 0.0842130437325114


RF seems far more consistent but performs generally worse. Average with it maybe?

## Gradient Boosting

In [73]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',  GradientBoostingClassifier(subsample=0.8, learning_rate=0.01 , n_estimators=100, random_state=5, max_depth=4, max_leaf_nodes=30))
           ])
gbc = pipeline.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)

[[22  5]
 [12  8]]
0.6382978723404256


In [74]:
cv_scores_test= cross_val_score(gbc,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(gbc,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_gbc_test= cv_scores_test.mean()
cv_scores_gbc_train= cv_scores_train.mean()
cv_scores_std_gbc= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_gbc_test))
print ('Mean cross validation train score: ' +str(cv_scores_gbc_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_gbc))

[0.4        0.6        0.65714286 0.63333333]
Mean cross validation test score: 0.5726190476190476
Mean cross validation train score: 0.582257326007326
Standard deviation in cv scores: 0.1017073633278439


## Compare Models

In [85]:
myLabels = [ 'Logistic Regression','KNN','SVM','Random Forest','Gradient Boost']
score_test= [  cv_scores_lr_test,cv_scores_knn_test,cv_scores_svm_test,cv_scores_rf_test,cv_scores_gbc_test]
score_train= [  cv_scores_lr_train,cv_scores_knn_train,cv_scores_svm_train,cv_scores_rf_train,cv_scores_gbc_train]
std_test= [  cv_scores_std_lr,cv_scores_std_knn,cv_scores_std_svm,cv_scores_std_rf,cv_scores_std_gbc]

#score_tab_acc = pd.DataFrame(list(zip(myLabels, Accuracy_score)), 
               #columns =['Algorithm', 'Model accuracy score']) 

score_tab = pd.DataFrame(list(zip(myLabels, score_train, score_test, std_test)), 
               columns =['Algorithm', 'ROC-AUC train score', 'ROC-AUC test score',  'ROC-AUC test std']) 
#print(score_tab_acc)

score_tab

Unnamed: 0,Algorithm,ROC-AUC train score,ROC-AUC test score,ROC-AUC test std
0,Logistic Regression,0.609158,0.511905,0.111955
1,KNN,0.510451,0.672619,0.144881
2,SVM,0.575399,0.623333,0.138484
3,Random Forest,0.570719,0.597619,0.084213
4,Gradient Boost,0.582257,0.572619,0.101707


In [82]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(max_features='auto',random_state=1, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10,12,16], "n_estimators": [50, 100,400,700,1000]}

#gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=4, n_jobs=-1)

#gs = gs.fit(X_train, y_train)

grid = GridSearchCV(rf, cv=4, n_jobs=-1, param_grid=param_grid ,scoring='roc_auc')
grid.fit(X_train['text'], y_train['output'])
grid.score(X_test['text'], y_test['output'])
print(grid.best_estimator_) 
print(grid.best_score_)

KeyError: 'text'