In [53]:
#Import necessary packages 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [54]:
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [55]:
# load the dataset
df = pd.read_csv('df2.csv', index_col=0)

In [56]:
df.drop(axis='columns' , columns=['close', 'close_plus6', 'spx_close', 'spx_plus6', 'return', 'SP_return' ], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232 entries, 10 to 539
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  232 non-null    int64  
 1   Product_type          232 non-null    object 
 2   Ticker                232 non-null    object 
 3   market_cap_cur        232 non-null    float64
 4   shares_out            232 non-null    float64
 5   year_inc              232 non-null    float64
 6   Previous_SBs          232 non-null    int64  
 7   Yearly_Ad_Count       232 non-null    int64  
 8   New                   232 non-null    int64  
 9   overperform           232 non-null    int64  
 10  Ave_inflation_rate    232 non-null    float64
 11  CPI                   232 non-null    float64
 12  USD_per_euro          232 non-null    float64
 13  Annual_change_GDP     232 non-null    float64
 14  VIX                   232 non-null    float64
 15  change_in_businesses  

In [57]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [58]:
# Split the data
X = df.drop('overperform',axis=1)
y = df['overperform']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=714)

In [59]:
# pick transformers
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [60]:
# split columns into numeric and categorical
numeric_features = ['Year', 'market_cap_cur', 'shares_out', 'year_inc', 'Previous_SBs', 'Yearly_Ad_Count','New', 'Ave_inflation_rate', 'CPI','USD_per_euro', 'Annual_change_GDP','VIX','change_in_businesses']
categorical_features = ['Product_type', 'Ticker']
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

### Logistic Regression

In [61]:
C_params = [0.001,0.01,0.1,1,10,100,1000, 1000000]

In [62]:
# loop through C_params
j = 0

table = pd.DataFrame(columns = ['C','Recall','Precision','ROC-AOC'])
table['C'] = C_params
for C in C_params:
    
    # Apply logistic regression model to training data
    pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',LogisticRegression(C=C,random_state=42))
           ])
    lr_model = pipeline.fit(X_train, y_train) 
    predictions = lr_model.predict(X_test)
    probs = pipeline.predict_proba(X_test)[:, 1]
    # Predict using model
    #print('C is ',C)
    #print ('Recall: ',metrics.recall_score(y_test, predictions) )
    #print ('Precision: ',metrics.precision_score(y_test, predictions) )
    print (confusion_matrix(y_test, predictions))
    #print ('AOC: ',metrics.roc_auc_score(y, probs))
    table.iloc[j,1] = metrics.recall_score(y_test, predictions)
    table.iloc[j,2] = metrics.precision_score(y_test, predictions)
    table.iloc[j,3] = metrics.roc_auc_score(y_test, probs)
    j+=1
table



[[27  0]
 [20  0]]
[[24  3]
 [17  3]]
[[22  5]
 [16  4]]
[[22  5]
 [13  7]]
[[22  5]
 [14  6]]
[[22  5]
 [15  5]]
[[23  4]
 [15  5]]
[[23  4]
 [15  5]]


Unnamed: 0,C,Recall,Precision,ROC-AOC
0,0.001,0.0,0.0,0.514815
1,0.01,0.15,0.5,0.507407
2,0.1,0.2,0.444444,0.555556
3,1.0,0.35,0.583333,0.583333
4,10.0,0.3,0.545455,0.596296
5,100.0,0.25,0.5,0.605556
6,1000.0,0.25,0.555556,0.574074
7,1000000.0,0.25,0.555556,0.588889


Looks like decreasing penalties improves the model. If this ends up near the top this should come out in GridSearch

In [63]:
cv_scores_test= cross_val_score(lr_model,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(lr_model,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_lr_test= cv_scores_test.mean()
cv_scores_lr_train= cv_scores_train.mean()
cv_scores_std_test_lr= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lr_test))
print ('Mean cross validation train score: ' +str(cv_scores_lr_train))
print ('Standard deviation in cv test scores: ' +str(cv_scores_std_test_lr))

[0.34285714 0.65714286 0.51428571 0.53333333]
Mean cross validation test score: 0.5119047619047619
Mean cross validation train score: 0.6091575091575092
Standard deviation in cv test scores: 0.11195540900484524


## KNN

In [64]:
# Reset the pipeline and try KNN
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', KNeighborsClassifier(p=2,weights='distance',n_neighbors=20))
           ])

knn = pipeline.fit(X_train,y_train)

# Predict using model:

y_predict_knn = pipeline.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_knn)
print(cnf_matrix)

[[20  7]
 [13  7]]


In [65]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_knn_test= cv_scores_test.mean()
cv_scores_knn_train= cv_scores_train.mean()
cv_scores_std_knn= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_knn_test))
print ('Mean cross validation train score: ' +str(cv_scores_knn_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_knn))

[0.62857143 0.68571429 0.28571429 0.83333333]
Mean cross validation test score: 0.6083333333333334
Mean cross validation train score: 0.5724130036630037
Standard deviation in cv scores: 0.20068971210357964


### Something seems strange here with an outlier model

## SVM

In [66]:
from sklearn.svm import SVC
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', SVC(kernel='linear'))
           ])
svm = pipeline.fit(X_train, y_train)

# Predict using model:

y_predict_svm = pipeline.predict(X_test)
#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_svm)
print(cnf_matrix)


[[22  5]
 [14  6]]


In [67]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=5,scoring='roc_auc')
print(cv_scores_test)
cv_scores_svm_test= cv_scores_test.mean()
cv_scores_svm_train= cv_scores_train.mean()
cv_scores_std_svm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_svm_test))
print ('Mean cross validation train score: ' +str(cv_scores_svm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_svm))

[0.75       0.41666667 0.55       0.6        0.8       ]
Mean cross validation test score: 0.6233333333333334
Mean cross validation train score: 0.5753991596638655
Standard deviation in cv scores: 0.1384838538522733


## Random Forest

In [68]:
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', RandomForestClassifier(bootstrap=True,n_estimators=40,criterion='entropy', max_depth=9))
           ])
rf = pipeline.fit(X_train, y_train)


#Predict using the model:

y_predict_rf = rf.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)


[[25  2]
 [14  6]]
0.6595744680851063


In [69]:
rf['regressor'].feature_importances_

array([4.40769557e-02, 7.20657619e-02, 8.16812359e-02, 5.24665658e-02,
       5.86701134e-02, 2.79368861e-02, 8.34515035e-03, 3.60237740e-02,
       3.38157294e-02, 4.83520412e-02, 7.01659275e-02, 5.85266516e-02,
       6.20974621e-02, 4.57465415e-03, 9.80419177e-03, 3.66531451e-03,
       8.34124819e-03, 1.41771743e-02, 2.46874339e-03, 5.88308575e-03,
       1.21825930e-02, 1.04482949e-02, 8.42231800e-03, 4.34392577e-03,
       2.08919676e-02, 7.21588483e-03, 1.70645512e-03, 2.35661406e-03,
       5.83999172e-03, 7.58373507e-04, 7.35882496e-03, 1.34656993e-03,
       9.01841027e-03, 5.43868453e-03, 2.45321088e-03, 7.34483193e-04,
       9.81249970e-03, 0.00000000e+00, 1.20263334e-03, 0.00000000e+00,
       4.99074441e-03, 2.26787034e-03, 3.87257589e-03, 1.31843136e-03,
       9.78060459e-03, 6.72468394e-03, 1.86556174e-03, 1.71758832e-03,
       1.45905424e-03, 2.92053141e-03, 2.07248388e-03, 0.00000000e+00,
       9.56167830e-04, 2.16202172e-03, 7.37135990e-04, 1.77250808e-03,
      

In [70]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.55714286 0.58571429 0.48571429 0.6       ]
Mean cross validation test score: 0.5571428571428572
Mean cross validation train score: 0.5796245421245421
Standard deviation in cv scores: 0.04403152859263555


## Gradient Boosting

In [71]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',  GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=140, random_state=5, max_depth=9, max_leaf_nodes=100))
           ])
gbc = pipeline.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)

[[20  7]
 [12  8]]
0.5957446808510638


In [72]:
cv_scores_test= cross_val_score(gbc,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(gbc,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_gbc_test= cv_scores_test.mean()
cv_scores_gbc_train= cv_scores_train.mean()
cv_scores_std_gbc= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.37142857 0.6        0.68571429 0.7       ]
Mean cross validation test score: 0.5571428571428572
Mean cross validation train score: 0.5796245421245421
Standard deviation in cv scores: 0.04403152859263555
