In [1]:
#Import necessary packages 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# load the dataset
df = pd.read_csv('df2.csv', index_col=0)

In [4]:
df.drop(axis='columns' , columns=['close', 'close_plus6', 'spx_close', 'spx_plus6', 'return', 'SP_return' ], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232 entries, 10 to 555
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             232 non-null    int64  
 1   Product_type     232 non-null    object 
 2   Ticker           232 non-null    object 
 3   market_cap_cur   232 non-null    float64
 4   shares_out       232 non-null    float64
 5   year_inc         232 non-null    float64
 6   Previous_SBs     232 non-null    int64  
 7   Yearly_Ad_Count  232 non-null    int64  
 8   New              232 non-null    int64  
 9   overperform      232 non-null    int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 19.9+ KB


In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Split the data
X = df.drop('overperform',axis=1)
y = df['overperform']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=714)

In [7]:
# pick transformers
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [8]:
# split columns into numeric and categorical
numeric_features = ['Year', 'market_cap_cur', 'shares_out', 'year_inc', 'Previous_SBs', 'Yearly_Ad_Count','New']
categorical_features = ['Product_type', 'Ticker']
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

### Logistic Regression

In [9]:
C_params = [0.001,0.01,0.1,1,10,100,1000, 1000000]

In [10]:
# loop through C_params
j = 0

table = pd.DataFrame(columns = ['C','Recall','Precision','ROC-AOC'])
table['C'] = C_params
for C in C_params:
    
    # Apply logistic regression model to training data
    pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',LogisticRegression(C=C,random_state=42))
           ])
    lr_model = pipeline.fit(X_train, y_train) 
    predictions = lr_model.predict(X_test)
    probs = pipeline.predict_proba(X_test)[:, 1]
    # Predict using model
    #print('C is ',C)
    #print ('Recall: ',metrics.recall_score(y_test, predictions) )
    #print ('Precision: ',metrics.precision_score(y_test, predictions) )
    print (confusion_matrix(y_test, predictions))
    #print ('AOC: ',metrics.roc_auc_score(y, probs))
    table.iloc[j,1] = metrics.recall_score(y_test, predictions)
    table.iloc[j,2] = metrics.precision_score(y_test, predictions)
    table.iloc[j,3] = metrics.roc_auc_score(y_test, probs)
    j+=1
table



[[27  0]
 [20  0]]
[[27  0]
 [20  0]]
[[25  2]
 [17  3]]
[[23  4]
 [15  5]]
[[21  6]
 [14  6]]
[[22  5]
 [15  5]]
[[23  4]
 [15  5]]
[[22  5]
 [15  5]]


Unnamed: 0,C,Recall,Precision,ROC-AOC
0,0.001,0.0,0.0,0.477778
1,0.01,0.0,0.0,0.483333
2,0.1,0.15,0.6,0.511111
3,1.0,0.25,0.555556,0.503704
4,10.0,0.3,0.5,0.546296
5,100.0,0.25,0.5,0.57963
6,1000.0,0.25,0.555556,0.562963
7,1000000.0,0.25,0.5,0.55


Looks like decreasing penalties improves the model. If this ends up near the top this should come out in GridSearch

In [11]:
cv_scores_test= cross_val_score(lr_model,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(lr_model,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_lr_test= cv_scores_test.mean()
cv_scores_lr_train= cv_scores_train.mean()
cv_scores_std_test_lr= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lr_test))
print ('Mean cross validation train score: ' +str(cv_scores_lr_train))
print ('Standard deviation in cv test scores: ' +str(cv_scores_std_test_lr))

[0.31428571 0.71428571 0.71428571 0.6       ]
Mean cross validation test score: 0.5857142857142857
Mean cross validation train score: 0.5995421245421246
Standard deviation in cv test scores: 0.1635074734608514


## KNN

In [12]:
# Reset the pipeline and try KNN
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', KNeighborsClassifier(p=2,weights='distance',n_neighbors=20))
           ])

knn = pipeline.fit(X_train,y_train)

# Predict using model:

y_predict_knn = pipeline.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_knn)
print(cnf_matrix)

[[21  6]
 [14  6]]


In [13]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_knn_test= cv_scores_test.mean()
cv_scores_knn_train= cv_scores_train.mean()
cv_scores_std_knn= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_knn_test))
print ('Mean cross validation train score: ' +str(cv_scores_knn_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_knn))

[0.6        0.65714286 0.42857143 0.73333333]
Mean cross validation test score: 0.6047619047619048
Mean cross validation train score: 0.5429945054945055
Standard deviation in cv scores: 0.11218303799418806


## SVM

In [14]:
from sklearn.svm import SVC
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', SVC(kernel='linear'))
           ])
svm = pipeline.fit(X_train, y_train)

# Predict using model:

y_predict_svm = pipeline.predict(X_test)
#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_svm)
print(cnf_matrix)


[[22  5]
 [15  5]]


In [15]:
cv_scores_test= cross_val_score(pipeline,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(pipeline,X_train,y_train,cv=5,scoring='roc_auc')
print(cv_scores_test)
cv_scores_svm_test= cv_scores_test.mean()
cv_scores_svm_train= cv_scores_train.mean()
cv_scores_std_svm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_svm_test))
print ('Mean cross validation train score: ' +str(cv_scores_svm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_svm))

[0.5        0.54166667 0.7        0.65       0.7       ]
Mean cross validation test score: 0.6183333333333334
Mean cross validation train score: 0.55390756302521
Standard deviation in cv scores: 0.08273115763993906


## Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', RandomForestClassifier(bootstrap=True,n_estimators=40,criterion='entropy', max_depth=9))
           ])
rf = pipeline.fit(X_train, y_train)


#Predict using the model:

y_predict_rf = rf.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)
rf.feature_importances_

[[23  4]
 [15  5]]
0.5957446808510638


AttributeError: 'Pipeline' object has no attribute 'feature_importances_'

In [17]:
rf['regressor'].feature_importances_

array([0.14205967, 0.07265566, 0.07562182, 0.08877706, 0.12324642,
       0.05669285, 0.01876243, 0.00161648, 0.01668869, 0.00398828,
       0.00696744, 0.01166974, 0.003659  , 0.00999613, 0.00542137,
       0.01232875, 0.01000341, 0.0051017 , 0.02206885, 0.00726788,
       0.00377965, 0.0006209 , 0.00730195, 0.00585833, 0.00302131,
       0.0009364 , 0.00978548, 0.00937591, 0.00078968, 0.00122133,
       0.01173282, 0.00124388, 0.00254359, 0.00078631, 0.00712139,
       0.00209036, 0.00299136, 0.00572115, 0.01186097, 0.00657231,
       0.00258148, 0.00618449, 0.00066295, 0.00294066, 0.00240249,
       0.00241183, 0.00096666, 0.00160355, 0.00718144, 0.00326131,
       0.00237553, 0.01454451, 0.0008354 , 0.0034755 , 0.00564567,
       0.00156644, 0.00214148, 0.00341168, 0.00544606, 0.0032216 ,
       0.00272622, 0.00416578, 0.00296942, 0.00209012, 0.00180607,
       0.00285006, 0.00635923, 0.00079783, 0.        , 0.00582849,
       0.0113266 , 0.0007686 , 0.00075209, 0.00143029, 0.     

In [None]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=4,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=4,scoring='roc_auc')
print(cv_scores_test)
cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

## Gradient Boosting

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',  GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=140, random_state=5, max_depth=9, max_leaf_nodes=100))
           ])
gbc = pipeline.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)