# Import Packages and Data set

In [1]:
# EDA - Data Cleaning
import pandas as pd
import numpy as np
import missingno
from collections import Counter
import math

# EDA - Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
%matplotlib inline
import statsmodels.api as stats
from statsmodels.graphics.gofplots import ProbPlot
plt.style.use('seaborn') # pretty matplotlib plots
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
plt.rc('axes', labelsize=15)
plt.rc('axes', titlesize=18)

#Modeling
from sklearn import (datasets,
                     metrics,
                     model_selection as skms,
                     naive_bayes,
                     neighbors)

from sklearn.linear_model import (LogisticRegression,
                                 SGDClassifier)

from sklearn.model_selection import (cross_val_score,
                                     cross_val_predict,
                                     train_test_split,
                                     GridSearchCV)
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             confusion_matrix,
                             f1_score,
                             roc_curve,
                             auc,
                             classification_report,
                             precision_recall_curve)

from sklearn.feature_selection import RFE

from statsmodels.stats.outliers_influence import variance_inflation_factor

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import RidgeClassifierCV 


# Remove warnings
import warnings
warnings.filterwarnings('ignore')

# logistic regression model
import statsmodels.api as sm 

pd.set_option('display.max_rows', 90)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

pd.options.display.float_format = '{:.2f}'.format

# Define Functions

In [2]:
def log_model_run(X_train, X_test, y_train, y_test):
    logistic_regression = LogisticRegression(n_jobs=-1, random_state=15).fit(X_train, y_train)
    model_pred = logistic_regression.predict(X_test)

    acc_logistic_regression = round(logistic_regression.score(X_test, y_test) * 100, 2)
    prec_logistic_regression = round(precision_score(y_pred=model_pred, y_true=y_test)* 100, 2)
    recall_logistic_regression = round(recall_score(y_pred=model_pred, y_true=y_test)* 100, 2)
    f1_logistic_regression = round(f1_score(y_pred=model_pred, y_true=y_test)* 100, 2)
    
    print('Summary of Modeled Results: ')
    print('   General Accuracy: {:6,.1f}%'.format(acc_logistic_regression))
    print('   ROC AUC Score:    {:6,.1f}%'.format(metrics.roc_auc_score(y_test, model_pred)*100))
    print('   Precision Score:  {:6,.1f}%'.format(prec_logistic_regression))
    print('   Recall Score:     {:6,.1f}%'.format(recall_logistic_regression))
    print('   F1 Score:         {:6,.1f}%'.format(f1_logistic_regression)+'\n')

    res_1 = cross_val_score(logistic_regression, X_train, y_train, scoring = 'accuracy', cv = 10)

    # The confusion matrix
    sns.set(font_scale = 1.5)
    logistic_regression_cm = confusion_matrix(y, logistic_regression.predict(x))
    f, ax = plt.subplots(figsize=(5,5))
    sns.heatmap(logistic_regression_cm, 
                annot=True, 
                linewidth=0.7, 
                linecolor='black', 
                fmt='g', 
                ax=ax, 
                cmap="BuPu")
    plt.xlabel('Suit Prediction')
    plt.ylabel('Suit Actual')
    plt.show()
    
    probs = logistic_regression1.predict_proba(X_test)
    f, ax = plt.subplots(figsize=(5, 5))
    # Calculate the fpr and tpr for all thresholds of the classification
    fpr, tpr, threshold = roc_curve(y_test, probs[:,1])
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
    return logistic_regression, acc_logistic_regression, prec_logistic_regression, recall_logistic_regression, f1_logistic_regression
        
        
def show_values(axs, orient="v", space=.01):
    def _single(ax):
        if orient == "v":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height() + (p.get_height()*0.01)
                value = '{:.0f}'.format(p.get_height())
                ax.text(_x, _y, value, ha="center") 
        elif orient == "h":
            for p in ax.patches:
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height() - (p.get_height()*0.5)
                value = '{:.0f}'.format(p.get_width())
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _single(ax)
    else:
        _single(axs)

# Import Dataset

In [9]:
folder_loc = '../01_data/02_modified/'
X_train = pd.read_csv(folder_loc+'X_train.csv', index_col='gvkey')
X_test = pd.read_csv(folder_loc+'X_test.csv', index_col='gvkey')
y_train = pd.read_csv(folder_loc+'y_train.csv', index_col='gvkey')
y_test = pd.read_csv(folder_loc+'y_test.csv', index_col='gvkey')
X_train_smote = pd.read_csv(folder_loc+'X_train_smote.csv', index_col='gvkey')
y_train_smote = pd.read_csv(folder_loc+'y_train_smote.csv', index_col='gvkey')

ValueError: Index gvkey invalid

In [8]:
X_test

Unnamed: 0,gvkey,Vol_at_Variance,Vol_capx_Variance,Vol_cshfd_Variance,Vol_emp_Variance,Vol_epspi_Variance,Vol_ni_Variance,Vol_sale_Variance,Vol_teq_Variance,Vol_wcap_Variance,Vol_roa_Variance,Vol_roe_Variance,StdDev_cshfd,StdDev_emp,StdDev_epspi,StdDev_wcap,StdDev_xido,StdDev_roa,StdDev_roe,at_PercentChange,capx_PercentChange,cogs_PercentChange,dp_PercentChange,emp_PercentChange,epspi_PercentChange,ni_PercentChange,sale_PercentChange,txt_PercentChange,wcap_PercentChange,xido_PercentChange,xint_PercentChange,roe_PercentChange,Foreign_and_Domestic_indicator,GIC_SI_Agricultural_Farm_Machinery,GIC_SI_Construction_Engineering,GIC_SI_Construction_Farm_Machinery_Heavy_Trucks,GIC_SI_Electrical_Components_Equipment,GIC_SI_Heavy_Electrical_Equipment,GIC_SI_Industrial_Conglomerates,GIC_SI_Industrial_Machinary
0,142499,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.72,0.31,0.39,0.38,0.18,0.10,0.10,0.00,-0.12,0.01,-0.03,0.00,0.00,0.00,0.00,0.00,-0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,1
1,12722,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.13,0.31,0.36,0.38,0.19,0.10,0.18,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,1
2,135844,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.13,0.30,0.36,0.35,0.19,0.10,0.18,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.00,0.00,0.00,0.00,0.00,0,0,0,0,0,0,0,0
3,4807,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.12,0.30,0.32,0.34,0.18,0.10,0.17,0.00,0.00,0.00,0.00,0.00,-0.00,-0.00,0.00,-0.00,0.00,-0.00,0.00,-0.00,0,0,0,0,0,0,0,1
4,18799,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.14,0.31,0.39,0.38,0.19,0.10,0.18,0.00,0.00,0.00,0.00,0.00,-0.00,-0.00,0.00,0.00,-0.00,0.00,0.00,0.00,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,161843,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.13,0.28,0.25,0.32,0.19,0.10,0.18,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0,0,1,0,0,0,0
90,237820,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.14,3.00,0.38,0.22,0.19,0.10,0.18,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1,0,1,0,0,0,0,0
91,17420,0.05,0.04,0.01,0.06,0.05,0.03,0.00,0.06,0.07,0.06,0.01,0.13,0.47,0.29,2.35,0.19,0.10,0.17,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1,0,0,0,0,0,0,1
92,11703,0.06,0.04,0.44,0.06,0.05,0.07,0.00,0.05,0.07,0.05,0.33,0.02,0.31,0.38,0.38,0.19,0.08,0.09,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0,0,0,1,0,0,0


In [6]:
print('X_train shape: ', X_train_smote.shape)
print('y_train.shape: ', y_train_smote.shape)
print('X_test.shape: ', X_test.shape)
# Logistic regression model
logm1 = stats.GLM(y_train_smote,(stats.add_constant(X_train_smote)), family = stats.families.Binomial())
print(logm1.fit().summary())

X_train shape:  (664, 40)
y_train.shape:  (664, 2)
X_test.shape:  (94, 40)
                     Generalized Linear Model Regression Results                      
Dep. Variable:     ['Unnamed: 0', 'suitflag']   No. Observations:                  664
Model:                                    GLM   Df Residuals:                      623
Model Family:                        Binomial   Df Model:                           40
Link Function:                          logit   Scale:                          1.0000
Method:                                  IRLS   Log-Likelihood:                -449.87
Date:                        Wed, 03 Nov 2021   Deviance:                       239.38
Time:                                22:06:59   Pearson chi2:                 1.78e+03
No. Iterations:                            14                                         
Covariance Type:                    nonrobust                                         
                                                      c

In [None]:
X_test.columns

In [None]:
cols_to_keep 'Vol_at_Variance', 'Vol_ni_Variance', 'Vol_roa_Variance', 'Vol_roe_Variance', 'StdDev_emp', 'StdDev_xido', 'StdDev_roa', 'StdDev_roe', 
'capx_PercentChange','sale_PercentChange', 'sale_PercentChange', 'wcap_PercentChange', 
'Foreign_and_Domestic_indicator','GIC_SI_Agricultural_Farm_Machinery', 'GIC_SI_Construction_Engineering', 
'GIC_SI_Construction_Farm_Machinery_Heavy_Trucks', 'GIC_SI_Electrical_Components_Equipment', 'GIC_SI_Industrial_Conglomerates', 'GIC_SI_Industrial_Machinary'

## Model Run Results - without SMOTE

In [1]:
logistic_regression1, acc_logistic_regression1, prec_logistic_regression1, recall_logistic_regression1, f1_logistic_regression1 = log_model_run(X_train, X_test, y_train, y_test)

NameError: name 'log_model_run' is not defined

In [None]:
logistic_regression1, acc_logistic_regression1, prec_logistic_regression1, recall_logistic_regression1, f1_logistic_regression1 = log_model_run(X_train_sm, y_train_sm)

In [None]:
def visualize_features(model, feature_names):
    feature_imp = pd.Series(model.coef_[0],index=feature_names).sort_values(ascending=False)
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Coefficients')
    plt.ylabel('Features')
    plt.title('Visualizing Important Features')
    plt.show()
    print(feature_imp)

In [None]:
visualize_features(logistic_regression1, x.columns)

In [None]:
logistic_regression2, acc_logistic_regression2, prec_logistic_regression2, recall_logistic_regression2, f1_logistic_regression2 = log_model_run(X_train_sm, y_train_sm)

## Confusion Matrix

<b>Review: </b>There are a total of  variables that are significant. Below are the variables to select when re-running the model:

    'Vol_capx_Variance', 'Vol_teq_Variance', 'Vol_roa_Variance', 'StdDev_emp', 'StdDev_epspi', 'StdDev_wcap', 'StdDev_xido', 'StdDev_roe', 
    'capx_PercentChange', 'epspi_PercentChange', 'sale_PercentChange', 'sale_PercentChange', 'txt_PercentChange', 'wcap_PercentChange', 
    'xido_PercentChange', 'roe_PercentChange', 'GIC_SI_Agricultural_Farm_Machinery', 'GIC_SI_Construction_Engineering', 
    'GIC_SI_Construction_Farm_Machinery_Heavy_Trucks', 'GIC_SI_Electrical_Components_Equipment', 'GIC_SI_Heavy_Electrical_Equipment',
    'GIC_SI_Industrial_Machinary


# Logistic Regression - Second Run

In [None]:
sig_variables = ['Vol_capx_Variance', 'Vol_teq_Variance', 'Vol_roa_Variance', 'StdDev_emp', 'StdDev_epspi', 'StdDev_wcap', 'StdDev_xido', 'StdDev_roe', 
                 'capx_PercentChange', 'epspi_PercentChange', 'sale_PercentChange', 'sale_PercentChange', 'txt_PercentChange', 'wcap_PercentChange', 
                 'xido_PercentChange', 'roe_PercentChange', 'GIC_SI_Agricultural_Farm_Machinery', 'GIC_SI_Construction_Engineering', 
                 'GIC_SI_Construction_Farm_Machinery_Heavy_Trucks', 'GIC_SI_Electrical_Components_Equipment', 'GIC_SI_Heavy_Electrical_Equipment', 'GIC_SI_Industrial_Machinary', 
                 'suitflag']
df3 = df2[sig_variables]

## Review Data Correlations

In [None]:
corr_matrix = df3.corr()
mask = np.zeros_like(corr_matrix, dtype=np.bool)
mask[np.triu_indices_from(mask)]= True

f, ax = plt.subplots(figsize=(20, 20))

sns.set(font_scale = 1.3)

heatmap = sns.heatmap(corr_matrix,
                      mask = mask,
                      square = True,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .4,
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1,
                      vmax = 1,
                      annot = True)


#add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns)

sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

In [None]:
# Output any correlated values over 0.65 - isolating high-correlated variables to remove from the analysis
def high_corr_and_check(X):
    corr_matrix = X.corr().abs()
    sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), 
                                     k=1).astype(np.bool))
                      .stack()
                      .sort_values(ascending=False))
    for index, value in sol.items():
        if value > 0.65:
            print(index,value)
            
high_corr_and_check(df3)

<b>Action: </b>There are no highly correlated variables I believe would help the analysis to remove.

In [None]:
df3.info()

## Split the Dataset

In [None]:
x = df3.drop(columns='suitflag')

y = df3[['suitflag']] 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y,
                                                   test_size=0.45,
                                                   random_state=16,
                                                   stratify=y)

print('X_train shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)

## Model

In [None]:
# Logistic regression model
logm2 = sm.GLM(y_train,(sm.add_constant(X_train)), 
               family = sm.families.Binomial())
print(logm2.fit().summary())

In [None]:
logistic_regression2 = LogisticRegression(n_jobs=-1, random_state=15)
logistic_regression2.fit(X_train, y_train)

acc_logistic_regression2 = round(logistic_regression2.score(X_test, y_test) * 100, 2)
prec_logistic_regression2 = round(precision_score(y_pred=logistic_regression2.predict(X_test), y_true=y_test)* 100, 2)
recall_logistic_regression2 = round(recall_score(y_pred=logistic_regression2.predict(X_test), y_true=y_test)* 100, 2)
f1_logistic_regression2 = round(f1_score(y_pred=logistic_regression2.predict(X_test), y_true=y_test)* 100, 2)

print_score(logistic_regression2, X_train, y_train, X_test, y_test, train=True)
print_score(logistic_regression2, X_train, y_train, X_test, y_test, train=False)

res_2 = cross_val_score(logistic_regression2, X_train, y_train, scoring = 'accuracy', cv = 10)

## Confusion Matrix

In [None]:
# The confusion matrix
sns.set(font_scale = 1.5)
logistic_regression_cm = confusion_matrix(y_test, 
                                          logistic_regression2.predict(X_test))
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(logistic_regression_cm, 
            annot=True, 
            linewidth=0.7, 
            linecolor='black', 
            fmt='g', 
            ax=ax, 
            cmap="BuPu")
plt.title('Logistic Regression - Classification Confusion Matrix')
plt.xlabel('Y predict')
plt.ylabel('Y test')
plt.show()

In [None]:
probs = logistic_regression2.predict_proba(X_test)
# Calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = roc_curve(y_test, probs[:,1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

<b>Review: </b>After a second model run, I am going to drop the following columns and re-run:

1) Vol_roe_Variance
2) GIC_SubIndustry_Heavy

# Logistic Regression - Third Run

In [None]:
df4 = df3.drop(columns=['Vol_emp_Variance', 'Vol_wcap_Variance', 'StdDev_wcap', 'StdDev_cogs','GIC_SI_Construction_Engineering',])

## Review Data Correlations

In [None]:
corr_matrix = df4.corr()
mask = np.zeros_like(corr_matrix, dtype=np.bool)
mask[np.triu_indices_from(mask)]= True

f, ax = plt.subplots(figsize=(20, 20))

sns.set(font_scale = 1.3)

heatmap = sns.heatmap(corr_matrix,
                      mask = mask,
                      square = True,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .4,
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1,
                      vmax = 1,
                      annot = True)


#add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns)

sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

In [None]:
# Output any correlated values over 0.65 - isolating high-correlated variables to remove from the analysis
def high_corr_and_check(X):
    corr_matrix = X.corr().abs()
    sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), 
                                     k=1).astype(np.bool))
                      .stack()
                      .sort_values(ascending=False))
    for index, value in sol.items():
        if value > 0.65:
            print(index,value)
            
high_corr_and_check(df4)

<b>Action: </b>There are no highly correlated variables I believe would help the analysis to remove.

In [None]:
df4.info()

## Split the Dataset

In [None]:
x = df4.drop(columns='suitflag')

y = df4[['suitflag']] 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y,
                                                   test_size=0.46,
                                                   random_state=16,
                                                   stratify=y)

print('X_train shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)

## Model

In [None]:
# Logistic regression model
X_train_sm = sm.add_constant(X_train)

logm3 = sm.GLM(y_train,(sm.add_constant(X_train)), 
               family = sm.families.Binomial())
print(logm3.fit().summary())

res = logm3.fit()

In [None]:
logistic_regression3 = LogisticRegression(n_jobs=-1, random_state=15)
logistic_regression3.fit(X_train, y_train)

acc_logistic_regression3 = round(logistic_regression3.score(X_test, y_test) * 100, 2)
prec_logistic_regression3 = round(precision_score(y_pred=logistic_regression3.predict(X_test), y_true=y_test)* 100, 2)
recall_logistic_regression3 = round(recall_score(y_pred=logistic_regression3.predict(X_test), y_true=y_test)* 100, 2)
f1_logistic_regression3 = round(f1_score(y_pred=logistic_regression3.predict(X_test), y_true=y_test)* 100, 2)

print_score(logistic_regression3, X_train, y_train, X_test, y_test, train=True)
print_score(logistic_regression3, X_train, y_train, X_test, y_test, train=False)

res_3 = cross_val_score(logistic_regression3, X_train, y_train, scoring = 'accuracy', cv = 10)

## Confusion Matrix

In [None]:
# The confusion matrix
sns.set(font_scale = 1.5)
logistic_regression_cm = confusion_matrix(y_test, 
                                          logistic_regression3.predict(X_test))
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(logistic_regression_cm, 
            annot=True, 
            linewidth=0.7, 
            linecolor='black', 
            fmt='g', 
            ax=ax, 
            cmap="BuPu")
plt.title('Logistic Regression - Classification Confusion Matrix')
plt.xlabel('Y predict')
plt.ylabel('Y test')
plt.show()

<b>Review: </b>After a third model run, all variable remain significant and thus the analysis will conclude and the confusion matrix reviewed.

In [None]:
probs = logistic_regression3.predict_proba(X_test)
# Calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = roc_curve(y_test, probs[:,1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Model Evaluation

In [None]:
# Create a list which contains classifiers 
classifiers = []
classifiers.append(logistic_regression1)
classifiers.append(logistic_regression2)
classifiers.append(logistic_regression3)
print('Number of Classifiers: ',len(classifiers))

# Number of Cross Validations
cv = 10
print('Number of Cross Validations: ', cv, '\n','-'*40)

# Create a list which contains cross validation results for each classifier
cv_results = []
cv_results.append(res_1)
cv_results.append(res_2)
cv_results.append(res_3)

# for classifier in classifiers:
#     cv_results.append(cross_val_score(classifier, X_train, y_train, scoring = 'accuracy', cv = 10))
    
# Mean and standard deviation of cross validation results for each classifier  
cv_mean = []
cv_std = []
for cv_result in cv_results:
    cv_mean.append(round(cv_result.mean()*100,2))
    cv_std.append(round(cv_result.std(),3))

algos = ['Logistic Regression - Initial Run',
         'Logistic Regression - Second Run',
         'Logistic Regression - Third Run'
        ]

acc_scores = [acc_logistic_regression1,
              acc_logistic_regression2,
              acc_logistic_regression3
             ]

prec_scores = [prec_logistic_regression1,
               prec_logistic_regression2,
               prec_logistic_regression3
              ]

recall_scores = [recall_logistic_regression1,
                 recall_logistic_regression2,
                 recall_logistic_regression3
                ]

f1_scores = [f1_logistic_regression1,
             f1_logistic_regression2,
             f1_logistic_regression3
            ]

    
cv_res = pd.DataFrame({'Algorithm': algos,
                       'Initial Accuracy Scores': acc_scores,
                       'Cross Validation Mean': cv_mean, 
                       'Cross Validation Std': cv_std,
                       'Precision Score': prec_scores,
                       'Recall Scores': recall_scores,
                       'F1 Scores': f1_scores
                       })

cv_res.sort_values(by = 'F1 Scores', ascending = False).set_index('Algorithm')

In [None]:
cv_res1 = cv_res.drop(columns='Cross Validation Std').set_index('Algorithm').T
sns.set(font_scale = 1.8)
fig, ax = plt.subplots(figsize=(24,8))
cv_res1.plot(kind='bar', ax=ax)
plt.xticks(rotation = 45)
plt.ylabel('Percent')
plt.legend(loc='top right')
show_values(ax)
plt.ylim(0, 100)
ax.grid();

## Visualize F1 Scores to Identify the Best Model

In [None]:
sns.barplot('F1 Scores', 
            'Algorithm', 
            data = cv_res, 
            order = cv_res.sort_values(by = 'F1 Scores', 
                                       ascending = False)['Algorithm'], 
            palette = 'Set3', 
            **{'xerr': cv_std})

plt.ylabel('Algorithm')
plt.title('F1 Scores')

## Receiver Operating Characteristic Curve

In [None]:
probs = logistic_regression3.predict_proba(X_test)
# Calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = roc_curve(y_test, probs[:,1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Combine Prediction Results to Test Dataset

In [None]:
x

In [None]:
Y_pred = logistic_regression3.predict_proba(x)[:, 1]
Y_pred

In [None]:
df.reset_index(inplace=True)
df.reset_index(inplace=True)

In [None]:
df.set_index('gvkey', inplace=True)

In [None]:
print('The number of companies in this prediction is: ' ,len(Y_pred))

In [None]:
submit = pd.DataFrame({'index': df['index'], 'suit_pred': Y_pred})
submit = submit[['suit_pred']]
submit

In [None]:
df = df.join(submit)
df.drop(columns='index', inplace=True)
df

In [None]:
df['suitflag']

In [None]:
def custom_round(x, base=5):
    return int(base * round(float(x)/base))

In [None]:
df['Prediction_Score'] = df['suit_pred']*100
df['Prediction_Score'] = df['Prediction_Score'].apply(lambda x: custom_round(x, base=5))

In [None]:
# df['Prediction_Score'] = round(df['suit_pred']*100, 2)
pd.value_counts(df['Prediction_Score'], dropna=False)

In [None]:
df.hist('suit_pred', bins=10)

In [None]:
cv_res1 = cv_res.drop(columns='Cross Validation Std').set_index('Algorithm').T
sns.set(font_scale = 1.8)
fig, ax = plt.subplots(figsize=(24,8))
cv_res1.plot(kind='bar', ax=ax)
plt.xticks(rotation = 45)
plt.ylabel('Percent')
plt.legend(loc='top right')
show_values(ax)
plt.ylim(0, 100)
ax.grid();

In [None]:
plt.figure(figsize = (25,10))
sns.set(font_scale = 2.5)


var = df['Prediction_Score']
ax = sns.countplot(x = var,
                   data = df, )

show_values(ax)
plt.ylim(0, 150)

plt.ylabel('Company Count')
plt.xlabel('Prediction Scores \n(in %)')
# plt.title('Company Count by Prediction Score')
plt.xticks(rotation = 0)

# Model Interpretation

In [2]:
model  = pd.DataFrame({"Features": X_train_sm.columns,"Coefficient":res_3.params.values})
model["Odds_Ratio"] = model["Coefficient"].apply(lambda x: np.exp(x))
model[["Coefficient","Odds_Ratio"]] = model[["Coefficient","Odds_Ratio"]].apply(lambda x: round(x,2))
model["Perc_Impact"] = model["Odds_Ratio"].apply(lambda x: (x-1)*100)
model = model.loc[model['Features']!='const'].sort_values(by='Odds_Ratio', ascending=False)
model

NameError: name 'pd' is not defined

In [None]:
model_int = model.drop(columns=['Perc_Impact', 'Coefficient']).set_index('Features').T
model_int= model_int[['Vol_sale_Variance', 'StdDev_capx', 'StdDev_txt', 'GIC_SI_Heavy_Electrical_Equipment']]

sns.set(font_scale = 1.8)
fig, ax = plt.subplots(figsize=(24,8))
model_int.plot(kind='bar', ax=ax)
plt.xticks(rotation = 45)
plt.ylabel('Percent')
plt.legend(loc='top right')
ax.grid();

In [None]:
df

In [None]:
df.to_csv('../01_data/03_final/suit_prediction_final.csv')

# END