# Import Packages and Data set

In [None]:
# EDA - Data Cleaning
import pandas as pd
import numpy as np
import missingno
from collections import Counter
import math

# EDA - Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
%matplotlib inline
import statsmodels.api as stats
from statsmodels.graphics.gofplots import ProbPlot
plt.style.use('seaborn') # pretty matplotlib plots
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
plt.rc('axes', labelsize=15)
plt.rc('axes', titlesize=18)

#Modeling
from sklearn import (datasets,
                     metrics,
                     model_selection as skms,
                     naive_bayes,
                     neighbors)

from sklearn.linear_model import (LogisticRegression,
                                 SGDClassifier)

from sklearn.model_selection import (cross_val_score,
                                     cross_val_predict,
                                     train_test_split,
                                     GridSearchCV)
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             confusion_matrix,
                             f1_score,
                             roc_curve,
                             auc,
                             classification_report,
                             precision_recall_curve)

from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              ExtraTreesClassifier,
                              GradientBoostingClassifier)

from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE

from statsmodels.stats.outliers_influence import variance_inflation_factor

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import RidgeClassifierCV 

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

# logistic regression model
import statsmodels.api as sm 

pd.set_option('display.max_rows', 90)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

pd.options.display.float_format = '{:.2f}'.format

# Define Functions

In [None]:
def test_train_summary(X_train, y_train):
    """Print class-balance summaries for the training, SMOTE and test targets.

    Parameters
    ----------
    X_train : DataFrame
        Training features; only its column count is reported.
    y_train : DataFrame
        Training target with a 0/1 ``suitflag`` column.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    ``y_train_smote`` and ``y_test`` are not parameters: they are read from
    the notebook namespace and must exist before this function is called.
    """
    # Count columns on the frame that was passed in, not on the notebook-level `x`.
    n_cols = len(X_train.columns)

    train_total = len(y_train)
    train_sued = sum(y_train['suitflag'])
    smote_total = len(y_train_smote)
    smote_sued = sum(y_train_smote['suitflag'])
    test_total = len(y_test)
    test_sued = sum(y_test['suitflag'])

    print('\nTraining Dataset - Before Synthetic Minority Oversampling Technique (SMOTE): ')
    print('   Number  of Companies sued:               {:6,.0f}'.format(train_sued))
    print('   Number of Companies not sued:            {:6,.0f}'.format(train_total - train_sued))
    print('   Total Companies:                         {:6,.0f}'.format(train_total))
    print('   Percent of Companies who Filed Suit:    {:6,.0f}%'.format(train_sued / train_total * 100))
    print('   Columns and Row count Review: ')
    print('      Column count:                         {:6,.0f}'.format(n_cols))
    print('      Column-to-Row ratio:                 {:6,.0f}%'.format(n_cols / train_total * 100)+'\n')

    print('Training Dataset - After SMOTE: ')
    print('   Training Dataset - Overview: ')
    print('   Number  of Companies sued (Increase):    {:6,.0f}'.format(smote_sued))
    print('   Number of Companies not sued:            {:6,.0f}'.format(smote_total - smote_sued))
    print('   Percent of Companies who Filed Suit:    {:6,.0f}%'.format(smote_sued / smote_total * 100))
    print('   Columns and Row count Review: ')
    print('      Column count:                         {:6,.0f}'.format(n_cols))
    # The ratio after oversampling must use the oversampled row count.
    print('      Column-to-Row ratio:                 {:6,.0f}%'.format(n_cols / smote_total * 100)+'\n')

    print('Test Dataset - Overview: ')
    print('   Number  of Companies sued:               {:6,.0f}'.format(test_sued))
    print('   Number of Companies not sued:            {:6,.0f}'.format(test_total - test_sued))
    print('   Total Companies:                         {:6,.0f}'.format(test_total))
    print('   Percent of Companies who Filed Suit:    {:6,.0f}%'.format(test_sued / test_total * 100))
    

    
    
def model_run(X_train, y_train, model_run, results_review):
    """Print and plot evaluation results for an already-fitted classifier.

    Parameters
    ----------
    X_train, y_train :
        Training features and target; evaluated when ``results_review`` is
        ``'train'``.
    model_run :
        Fitted estimator exposing ``predict``, ``predict_proba`` and ``score``.
        Inside the body this parameter shadows the function's own name.
    results_review : str
        ``'train'`` evaluates the training data passed in; any other value
        evaluates the notebook-level ``X_test`` / ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each a percentage rounded to
        two decimals.

    Notes
    -----
    ``X_test`` and ``y_test`` are read from the notebook namespace. The ROC
    curve is always drawn for that test data, whichever split is reviewed.
    """
    model = model_run  # alias: the parameter name hides the function name here

    if results_review == 'train':
        X_eval, y_eval = X_train, y_train
    else:
        # Not parameters; supplied by the notebook namespace.
        X_eval, y_eval = X_test, y_test

    # Predict here rather than reading the model_pred_train / model_pred_test
    # globals, so every metric belongs to this model and this split.
    y_pred = model.predict(X_eval)

    # All metrics use the split under review (previously the 'train' branch
    # scored accuracy on the test data and the 'test' branch scored ROC AUC
    # on the training data).
    accuracy = round(model.score(X_eval, y_eval) * 100, 2)
    prec_score = round(precision_score(y_true=y_eval, y_pred=y_pred) * 100, 2)
    recall = round(recall_score(y_true=y_eval, y_pred=y_pred) * 100, 2)
    f1 = round(f1_score(y_true=y_eval, y_pred=y_pred) * 100, 2)
    # As before, the AUC is computed from hard class predictions.
    roc_auc = metrics.roc_auc_score(y_eval, y_pred) * 100

    print('Summary of Modeled Results: ')
    print('   General Accuracy: {:6,.1f}%'.format(accuracy))
    print('   ROC AUC Score:    {:6,.1f}%'.format(roc_auc))
    print('   Precision Score:  {:6,.1f}%'.format(prec_score))
    print('   Recall Score:     {:6,.1f}%'.format(recall))
    print('   F1 Score:         {:6,.1f}%'.format(f1)+'\n')

    # The confusion matrix
    sns.set(font_scale = 1.5)
    cm = confusion_matrix(y_eval, y_pred)
    f, ax = plt.subplots(figsize=(5,5))
    sns.heatmap(cm,
                annot=True,
                linewidth=0.7,
                linecolor='black',
                fmt='g',
                ax=ax,
                cmap="BuPu")
    plt.xlabel('Suit Prediction')
    plt.ylabel('Suit Actual')
    plt.show()

    # ROC curve on the test data, using the probability in the second column.
    probs = model.predict_proba(X_test)
    f, ax = plt.subplots(figsize=(5, 5))
    # Calculate the fpr and tpr for all thresholds of the classification
    fpr, tpr, threshold = roc_curve(y_test, probs[:,1])
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    return accuracy, prec_score, recall, f1




def show_values(axs, orient="v", space=.01):
    """Write each bar's value next to it on one Axes or an array of Axes.

    Parameters
    ----------
    axs :
        A single Axes-like object (with ``patches`` and ``text``) or a NumPy
        array of them.
    orient : str
        ``"v"`` labels bar heights above the bars; ``"h"`` labels bar widths
        to the right of the bars. Any other value draws nothing.
    space : float
        Horizontal gap added after the bar end; only used when ``orient``
        is ``"h"``.
    """
    def _label_bars(axis):
        for bar in axis.patches:
            if orient == "v":
                # Horizontally centred, 1% of the bar height above its top.
                x_pos = bar.get_x() + bar.get_width() / 2
                y_pos = bar.get_y() + bar.get_height() + (bar.get_height()*0.01)
                axis.text(x_pos, y_pos, '{:.0f}'.format(bar.get_height()), ha="center")
            elif orient == "h":
                # Just past the bar end, vertically at the bar's midpoint.
                x_pos = bar.get_x() + bar.get_width() + float(space)
                y_pos = bar.get_y() + bar.get_height() - (bar.get_height()*0.5)
                axis.text(x_pos, y_pos, '{:.0f}'.format(bar.get_width()), ha="left")

    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for axis in targets:
        _label_bars(axis)

# Import Dataset

In [None]:
# Load the prepared modeling dataset; the gvkey column becomes the row index.
df = pd.read_csv(
    '../01_data/02_modified/capstone_modeling_final1.csv',
    index_col='gvkey',
)

# Pre-Modeling Work - Final Prep

## Review Dependent Variable Split (DV)

In [None]:
# Pie chart showing the share of each suitflag value in the dataset.
sns.set(font_scale=1.7)
suit_counts = df['suitflag'].value_counts(sort=True)
labels = suit_counts.index
sizes = suit_counts
colors = ["lightsteelblue", "salmon"]
explode = (0.1, 0)  # pull the first (most frequent) slice out of the pie

rcParams['figure.figsize'] = 8, 8

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=270,
)
plt.title('Percent of Customers Who Have Had a Suit Filed Against Them - 0 = No; 1 = Yes')
plt.show()

## Generate Dummy Variables

In [None]:
# Save a copy of the dataset as loaded. index=False leaves the row index out of the
# file, so the gvkey index set when the data was read is not written.
# NOTE(review): confirm that dropping gvkey from this export is intended.
df.to_csv('capstone_modeling1.csv', index=False)

In [None]:
# Columns to cast to string.
cols = ['restatementflag']

# Iterate over the column names directly rather than indexing by position.
for col in cols:
    df[col] = df[col].astype(str)

In [None]:
# Split the object-dtype columns into those with exactly two distinct values
# and those with any other number of distinct values.
binary_cols = []
multi_Value = []
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    bucket = binary_cols if df[col].unique().shape[0] == 2 else multi_Value
    bucket.append(col)

print('Multi-value columns include: ', multi_Value)
print('Binary-value columns include: ', binary_cols)

In [None]:
# Show the distinct values held by every object-dtype column.
object_cols = df.dtypes[df.dtypes == object].index
for col in object_cols:
    distinct = df[col].unique()
    print(col, '\n', distinct)

In [None]:
# Encode idbflag as Foreign_and_Domestic_indicator ('B' -> 1, 'D' -> 0),
# then remove the source column.
idb_codes = {'D': 0, 'B': 1}
df['Foreign_and_Domestic_indicator'] = df['idbflag'].replace(idb_codes)
del df['idbflag']

In [None]:
# Replace ' & ' and any remaining single space with an underscore in both GIC label columns.
separators = {r' & ': '_', r' ': '_'}
for gic_col in ('GIC_SubIndustry', 'GIC_Industry'):
    df[gic_col] = df[gic_col].replace(separators, regex=True)

In [None]:
# GIC_Industry = pd.get_dummies(df['GIC_Industry'], prefix='GIC_Industry').drop(columns=['GIC_Industry_Aeorspace & Defense'])
# df1 = df.join(GIC_Industry)


# One-hot encode the sub-industry and drop the Aerospace_Defense level
# (presumably kept out as the reference category), then attach the dummies to df.
sub_industry_dummies = pd.get_dummies(df['GIC_SubIndustry'], prefix='GIC_SI')
GIC_SubIndustry = sub_industry_dummies.drop(columns=['GIC_SI_Aerospace_Defense'])
df1 = df.join(GIC_SubIndustry)

# stko = pd.get_dummies(df['stko'], prefix='stko').drop(columns=['stko_0'])
# df1 = df1.join(stko)

In [None]:
# df1.head(3).T

In [None]:
# Remove the original GIC text columns from the frame that carries the dummy columns.
gic_text_cols = ['GIC_Industry', 'GIC_SubIndustry']
df2 = df1.drop(columns=gic_text_cols)

In [None]:
# Encode the target as 1 (Yes) / 0 (No).
df2['suitflag'] = df2['suitflag'].replace({'Yes':1, 'No':0})

# Drop the restatement flag, as it was only used in the prior steps when calculating
# restatement variances. The int cast that used to precede this was dead work on a
# column that is discarded here, and it could raise on a non-numeric string.
df2.drop(columns='restatementflag', inplace=True)

In [None]:
df2

## Review Data Correlations

In [None]:
# Print every pair of columns whose absolute correlation exceeds the threshold
# (0.65 by default), isolating highly correlated variables to remove from the analysis.
def high_corr_and_check(X, threshold=0.65):
    """Print column pairs of ``X`` whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    X : DataFrame
        Data whose pairwise column correlations are inspected.
    threshold : float
        Pairs are printed only when their absolute correlation is strictly
        greater than this value.

    Returns
    -------
    None
        Each qualifying pair is printed as ``(col_a, col_b) value``, highest
        correlation first.
    """
    corr_matrix = X.corr().abs()
    # Strict upper triangle: each pair appears once and the diagonal is skipped.
    # The builtin bool replaces the np.bool alias, which NumPy removed in 1.24.
    upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    sol = corr_matrix.where(upper).stack().sort_values(ascending=False)
    for index, value in sol.items():
        if value > threshold:
            print(index, value)
            
high_corr_and_check(df2)

<b>Action: </b>Remove the highly correlated variables whose removal I believe will help the analysis, then re-run the high_corr_and_check function.

In [None]:
# Columns removed after the correlation review above.
redundant_cols = [
    'StdDev_dp', 'StdDev_at', 'StdDev_cogs', 'Vol_xido_Variance', 'Vol_txt_Variance',
    'StdDev_teq', 'StdDev_txt', 'StdDev_capx', 'xsga_PercentChange', 'ppent_PercentChange',
    'cshfd_PercentChange', 'roa_PercentChange', 'Vol_xsga_Variance', 'Vol_ppent_Variance',
    'Vol_dltt_Variance', 'Vol_dp_Variance', 'Vol_xint_Variance', 'StdDev_sale', 'StdDev_xint',
    'StdDev_ppent', 'StdDev_xsga', 'Vol_cogs_Variance', 'StdDev_dltt', 'teq_PercentChange',
    'dltt_PercentChange', 'StdDev_ni',
]
df2.drop(columns=redundant_cols, inplace=True)

In [None]:
# Print every pair of columns whose absolute correlation exceeds the threshold
# (0.65 by default), isolating highly correlated variables to remove from the analysis.
def high_corr_and_check(X, threshold=0.65):
    """Print column pairs of ``X`` whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    X : DataFrame
        Data whose pairwise column correlations are inspected.
    threshold : float
        Pairs are printed only when their absolute correlation is strictly
        greater than this value.

    Returns
    -------
    None
        Each qualifying pair is printed as ``(col_a, col_b) value``, highest
        correlation first.
    """
    corr_matrix = X.corr().abs()
    # Strict upper triangle: each pair appears once and the diagonal is skipped.
    # The builtin bool replaces the np.bool alias, which NumPy removed in 1.24.
    upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    sol = corr_matrix.where(upper).stack().sort_values(ascending=False)
    for index, value in sol.items():
        if value > threshold:
            print(index, value)
            
high_corr_and_check(df2)

In [None]:
# Correlation heatmap of the remaining variables, lower triangle only.
corr_matrix = df2.corr()
# Hide the upper triangle and the diagonal. The builtin bool replaces the
# np.bool alias, which NumPy removed in 1.24.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)]= True

f, ax = plt.subplots(figsize=(20, 20))
# sns.set(font_scale = .6)

heatmap = sns.heatmap(corr_matrix,
                      mask = mask,
                      square = True,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .6,
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1,
                      vmax = 1,
                      annot = True,
                      annot_kws = {'size': 8})




#add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns, rotation = 45, horizontalalignment='right')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

# plt.savefig('corr_plot.png')

In [None]:
#Check to ensure all values are numeric and convert all as necessary
df2.info()

In [None]:
# List the columns that contain missing values with their null counts, largest first.
null_counts = df2.isnull().sum().sort_values(ascending=False)
nulls = pd.DataFrame(null_counts, columns=['Amount'])
nulls = nulls.loc[nulls['Amount'] > 0]
nulls.index.name = 'Columns With Missing Values'
nulls

In [None]:
df2.fillna(0, inplace=True)

In [None]:
# Re-check for columns with missing values; the table lists any that remain.
null_counts = df2.isnull().sum().sort_values(ascending=False)
nulls = pd.DataFrame(null_counts, columns=['Amount'])
nulls = nulls.loc[nulls['Amount'] > 0]
nulls.index.name = 'Columns With Missing Values'
nulls

In [None]:
df2.describe(include='all').T

## Split the Dataset

In [None]:
# Separate the features from the target (kept as a one-column DataFrame).
x = df2.drop(columns=['suitflag'])
y = df2[['suitflag']]

# Split data by 80/20. A fixed random_state makes the split reproducible between
# runs, and stratify=y keeps the share of each suitflag value equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=30, stratify=y
)

# Add SMOTE on the training partition only. The sampler is named `smote` because
# `sm` is already the statsmodels alias imported at the top of the notebook.
smote = SMOTE(random_state = 30)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
x[x.index.isin(['3580'])]

In [None]:
test_train_summary(X_train, y_train)

# Model Runs

## Testing the Models

### Logistic Regression

In [None]:
# Baseline logistic regression fitted on the original (non-resampled) training data.
logistic_regression = LogisticRegression(n_jobs=-1, random_state=15)
logistic_regression.fit(X_train, y_train)

# Predictions for the training and test partitions.
model_pred_train = logistic_regression.predict(X_train)
model_pred_test = logistic_regression.predict(X_test)

# 10-fold cross-validated accuracy on the training data.
res_1 = cross_val_score(logistic_regression, X_train, y_train, scoring='accuracy', cv=10)

# Review the 'test' results and keep the four returned metrics for the comparison table.
(acc_logistic_regression,
 prec_logistic_regression,
 recall_logistic_regression,
 f1_logistic_regression) = model_run(X_train, y_train, logistic_regression, 'test')

### Logistic Regression - with SMOTE

In [None]:
# Logistic regression fitted on the SMOTE-resampled training data.
logistic_regression_sm = LogisticRegression(n_jobs=-1, random_state=15)
logistic_regression_sm.fit(X_train_smote, y_train_smote)

# Predictions for the resampled training data and the test partition.
model_pred_train = logistic_regression_sm.predict(X_train_smote)
model_pred_test = logistic_regression_sm.predict(X_test)

# 10-fold cross-validated accuracy on the resampled training data.
# NOTE(review): the folds are cut from data that was already oversampled — confirm
# this is the intended validation scheme.
res_2 = cross_val_score(logistic_regression_sm, X_train_smote, y_train_smote, scoring='accuracy', cv=10)

# Review the 'test' results and keep the four returned metrics for the comparison table.
(acc_logistic_regression_sm,
 prec_logistic_regression_sm,
 recall_logistic_regression_sm,
 f1_logistic_regression_sm) = model_run(X_train_smote, y_train_smote, logistic_regression_sm, 'test')

### Logistic Regression - with SMOTE and hyperparameters tuned

In [None]:
LR = LogisticRegression(n_jobs=-1)

# Search space. The 'sag' solver only supports the l2 penalty, so it gets its own
# sub-grid; pairing it with 'l1' only produced fits that fail. Both sub-grids expose
# the same four keys (C, penalty, random_state, solver), so best_params_ is unchanged
# in shape.
# NOTE(review): random_state is searched like a hyperparameter here — confirm intent.
c_values = np.logspace(-3, 3, 7)
seeds = tuple(range(16))
parameters = [
    {'C': c_values, 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'random_state': seeds},
    {'C': c_values, 'penalty': ['l2'], 'solver': ['sag'], 'random_state': seeds},
]


grid_LR = GridSearchCV(estimator=LR, param_grid = parameters, cv = 2, n_jobs=-1)
grid_LR.fit(X_train_smote, y_train_smote)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_LR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_LR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_LR.best_params_)

In [None]:
# Pull each tuned value out of best_params_ by its key, so the assignment does not
# depend on the order in which the dictionary happens to list its entries.
print('Below are the variables to be used based on the gridsearch hyperparamter technique applied: ')
best_lr_params = grid_LR.best_params_
C = best_lr_params['C']
print('   C = ', C)
penalty = best_lr_params['penalty']
print('   penalty = ', penalty)
random_state = best_lr_params['random_state']
print('   random_state = ', random_state)
solver = best_lr_params['solver']
print('   solver = ', solver)

In [None]:
# Logistic regression with the grid-searched settings, fitted on the resampled training data.
logistic_regression_sm_hp = LogisticRegression(
    C=C,
    penalty=penalty,
    random_state=random_state,
    solver=solver,
    n_jobs=-1,
)
logistic_regression_sm_hp.fit(X_train_smote, y_train_smote)

# Predictions for the resampled training data and the test partition.
model_pred_train = logistic_regression_sm_hp.predict(X_train_smote)
model_pred_test = logistic_regression_sm_hp.predict(X_test)

# 10-fold cross-validated accuracy on the resampled training data.
res_3 = cross_val_score(logistic_regression_sm_hp, X_train_smote, y_train_smote, scoring='accuracy', cv=10)

# Review the 'test' results and keep the four returned metrics for the comparison table.
(acc_logistic_regression_sm_hp,
 prec_logistic_regression_sm_hp,
 recall_logistic_regression_sm_hp,
 f1_logistic_regression_sm_hp) = model_run(X_train_smote, y_train_smote, logistic_regression_sm_hp, 'test')

### Random Forest - with SMOTE

In [None]:
# Random forest with default settings, fitted on the resampled training data.
random_forest_sm = RandomForestClassifier()
random_forest_sm.fit(X_train_smote, y_train_smote)

# Predictions for the resampled training data and the test partition.
model_pred_train = random_forest_sm.predict(X_train_smote)
model_pred_test = random_forest_sm.predict(X_test)

# 10-fold cross-validated accuracy on the resampled training data.
res_4 = cross_val_score(random_forest_sm, X_train_smote, y_train_smote, scoring='accuracy', cv=10)

# Review the 'test' results and keep the four returned metrics for the comparison table.
(acc_random_forest_sm,
 prec_random_forest_sm,
 recall_random_forest_sm,
 f1_random_forest_sm) = model_run(X_train_smote, y_train_smote, random_forest_sm, 'test')

### Random Forest - with SMOTE and hyperparameters tuned

In [None]:
from sklearn import tree
# GridSearchCV clones and fits the estimator itself, so it is passed unfitted.
rand_for = RandomForestClassifier()

# 'auto' is left out of max_features: for a random forest classifier it meant the
# same as 'sqrt', and current scikit-learn releases reject it.
parameters = {'n_estimators': [200, 300, 400, 500], 'max_features': ['sqrt', 'log2'],
               'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']
             }


grid = GridSearchCV(estimator=rand_for, param_grid = parameters, cv = 2, n_jobs=-1)
grid.fit(X_train_smote, y_train_smote)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)

In [None]:
# Pull each tuned value out of best_params_ by its key, so the assignment does not
# depend on the order in which the dictionary happens to list its entries.
print('Below are the variables to be used based on the gridsearch hyperparamter technique applied: ')
best_rf_params = grid.best_params_
criterion = best_rf_params['criterion']
print('   criterion = ', criterion)
max_depth = best_rf_params['max_depth']
print('   max_depth = ', max_depth)
max_features = best_rf_params['max_features']
print('   max_features = ', max_features)
n_estimators = best_rf_params['n_estimators']
print('   n_estimators = ', n_estimators)

In [None]:
# Random forest with the grid-searched settings, fitted on the resampled training data.
random_forest_sm_hp = RandomForestClassifier(
    criterion=criterion,
    max_depth=max_depth,
    max_features=max_features,
    n_estimators=n_estimators,
)
random_forest_sm_hp.fit(X_train_smote, y_train_smote)

# Predictions for the resampled training data and the test partition.
model_pred_train = random_forest_sm_hp.predict(X_train_smote)
model_pred_test = random_forest_sm_hp.predict(X_test)

# 10-fold cross-validated accuracy on the resampled training data.
res_5 = cross_val_score(random_forest_sm_hp, X_train_smote, y_train_smote, scoring='accuracy', cv=10)

# Review the 'test' results and keep the four returned metrics for the comparison table.
(acc_random_forest_sm_hp,
 prec_random_forest_sm_hp,
 recall_random_forest_sm_hp,
 f1_random_forest_sm_hp) = model_run(X_train_smote, y_train_smote, random_forest_sm_hp, 'test')

## Testing - Model Evaluation

In [None]:
# The fitted classifiers, in the order used by every list below.
classifiers = [
    logistic_regression,
    logistic_regression_sm,
    logistic_regression_sm_hp,
    random_forest_sm,
    random_forest_sm_hp,
]
print('Number of Classifiers: ', len(classifiers))

# Number of Cross Validations
cv = 10
print('Number of Cross Validations: ', cv, '\n', '-'*40)

# Cross-validation score arrays collected when each model was fitted.
cv_results = [res_1, res_2, res_3, res_4, res_5]

# Mean (as a percentage) and standard deviation of each model's cross-validation scores.
cv_mean = [round(scores.mean()*100, 2) for scores in cv_results]
cv_std = [round(scores.std(), 3) for scores in cv_results]

algos = [
    'Logistic Regression - Initial Run',
    'Logistic Regression - With SMOTE',
    'Logistic Regression - With SMOTE and Hyperparamters Tuned',
    'Random Forest - With SMOTE',
    'Random Forest - With SMOTE and Hyperparamters Tuned',
]

acc_scores = [
    acc_logistic_regression,
    acc_logistic_regression_sm,
    acc_logistic_regression_sm_hp,
    acc_random_forest_sm,
    acc_random_forest_sm_hp,
]

prec_scores = [
    prec_logistic_regression,
    prec_logistic_regression_sm,
    prec_logistic_regression_sm_hp,
    prec_random_forest_sm,
    prec_random_forest_sm_hp,
]

recall_scores = [
    recall_logistic_regression,
    recall_logistic_regression_sm,
    recall_logistic_regression_sm_hp,
    recall_random_forest_sm,
    recall_random_forest_sm_hp,
]

f1_scores = [
    f1_logistic_regression,
    f1_logistic_regression_sm,
    f1_logistic_regression_sm_hp,
    f1_random_forest_sm,
    f1_random_forest_sm_hp,
]

# One row per model; columns appear in the order listed here.
summary_columns = {
    'Algorithm': algos,
    'Initial Accuracy Scores': acc_scores,
    'Cross Validation Mean': cv_mean,
    'Cross Validation Std': cv_std,
    'Precision Score': prec_scores,
    'Recall Scores': recall_scores,
    'F1 Scores': f1_scores,
}
cv_res = pd.DataFrame(summary_columns)

# Display the comparison table, best F1 score first.
cv_res.sort_values(by='F1 Scores', ascending=False).set_index('Algorithm')

In [None]:
# Grouped bar chart: one group per metric, one bar per algorithm.
cv_res1 = cv_res.drop(columns='Cross Validation Std').set_index('Algorithm').T
sns.set(font_scale = 1.7)
fig, ax = plt.subplots(figsize=(40,18))
cv_res1.plot(kind='bar', ax=ax)
plt.xticks(rotation = 45)
plt.ylabel('Percent')
# 'upper right' is the valid Matplotlib location string; 'top right' is not recognised.
plt.legend(loc='upper right')
show_values(ax)
plt.ylim(0, 110)
ax.grid();

## Visualize F1 Scores to Identify the Best Model

In [None]:
# Rank the models once so the bar order and the error bars use the same ordering.
ranked = cv_res.sort_values(by = 'F1 Scores', ascending = False)

# x / y are passed by keyword: current seaborn releases no longer accept them positionally.
# NOTE(review): the error bars are the cross-validation std of accuracy (a fraction),
# drawn on an F1 axis in percent — confirm this is the intended spread to show.
sns.barplot(x = 'F1 Scores',
            y = 'Algorithm',
            data = cv_res,
            order = ranked['Algorithm'],
            palette = 'Set3',
            xerr = list(ranked['Cross Validation Std']))

plt.ylabel('Algorithm')
plt.title('F1 Scores')

## Receiver Operating Characteristic Curve

In [None]:
selected_model = logistic_regression_sm

In [None]:
# ROC curve of the selected model on the test partition.
probs = selected_model.predict_proba(X_test)
second_class_scores = probs[:, 1]  # probability reported in the second column

# False / true positive rates across all classification thresholds.
fpr, tpr, threshold = roc_curve(y_test, second_class_scores)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Combine Prediction Results with the Full Dataset

In [None]:
x

In [None]:
Y_pred = selected_model.predict_proba(x)[:, 1]
# Y_pred

In [None]:
# The first reset moves the current row index into a regular column.
df.reset_index(inplace=True)
# The second reset adds the resulting positional index as a new column (pandas names
# it 'index' by default); the 'index' column is read by a later cell.
df.reset_index(inplace=True)

In [None]:
df.set_index('gvkey', inplace=True)

In [None]:
print('The number of companies in this prediction is: ' ,len(Y_pred))

In [None]:
submit = pd.DataFrame({'index': df['index'], 'suit_pred': Y_pred})
submit = submit[['suit_pred']]
submit

In [None]:
df = df.join(submit)
df.drop(columns='index', inplace=True)
df

In [None]:
df['suitflag']

In [None]:
def custom_round(x, base=5):
    """Round ``x`` to a multiple of ``base`` and return it as an int.

    ``x`` is converted with ``float`` first, so numeric strings are accepted.
    The number of ``base`` steps is chosen with the builtin ``round``.
    """
    steps = round(float(x) / base)
    return int(base * steps)

In [None]:
# Express the predicted probability as a percentage, rounded by custom_round to a multiple of 5.
df['Prediction_Score'] = (df['suit_pred'] * 100).apply(custom_round, base=5)

In [None]:
# df['Prediction_Score'] = round(df['suit_pred']*100, 2)
# Count companies per score (missing scores included). The Series method replaces the
# top-level pd.value_counts function, which pandas has deprecated.
df['Prediction_Score'].value_counts(dropna=False)

In [None]:
df.hist('suit_pred', bins=10)

In [None]:
# plt.figure(figsize = (25,10))
# sns.set(font_scale = 2.5)


# var = df['Prediction_Score']
# ax = sns.countplot(x = var,
#                    data = df, )

# show_values(ax)
# plt.ylim(0, 150)

# plt.ylabel('Company Count')
# plt.xlabel('Prediction Scores \n(in %)')
# # plt.title('Company Count by Prediction Score')
# plt.xticks(rotation = 0)

# Model Interpretation

In [None]:
# res = cross_val_score(logistic_regression_sm, X_test, y_test, cv=cv, scoring='accuracy')
# model  = pd.DataFrame({"Features": X_train_smote.columns,"Coefficient":res.params.values})
# model["Odds_Ratio"] = model["Coefficient"].apply(lambda x: np.exp(x))
# model[["Coefficient","Odds_Ratio"]] = model[["Coefficient","Odds_Ratio"]].apply(lambda x: round(x,2))
# model["Perc_Impact"] = model["Odds_Ratio"].apply(lambda x: (x-1)*100)
# model = model.loc[model['Features']!='const'].sort_values(by='Odds_Ratio', ascending=False)
# model

In [None]:
x[x.index.isin(['3580'])]

In [None]:
logistic_regression_sm_hp.predict_proba(x[x.index.isin(['3580'])])[:,1]

In [None]:
logistic_regression_sm.predict_proba(x[x.index.isin(['3580'])])[:,1]

In [None]:
logistic_regression.predict_proba(x[x.index.isin(['3580'])])[:,1]

In [None]:
# Coefficients of the SMOTE logistic regression, one row per feature, largest first.
coef = pd.DataFrame(logistic_regression_sm.coef_[0], index=x.columns, columns=['Coef'])
coef = coef.sort_values('Coef', ascending=False)

# Show only the coefficients whose magnitude is above 0.2.
coef[abs(coef['Coef']) > .2]


In [None]:
# Keep only the features whose coefficient is positive.
# NOTE(review): features with negative coefficients are dropped regardless of
# their magnitude — confirm this is intended.
has_positive_coef = coef['Coef'] > 0.00
col_to_keep = list(coef[has_positive_coef].index)

# Restrict the resampled training data and the test data to those features.
X_train_smote2 = X_train_smote[col_to_keep]
X_test2 = X_test[col_to_keep]

In [None]:
# NOTE(review): model_run is defined in this notebook with four parameters
# (X_train, y_train, model_run, results_review). This call passes six arguments and an
# estimator class rather than a fitted model, so it appears to target a different
# signature — confirm and update before running.
model_run(X_train_smote2, X_test2, y_train_smote, y_test, DecisionTreeClassifier, 'test')

In [None]:
# Model Results WITH SMOTE
# NOTE(review): model_run is defined in this notebook with four parameters and returns
# four values. This call passes six arguments, an estimator class rather than a fitted
# model, and unpacks five results, so it appears to target a different signature —
# confirm and update before running.
logistic_regression2, acc_logistic_regression2, prec_logistic_regression2, recall_logistic_regression2, f1_logistic_regression2 = model_run(X_train_smote2, X_test2, y_train_smote, y_test, LogisticRegression, 'test')
#  = model_run(X_train_smote, X_test, y_train_smote, y_test, 'test')

In [None]:
# NOTE(review): model_run is defined in this notebook with four parameters
# (X_train, y_train, model_run, results_review). This call passes six arguments and an
# estimator class rather than a fitted model, so it appears to target a different
# signature — confirm and update before running.
model_run(X_train_smote2, X_test2, y_train_smote, y_test, RandomForestClassifier, 'test')

In [None]:
def visualize_features(model, feature_names):
    """Print the model's per-feature scores, largest first.

    Parameters
    ----------
    model :
        Fitted estimator exposing either ``feature_importances_`` or
        ``coef_``. For ``coef_`` the first row is used.
    feature_names :
        Names matching the model's features, in the same order.

    Raises
    ------
    AttributeError
        If the model exposes neither ``feature_importances_`` nor ``coef_``.
    """
    if hasattr(model, 'feature_importances_'):
        scores = np.asarray(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        # coef_ may be 1-D or (n_classes, n_features); use the first row.
        scores = np.atleast_2d(model.coef_)[0]
    else:
        raise AttributeError(
            'model exposes neither feature_importances_ nor coef_'
        )

    # Previously this printed an undefined name; build the series it meant to show.
    feature_imp = pd.Series(scores, index=list(feature_names)).sort_values(ascending=False)
    print(feature_imp)

In [None]:
visualize_features(logistic_regression, x.columns)

In [None]:
# folder_loc = '../01_data/02_modified/'
# X_train.to_csv(folder_loc+'X_train.csv')
# X_test.to_csv(folder_loc+'X_test.csv')
# y_train.to_csv(folder_loc+'y_train.csv')
# y_test.to_csv(folder_loc+'y_test.csv')
# X_train_smote.to_csv(folder_loc+'X_train_smote.csv', index=False)
# y_train_smote.to_csv(folder_loc+'y_train_smote.csv', index=False)

# Finish