In [None]:
# Import necessary packages and load
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Read the transformed development data from a CSV file.
# The path is relative, so it is resolved against the notebook's working directory.
dev_data = pd.read_csv('raw-data_assgn_transformed.csv')

In [None]:
# Dimensions of the loaded data as (number of rows, number of columns)
dev_data.shape

In [None]:
# Preview the first rows of the loaded data (head() shows five rows by default)
dev_data.head()

In [None]:
# Variable information: column names, non-null counts and dtypes
dev_data.info()

In [None]:
# Is there at least one missing value anywhere in the data?
# Build the boolean "is missing" mask and reduce it over all cells in one step.
dev_data.isna().to_numpy().any()

In [None]:
# Relative frequency of each value of the target column 'Default'
dev_data['Default'].value_counts(normalize=True)

In [None]:
# Absolute number of rows for each value of the target column 'Default'
dev_data['Default'].value_counts()
# 7.01% (169 out of 2,409) accounts / companies have Defaulted

In [None]:
# Columns used for the first modelling attempt: the target ('Default')
# followed by the full list of candidate ratio variables.
attempt1_columns = [
    'Default',
    'Total Income/Total assets',
    'Change in stock/Total Income',
    'Profit after tax/Total assets',
    'PBDITA/Total assets',
    'PBT/Total assets',
    'Cash profit as % of total income',
    'PBT as % of total income',
    'PAT as % of total income',
    'Sales/Total assets',
    'Income from financial services/Total Income',
    'Other income/Total Income',
    'Total capital/Total_Assets',
    'Reserves and funds/Total_Assets',
    'Borrowings/Total_Assets',
    'Current liabilities & provisions/Total_assets',
    'Deferred tax liability/Total_Assets',
    'Shareholders funds/Total_assets',
    'Cumulative retained profits/Total Income',
    'Capital employed/Total assets',
    'Contingent liabilities/Total Assets',
    'Net fixed assets/Total Assets',
    'Investments/Total Income',
    'Current assets/Total_Assets',
    'Net working capital/Total Capital',
    'Quick ratio (times)',
    'Current ratio (times)',
    'Debt to equity ratio (times)',
    'EPS',
]

# Restrict the development data to those columns and show the resulting shape
dev_data_select = dev_data[attempt1_columns]

dev_data_select.shape

In [None]:
# Name of the target column
Target_val = 'Default'

# Predictor names: every column of the selected development data except the target
train_features = list(dev_data_select.columns)
train_features.remove(Target_val)

features_columns = train_features

In [None]:
# First attempt: logistic regression of the target on all selected variables.
# NOTE(review): no constant column is added to the predictors, so the model is
# fitted without an intercept unless the data already contains one — confirm
# this is intended (statsmodels offers sm.add_constant for this).
endog_attempt1 = dev_data_select[Target_val]
exog_attempt1 = dev_data_select[features_columns]

logit_model_attempt1 = sm.Logit(endog_attempt1, exog_attempt1)
result_attempt1 = logit_model_attempt1.fit()

# Print the fitted model's summary table
print(result_attempt1.summary())

In [None]:
# Columns used for the second modelling attempt: the target plus a reduced
# set of seven predictors.
attempt2_columns = [
    'Default',
    'Cash profit as % of total income',
    'Shareholders funds/Total_assets',
    'Debt to equity ratio (times)',
    'Quick ratio (times)',
    'PBT as % of total income',
    'Change in stock/Total Income',
    'Deferred tax liability/Total_Assets',
]
dev_data_select2 = dev_data[attempt2_columns]

# Predictor names: every selected column except the target
train_features = list(dev_data_select2.columns)
train_features.remove('Default')

features_columns = train_features

In [None]:
# Second attempt: logistic regression of the target on the reduced variable set.
# NOTE(review): as written, no constant column is added to the predictors — confirm
# that fitting without an intercept is intended.
endog_attempt2 = dev_data_select2[Target_val]
exog_attempt2 = dev_data_select2[features_columns]

logit_model_attempt2 = sm.Logit(endog_attempt2, exog_attempt2)
result_attempt2 = logit_model_attempt2.fit()

# Print the fitted model's summary table
print(result_attempt2.summary())

In [None]:
# The variables are now significant and their coefficients have the correct signs.
# But in order to balance the equation we might need to add variables for future
# expectations about the company.

# Work on an independent copy of the loaded data. pd.DataFrame(dev_data) is not a
# reliable copy: depending on the pandas version it either shares its internals
# with dev_data (so dev_data is silently modified) or it does not (so the new
# column never reaches dev_data). An explicit copy behaves the same everywhere
# and keeps this cell safe to re-run.
df = dev_data.copy()

# Create the new variable: cumulative retained profits relative to sales.
# NOTE(review): rows where 'Sales' is 0 would give inf/NaN here — confirm the
# data contains none before this column is fed to the model.
df['Cumulative retained profits / Sales'] = df['Cumulative retained profits'] / df['Sales']

# Plot from df, the frame that actually holds the new column
# (the original plotted from dev_data, which is not guaranteed to have it).
sns.barplot(x='Default', y='Cumulative retained profits / Sales', data=df)

In [None]:
# Columns used for the third modelling attempt: the target, the seven predictors
# of the second attempt, and the 'Cumulative retained profits / Sales' ratio.
attempt3_columns = [
    'Default',
    'Cash profit as % of total income',
    'Shareholders funds/Total_assets',
    'Debt to equity ratio (times)',
    'Quick ratio (times)',
    'PBT as % of total income',
    'Change in stock/Total Income',
    'Deferred tax liability/Total_Assets',
    'Cumulative retained profits / Sales',
]
dev_data_select3 = df[attempt3_columns]

# Predictor names: every selected column except the target
train_features = list(dev_data_select3.columns)
train_features.remove('Default')

features_columns = train_features

# Preview the selected data
dev_data_select3.head()

In [None]:
# Third attempt: logistic regression of the target on the reduced variable set
# plus the retained-profits-to-sales ratio.
# NOTE(review): as written, no constant column is added to the predictors — confirm
# that fitting without an intercept is intended.
endog_attempt3 = dev_data_select3[Target_val]
exog_attempt3 = dev_data_select3[features_columns]

logit_model_attempt3 = sm.Logit(endog_attempt3, exog_attempt3)
result_attempt3 = logit_model_attempt3.fit()

# Print the fitted model's summary table
print(result_attempt3.summary())

In [None]:
# Compare the predicted probability of 20 companies with the Excel workings.
# Step 1: in-sample predicted probabilities from the third model, rounded to 4 decimals
predicted_probability = result_attempt3.predict(dev_data_select3[features_columns]).round(4)
prob_output = pd.DataFrame(predicted_probability)

# Step 2: attach the 'Num' column by index, order by it and show the first 20 rows
prob_with_num = prob_output.join(dev_data['Num'], how='left')
prob_with_num.sort_values(by='Num').head(20).reset_index()

# Compare probability values (Column = '0') with Altman Z Score

In [None]:
# Re-fit the final specification with scikit-learn and validate it on a hold-out set
from sklearn.model_selection import train_test_split

# Predictors and target taken from the third-attempt data
X = dev_data_select3[features_columns]
y = dev_data_select3[Target_val]

# Hold out 20% of the rows for testing; stratify on the target so both parts
# keep the same class proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Class balance of the test set: proportions first, then raw counts
for as_share in (True, False):
    print(y_test.value_counts(normalize=as_share))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)

# Create a logistic regression with default settings and fit it on the training split
model = LogisticRegression()
model_corporate = model.fit(X_train, y_train)

# Take the second predict_proba column for the test rows
# (presumably the probability of Default = 1 for a 0/1 target — confirm)
# and flag a row as 1 when that probability is above the 0.07 cut-off, else 0
default_probability = model_corporate.predict_proba(X_test)[:, 1]
predicted_corporate = np.where(default_probability > 0.07, 1, 0)

In [None]:
# Confusion matrix of the thresholded predictions (first argument = actual values,
# second argument = predicted values)
print(confusion_matrix(y_test, predicted_corporate))

# Classification report. scikit-learn expects (y_true, y_pred); passing the
# predictions first would swap the roles of precision and recall in the report.
print(classification_report(y_test, predicted_corporate))

# Accuracy of the same thresholded predictions used for the two outputs above.
# model_corporate.score(X_test, y_test) would instead score model.predict(), i.e.
# the model's default decision rule rather than the 0.07 cut-off, so its value
# would not match the confusion matrix and the report.
print("Accuracy:", accuracy_score(y_test, predicted_corporate))