# 1. Imports and Functions

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd 
import numpy as np

In [2]:
def evaluation_by_race(X_test, y_test, y_predict):
    races_test = X_test[:, 1]
    
    y_test_black = []
    y_pred_black = []
    y_test_white = []
    y_pred_white = []

    # splitting up the y_test and y_pred values by race to then use for race specific classification reports
    for index, race in enumerate(races_test):
        if(race == 0):  # black
            y_test_black.append(y_test[index])
            y_pred_black.append(y_predict[index])
        elif(race == 1):  # white
            y_test_white.append(y_test[index])
            y_pred_white.append(y_predict[index])
        else:
            print('You should not end up here...')
            
    print('EVALUATION FOR BLACK GROUP')
    print(confusion_matrix(y_test_black, y_pred_black))
    print(classification_report(y_test_black, y_pred_black)) 
    
    print('EVALUATION FOR WHITE GROUP')
    print(confusion_matrix(y_test_white, y_pred_white))
    print(classification_report(y_test_white, y_pred_white)) 

# 2. Extract data from csv

In [3]:
data = pd.read_csv('/home/mackenzie/git_repositories/delayedimpact/data/simData_oom10.csv')
data[['score', 'race']] = data[['score', 'race']].astype(int)
print(data)

      score  repay_probability  race  repay_indices
0       610              78.90     1              1
1       568              47.77     0              0
2       750              98.13     1              1
3       775              98.45     1              1
4       704              95.88     1              1
...     ...                ...   ...            ...
9995    832              98.99     1              1
9996    416              10.91     1              0
9997    444              14.63     1              0
9998    778              98.47     1              1
9999    738              97.68     1              1

[10000 rows x 4 columns]


# 3. Make data into train/test form

In [4]:
x = data[['score', 'race']].values
y = data['repay_indices'].values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# do I need to scale the data??
# Standardize features by removing mean and scaling to unit variance:
#scaler = StandardScaler()
#scaler.fit(X_train)

#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test) 

# 4. Train+Test KNN classifier

In [5]:
# Ref: https://www.activestate.com/resources/quick-reads/how-to-classify-data-in-python/

# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [6]:
# Predict y data with classifier: 
y_predict = classifier.predict(X_test)

# Print results: 
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 634  215]
 [ 144 2007]]
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       849
           1       0.90      0.93      0.92      2151

    accuracy                           0.88      3000
   macro avg       0.86      0.84      0.85      3000
weighted avg       0.88      0.88      0.88      3000



In [7]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[198  35]
 [ 30  96]]
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       233
           1       0.73      0.76      0.75       126

    accuracy                           0.82       359
   macro avg       0.80      0.81      0.80       359
weighted avg       0.82      0.82      0.82       359

EVALUATION FOR WHITE GROUP
[[ 436  180]
 [ 114 1911]]
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       616
           1       0.91      0.94      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.85      0.83      0.84      2641
weighted avg       0.89      0.89      0.89      2641



# 5. Train+Test Gaussian Naive Bayes classifier

In [8]:
# Initialize classifier:
gnb = GaussianNB()

# Train the classifier:
model = gnb.fit(X_train, y_train)

In [9]:
# Make predictions with the classifier:
y_predict = gnb.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [10]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# 6. Train+Test Decision Tree Classifier

In [11]:
# Reference: https://www.datacamp.com/community/tutorials/decision-tree-classification-python

# Initialize classifier:
clf = DecisionTreeClassifier()

# Train the classifier:
clf = clf.fit(X_train,y_train)

In [12]:
# Make predictions with the classifier:
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [13]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# 7. Train+Test Logistic Regression 

In [14]:
# Reference: https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

# Instantiate classifier:
logisticRegr = LogisticRegression()

# Train the classifier:
logisticRegr.fit(X_train, y_train)

LogisticRegression()

In [15]:
# Make predictions with the classifier:
y_pred = logisticRegr.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [16]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# 8. Train+Test Support Vector Machines

Reference: https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

## 8.1 Linear Kernel

In [17]:
# Instantiate classifier:
clf = svm.SVC(kernel='linear')  # can try other kernels

#Train the model using the training sets
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [18]:
# Make predictions
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [19]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# Extra Notes

TODO: increase dataset oom even more
TODO: try other svm kernels

https://stackabuse.com/overview-of-classification-methods-in-python-with-scikit-learn