# 1. Imports and Functions

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
import pandas as pd 
import numpy as np

In [2]:
def evaluation_by_race(X_test, y_test, y_predict, race_column=1):
    """Print a confusion matrix and classification report for each race group.

    The test samples are partitioned by the race code stored in column
    ``race_column`` of ``X_test`` (0 = black, 1 = white). The true and
    predicted labels of each group are then evaluated separately.

    Parameters
    ----------
    X_test : array-like of shape (n_samples, n_features)
        Test features. The race column must still contain the raw 0/1 codes
        (i.e. it must not have been standardized).
    y_test : array-like of shape (n_samples,)
        Ground-truth labels, aligned row-by-row with ``X_test``.
    y_predict : array-like of shape (n_samples,)
        Predicted labels, aligned row-by-row with ``X_test``.
    race_column : int, default=1
        Index of the column of ``X_test`` that holds the race code.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    races_test = np.asarray(X_test)[:, race_column]
    y_test = np.asarray(y_test)
    y_predict = np.asarray(y_predict)

    # (display name, race code) pairs, reported in this order.
    groups = (('BLACK', 0), ('WHITE', 1))

    # Rows whose race code matches no known group are excluded from both reports.
    unexpected = ~np.isin(races_test, [code for _, code in groups])
    if unexpected.any():
        print('You should not end up here... '
              '({} samples with an unexpected race code were skipped)'.format(int(unexpected.sum())))

    for label, code in groups:
        in_group = races_test == code
        print('EVALUATION FOR {} GROUP'.format(label))

        # The sklearn metrics cannot be computed on empty input, so skip the group.
        if not in_group.any():
            print('No test samples in this group.')
            continue

        y_true_group = y_test[in_group]
        y_pred_group = y_predict[in_group]
        print(confusion_matrix(y_true_group, y_pred_group))
        # zero_division=0 reports 0.0 for a class that is never predicted
        # (the same value as the default) without emitting a warning per metric.
        print(classification_report(y_true_group, y_pred_group, zero_division=0))

# 2. Extract data from csv

In [3]:
# Load the simulated data set and make sure the score and race columns are integers.
# NOTE(review): this is an absolute path into a user's home directory; it must be
# adjusted when the notebook is run from another machine or checkout.
csv_path = '/home/mackenzie/git_repositories/delayedimpact/data/simData_oom2.csv'
int_columns = ['score', 'race']

data = pd.read_csv(csv_path)
data[int_columns] = data[int_columns].astype(int)
print(data)

      score  repay_probability  race  repay_indices
0       479              20.74     1              1
1       722              96.99     1              1
2       713              96.44     1              1
3       811              98.85     1              1
4       396               8.55     1              0
...     ...                ...   ...            ...
1995    524              34.32     1              1
1996    777              98.46     1              1
1997    719              96.80     1              1
1998    767              98.39     1              1
1999    560              42.45     0              1

[2000 rows x 4 columns]


# 3. Split data into train/test sets

In [4]:
# Features are (score, race); the target is the 0/1 repayment indicator.
feature_columns = ['score', 'race']
target_column = 'repay_indices'

x = data[feature_columns].values
y = data[target_column].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Open question: should the features be standardized (zero mean, unit variance)?
# NOTE: evaluation_by_race compares X_test[:, 1] against the raw race codes 0 and 1,
# so enabling the scaling below as written would break the per-race evaluation.
#scaler = StandardScaler()
#scaler.fit(X_train)

#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)

# 4. Train+Test KNN classifier

In [5]:
# Ref: https://www.activestate.com/resources/quick-reads/how-to-classify-data-in-python/

# Build a k-nearest-neighbours classifier and fit it on the training split.
n_neighbors = 5
classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [6]:
# Label the held-out samples with the fitted KNN classifier.
y_predict = classifier.predict(X_test)

# Overall performance on the whole test set (all race groups combined).
knn_confusion = confusion_matrix(y_test, y_predict)
knn_report = classification_report(y_test, y_predict)
print(knn_confusion)
print(knn_report)

[[115  48]
 [ 36 401]]
              precision    recall  f1-score   support

           0       0.76      0.71      0.73       163
           1       0.89      0.92      0.91       437

    accuracy                           0.86       600
   macro avg       0.83      0.81      0.82       600
weighted avg       0.86      0.86      0.86       600



In [7]:
# Break the KNN results down by race group (y_predict holds the KNN predictions here).
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[32  8]
 [ 7 21]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81        40
           1       0.72      0.75      0.74        28

    accuracy                           0.78        68
   macro avg       0.77      0.78      0.77        68
weighted avg       0.78      0.78      0.78        68

EVALUATION FOR WHITE GROUP
[[ 83  40]
 [ 29 380]]
              precision    recall  f1-score   support

           0       0.74      0.67      0.71       123
           1       0.90      0.93      0.92       409

    accuracy                           0.87       532
   macro avg       0.82      0.80      0.81       532
weighted avg       0.87      0.87      0.87       532



# 5. Train+Test Gaussian Naive Bayes classifier

In [8]:
# Initialize classifier:
# Create a Gaussian Naive Bayes classifier and fit it on the training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` is simply another name for `gnb`.
model = gnb

In [9]:
# Make predictions with the classifier:
# Label the held-out samples with the fitted Naive Bayes classifier.
# NOTE: this overwrites the KNN predictions previously stored in y_predict.
y_predict = gnb.predict(X_test)

# Overall performance on the whole test set (all race groups combined).
gnb_confusion = confusion_matrix(y_test, y_predict)
gnb_report = classification_report(y_test, y_predict)
print(gnb_confusion)
print(gnb_report)

[[116  47]
 [ 40 397]]
              precision    recall  f1-score   support

           0       0.74      0.71      0.73       163
           1       0.89      0.91      0.90       437

    accuracy                           0.85       600
   macro avg       0.82      0.81      0.81       600
weighted avg       0.85      0.85      0.85       600



In [11]:
# Break the Naive Bayes results down by race group; y_predict was overwritten with
# the GNB predictions in the previous cell, so this no longer evaluates the KNN model.
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[40  0]
 [28  0]]
              precision    recall  f1-score   support

           0       0.59      1.00      0.74        40
           1       0.00      0.00      0.00        28

    accuracy                           0.59        68
   macro avg       0.29      0.50      0.37        68
weighted avg       0.35      0.59      0.44        68

EVALUATION FOR WHITE GROUP
[[ 76  47]
 [ 12 397]]
              precision    recall  f1-score   support

           0       0.86      0.62      0.72       123
           1       0.89      0.97      0.93       409

    accuracy                           0.89       532
   macro avg       0.88      0.79      0.83       532
weighted avg       0.89      0.89      0.88       532



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 6. TODO: Train and test other classifier models
Try a decision tree and logistic regression as well; see:
https://stackabuse.com/overview-of-classification-methods-in-python-with-scikit-learn