In [462]:
import numpy as np
import pandas as pd
import sklearn as sk

from sklearn.model_selection import train_test_split

In [463]:
# import the generated csv file for Income Data 
income = pd.read_csv('Income Data File.csv' , index_col = 0)

### Pre-Processing

In [464]:
# For cleaning, need to split the columns that have more than two categories, into the # of categories as columns
Uni = pd.get_dummies(income['University'], drop_first = True)
Pro = pd.get_dummies(income['Program'], drop_first = True)
# Schol = pd.get_dummies(income['Scholarship'], drop_first = True)

In [465]:
# include the dummy variables into the dataframe
income['Uni1'] = Uni[1]
income['Uni2'] = Uni[2]
income['Uni3'] = Uni[3]
income['Uni4'] = Uni[4]
income['Uni5'] = Uni[5]

income['Pro1'] = Pro[1]
income['Pro2'] = Pro[2]
income['Pro3'] = Pro[3]
income['Pro4'] = Pro[4]

In [466]:
# remove the now redundant columns
income = income.drop(['University', 'Program', 'Criminal Record', 'Scholarship'], axis = 1)

In [467]:
# Create train-test split set

X = income.drop(['Income'], axis=1)
Y = income['Income']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

## We've used three different training models and are moving forward with KNearest Neighbours based on the results here. 

# Decision tree classifier 

In [468]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [469]:
# Evaluating the decision tree 
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

        high       0.99      0.99      0.99      2990
         low       0.47      0.30      0.37      1515
         mid       0.90      0.95      0.92     10495

    accuracy                           0.89     15000
   macro avg       0.79      0.75      0.76     15000
weighted avg       0.88      0.89      0.88     15000



In [470]:
print(confusion_matrix(y_test,predictions))

[[2975    0   15]
 [   0  461 1054]
 [  24  525 9946]]


## Random Forest Classifier

In [471]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [472]:
# Random Forest Classifier predictions
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

[[2975    0   15]
 [   0  461 1054]
 [  24  525 9946]]
              precision    recall  f1-score   support

        high       0.99      0.99      0.99      2990
         low       0.47      0.30      0.37      1515
         mid       0.90      0.95      0.92     10495

    accuracy                           0.89     15000
   macro avg       0.79      0.75      0.76     15000
weighted avg       0.88      0.89      0.88     15000



## K-Nearest Neighbours 

In [473]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [474]:
pred = knn.predict(X_test)

In [475]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))

[[2975    0   15]
 [   0  494 1021]
 [  24  512 9959]]


In [476]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

        high       0.99      0.99      0.99      2990
         low       0.49      0.33      0.39      1515
         mid       0.91      0.95      0.93     10495

    accuracy                           0.90     15000
   macro avg       0.80      0.76      0.77     15000
weighted avg       0.88      0.90      0.89     15000



In [477]:
# sample set
from random import choice

#t_1 = np.array([choice([0,1]) for i in range(10)]).reshape(1,-1)

t_1 = np.array([1, 0, 0, 0, 0, 1, 0, 0, 0]).reshape(1,-1)

pred = knn.predict(t_1)

In [478]:
# What we will be returning to the front-end
# if the value is low, send 45000 into the front-end
# if the value is mid, send 60000 into the front-end
# if the value is high, send 75000 into the front-end
if pred[0] == 'low':
    incomevalue = 45000
elif pred[0] == 'mid':
    incomevalue = 60000
else:
    incomevalue = 75000


In [479]:
incomevalue

45000