# Project 4 - Hackathon (Predicting Income under Team Features Constraint)

* Choose algorithm and choice of samples
* Limited to a maximum of 20 features

### This exercise is to build the best model possible under those constraints. 

### The task is to predict if a person's income is in excess of 50,000 dollars given certain profile information, and more specifically to generate predicted probabilities of income being above 50,000 dollars for each row in the test set. The output will be a .csv file with a single column of the probability with 'wage' as a header. The file is to be submitted by the end of the day.

### This section is the Modeling  portion. 

Fit the following models and evaluate the results:

- Logistic Regression
- KNN Classification


The data set is the './data/clean_train.csv' file.
 

In [24]:
# Import some important stuff
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



In [7]:
# This function was created by Magnus Bigelow
# Function to calculate and display classification metrics, works for bernoulli y
def class_metrics(model, X, y):
    """Print accuracy, sensitivity, specificity and precision for a binary model.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X : array-like
        Features handed to ``model.predict``.
    y : array-like
        True labels aligned with ``X``.

    Returns
    -------
    None
        The four metrics are printed, each rounded to 3 decimals. A metric
        whose denominator is zero (for example precision when nothing was
        predicted positive) is printed as ``nan`` instead of triggering a
        divide-by-zero warning.

    Notes
    -----
    The four-way unpacking below assumes a 2x2 confusion matrix, i.e. exactly
    two distinct labels across ``y`` and the predictions; anything else raises
    ``ValueError`` at the unpacking line.
    """
    def _ratio(numerator, denominator):
        # A zero denominator means the metric is undefined for this sample.
        return numerator / denominator if denominator else float('nan')

    # Generate predictions
    preds = model.predict(X)
    # Get confusion matrix and unravel
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel()
    # Accuracy: share of all rows classified correctly
    print(f'Accuracy: {round(_ratio(tp + tn, len(y)), 3)}')
    # Sensitivity: share of actual positives that were found
    print(f'Sensitivity: {round(_ratio(tp, tp + fn), 3)}')
    # Specificity: share of actual negatives that were found
    print(f'Specificity: {round(_ratio(tn, tn + fp), 3)}')
    # Precision: share of predicted positives that were correct
    print(f'Precision: {round(_ratio(tp, tp + fp), 3)}')

In [8]:
# This function was created by Magnus Bigelow
# Function to print train and test F1 score
def f1(model, X_train, y_train, X_test, y_test):
    """Print the F1 score of ``model`` on the training and the test split.

    ``model`` must expose ``predict``; the ``X_*`` arguments are the features
    and the ``y_*`` arguments the matching true labels. Nothing is returned:
    both scores are printed rounded to 3 decimals.
    """
    # Predict both splits first, then score, then print.
    train_pred, test_pred = (model.predict(split) for split in (X_train, X_test))
    train_f1 = f1_score(y_train, train_pred)
    test_f1 = f1_score(y_test, test_pred)
    print(f'Train F1: {round(train_f1,3)}')
    print(f'Test F1: {round(test_f1,3)}')

In [9]:
# Seed NumPy's global random generator so NumPy-based randomness in this
# notebook is repeatable between runs.
np.random.seed(33) 

In [10]:
# Load the cleaned training data; the path is relative to the notebook's
# working directory.
df_data_clean = pd.read_csv('./data/clean_train.csv')

In [11]:
df_data_clean.shape

(32561, 20)

In [12]:
df_data_clean.head().T

Unnamed: 0,0,1,2,3,4
age,39.0,50.0,38.0,53.0,28.0
education-num,13.0,13.0,9.0,7.0,13.0
sex,1.0,1.0,1.0,1.0,1.0
capital-gain,2174.0,0.0,0.0,0.0,0.0
capital-loss,0.0,0.0,0.0,0.0,0.0
hours-per-week,40.0,13.0,40.0,40.0,40.0
wage,0.0,0.0,0.0,0.0,0.0
marital_status_num,0.0,1.0,0.0,1.0,1.0
occupation_com_House_Services,0.0,0.0,1.0,1.0,0.0
occupation_com_Other,1.0,0.0,0.0,0.0,0.0


In [13]:
df_data_clean.columns

Index(['age', 'education-num', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'wage', 'marital_status_num',
       'occupation_com_House_Services', 'occupation_com_Other',
       'occupation_com_Professional', 'occupation_com_Specialty',
       'occupation_com_Tech/sales', 'workclass_com_ Government',
       'workclass_com_ Other', 'workclass_com_ Private',
       'workclass_com_ Self-employed', 'cap_gain_binary', 'cap_loss_binary',
       'gdp_pc'],
      dtype='object')

In [14]:
# Candidate features for the logistic-regression subset search: every column of
# df_data_clean except the target 'wage', the raw 'capital-gain' /
# 'capital-loss' amounts, and the 'occupation_com_Other' /
# 'workclass_com_ Other' dummies.
# NOTE(review): the name `list` shadows the Python builtin for the rest of the
# session. It is kept because the combination-building cell reads it by this
# name; rename both together if this is cleaned up.
list = ['age', 'education-num', 'sex', 'hours-per-week', 'marital_status_num',
       'occupation_com_House_Services',
       'occupation_com_Professional', 'occupation_com_Specialty',
       'occupation_com_Tech/sales', 'workclass_com_ Government', 'workclass_com_ Private',
       'workclass_com_ Self-employed', 'cap_gain_binary', 'cap_loss_binary', 'gdp_pc']

In [8]:
# Build every combination of two or more candidate features to try out in a
# Logistic Regression. Each row of df_data holds one combination (a list of
# column names) in its 'name' column.
from itertools import combinations

# NOTE: `list` is the notebook-level list of candidate feature names defined in
# an earlier cell. It shadows the builtin, so the builtin list() must not be
# called in this cell.
feature_names = list
n_features = len(feature_names)

# Enumerate index tuples for every subset size from 2 up to all features.
# Sorting the tuples lexicographically places each combination directly before
# its own extensions (pair, then its triples, then their quadruples, ...),
# which is a depth-first order over the feature positions.
index_combos = sorted(
    combo
    for size in range(2, n_features + 1)
    for combo in combinations(range(n_features), size)
)

# Build the frame in one step. DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every call.
df_data = pd.DataFrame(
    {'name': [[feature_names[i] for i in combo] for combo in index_combos]}
)

In [9]:
# Empty result table for the logistic-regression search: three score columns
# followed by one column per candidate feature.
score_columns = ['train_score', 'test_score', 'diff']
feature_columns = [
    'age',
    'education-num',
    'sex',
    'hours-per-week',
    'marital_status_num',
    'occupation_com_House_Services',
    'occupation_com_Professional',
    'occupation_com_Specialty',
    'occupation_com_Tech/sales',
    'workclass_com_ Government',
    'workclass_com_ Private',
    'workclass_com_ Self-employed',
    'cap_gain_binary',
    'cap_loss_binary',
    'gdp_pc',
]
df_results_LogReg = pd.DataFrame(columns=score_columns + feature_columns)

In [38]:
# This routine iterates through the list of feature combinations and records,
#     for each one, the training and testing score along with the coefficient
#     of every feature used.
y = df_data_clean['wage']
logreg = LogisticRegression(solver='liblinear')

# The row split depends only on the number of rows and random_state, so split
# the whole frame once and pick columns per combination instead of
# re-splitting identical rows on every iteration.
train_rows, test_rows, y_train, y_test = train_test_split(
    df_data_clean, y, test_size=.33, random_state=42)

result_rows = []
# .get() keeps this a no-op when df_data is empty and has no 'name' column.
for feature_subset in df_data.get('name', []):
    X_train = train_rows[feature_subset]
    X_test = test_rows[feature_subset]
    logreg.fit(X_train, y_train)

    # Score each split once and reuse the values for the rounded difference.
    train_score = logreg.score(X_train, y_train)
    test_score = logreg.score(X_test, y_test)
    row = {'train_score': round(train_score, 4),
           'test_score': round(test_score, 4),
           'diff': round(train_score - test_score, 4)}

    # One coefficient per fitted feature; features outside this subset are
    # left out of the row and therefore show up as NaN in the table.
    row.update(zip(X_train.columns, np.round(logreg.coef_[0], 4)))
    result_rows.append(row)

# Add all rows in one step. DataFrame.append was removed in pandas 2.0 and
# copied the whole table on every call.
new_results = pd.DataFrame(result_rows, columns=df_results_LogReg.columns)
if df_results_LogReg.empty:
    df_results_LogReg = new_results
else:
    # Keep rows already present, as appending did.
    df_results_LogReg = pd.concat([df_results_LogReg, new_results],
                                  ignore_index=True)



In [45]:
df_results_LogReg.sort_values(by='train_score', ascending = False).head()

Unnamed: 0,train_score,test_score,diff,age,education-num,sex,hours-per-week,marital_status_num,occupation_com_House_Services,occupation_com_Professional,occupation_com_Specialty,occupation_com_Tech/sales,workclass_com_ Government,workclass_com_ Private,workclass_com_ Self-employed,cap_gain_binary,cap_loss_binary,gdp_pc
885,0.8389,0.8418,-0.0029,0.0287,0.2758,-4.4562,0.0303,2.3694,,0.9178,,0.6414,0.2719,0.3346,,1.6280,1.0022,
4469,0.8389,0.8416,-0.0027,0.0278,0.2704,,0.0297,2.3695,-0.5775,0.9011,,0.6184,0.2779,0.3376,,1.6305,1.0055,
4877,0.8388,0.8414,-0.0026,0.0277,0.2738,,0.0292,2.3498,,1.0511,0.1847,0.7726,,0.0924,-0.2158,1.6267,1.0034,
924,0.8388,0.8408,-0.0020,0.0275,0.2733,-4.2884,0.0308,2.3797,,0.9421,,0.6703,,,-0.2733,1.6331,1.0098,
185,0.8388,0.8406,-0.0018,0.0273,0.2746,-4.2496,0.0310,2.3786,-0.6864,0.9059,,0.6373,,,-0.2602,1.6280,1.0418,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4981,0.7124,0.7245,-0.0121,0.0107,0.0039,,0.0071,0.0019,,0.0011,,0.0001,0.0002,-0.0007,,0.0006,0.0003,-0.0
4982,0.7124,0.7245,-0.0121,0.0107,0.0039,,0.0071,0.0019,,0.0011,,0.0001,0.0002,-0.0007,,0.0007,,-0.0
831,0.7124,0.7245,-0.0121,0.0107,0.0039,-0.0005,0.0071,0.0019,,0.0011,-0.0005,,0.0002,,0.0003,0.0007,,-0.0
981,0.7124,0.7245,-0.0121,0.0107,0.0039,-0.0005,0.0071,0.0019,,0.0011,,,,-0.0007,,0.0006,,-0.0


From the results, the best training score was 0.8389 with a testing score of 0.8418

The features used were age, education-num, sex, hours-per-week, marital_status_num, occupation_com_Professional, occupation_com_Tech/sales, workclass_com_ Government, workclass_com_ Private, cap_gain_binary, and cap_loss_binary

11 features found.

# Knn model

In [15]:
# Predictors for the KNN model: the same 15 candidate features used in the
# logistic-regression search. The target is the binary 'wage' column.
X = df_data_clean[['age', 'education-num', 'sex',
       'hours-per-week', 'marital_status_num',
       'occupation_com_House_Services',
       'occupation_com_Professional', 'occupation_com_Specialty',
       'occupation_com_Tech/sales', 'workclass_com_ Government', 'workclass_com_ Private',
       'workclass_com_ Self-employed', 'cap_gain_binary', 'cap_loss_binary',
       'gdp_pc']]

y = df_data_clean['wage'] 

In [16]:
# Hold out 33% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): this uses random_state=33 while the logistic-regression search
# uses random_state=42, so the two models are scored on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=33)

In [17]:
# Scale the features before KNN. The scaler is fitted on the training split
# only and then reused unchanged on the test split, so no test-set statistics
# enter the transformation.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [18]:
# Instantiate our model (use default parameters).
knn = KNeighborsClassifier()

In [19]:
# Fit our model to our data.
knn.fit(X_train_sc, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
# Get the training score
knn.score(X_train_sc, y_train)

0.8714187485674995

In [21]:
# Get the testing score
knn.score(X_test_sc, y_test)

0.8203052298529685

In [25]:
f1(knn, X_train_sc, y_train, X_test_sc, y_test)

Train F1: 0.72
Test F1: 0.607


In [26]:
class_metrics(knn, X_test_sc, y_test)

Accuracy: 0.82
Sensitivity: 0.575
Specificity: 0.898
Precision: 0.643


Comparing all of the models, we determined that the best model to use was the KNN classifier because it gave a similar score with the best specificity. We were looking for a well-balanced model.

The next section runs the KNN model through a grid of hyperparameters to find the best values.



# Knn Through GridSearch



In [38]:
# Hyperparameter grid for KNN: 4 neighbor counts x 3 values of `p`
# (the power of the Minkowski distance) = 12 candidate settings.
knn_params = {'n_neighbors' : [3, 5, 15, 25],
              'p':[1,2,3]}


In [39]:
# 5-fold cross-validated search over knn_params; verbose=1 prints progress.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# classifier's default score (accuracy, per the scikit-learn docs) rather than
# by the F1 score that f1() reports in the cells below. Confirm that is intended.
knn_gridsearch = GridSearchCV(KNeighborsClassifier(), 
                              knn_params, 
                              cv=5, 
                              verbose=1)

In [40]:
knn_gridsearch.fit(X_train_sc, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.8min finished


In [41]:
knn_gridsearch.best_params_

{'n_neighbors': 25, 'p': 1}

In [42]:
f1(knn_gridsearch, X_train_sc, y_train, X_test_sc, y_test)

Train F1: 0.663
Test F1: 0.635


In [43]:
class_metrics(knn_gridsearch, X_test_sc, y_test)

Accuracy: 0.837
Sensitivity: 0.589
Specificity: 0.916
Precision: 0.69


### We will set n_neighbors to 25 and p to 1 when setting up our testing data

code will be in the test_workbook