In [1]:
# Nataly Valenzuela
import pandas as pd
import numpy as np
from sklearn.preprocessing import (MinMaxScaler, StandardScaler)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.stats import mode
from sklearn.model_selection import train_test_split

## KNN Classifier

### Research question 

Is KNN a good model for predicting whether someone will want more kids based on their age, ethnicity, work hours, and the genders of their first two children?

### Data pre-processing 

In [12]:
# Load the AER "Fertility" data set from the Rdatasets mirror.
# The first CSV column only carries R row names. It is consumed by position
# (index_col=0) and then discarded, instead of being dropped by the header
# 'Unnamed: 0', so the cell keeps working if that column is given a real name.
# NOTE(review): newer Rdatasets exports appear to label it 'rownames' - confirm.
kids = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Fertility.csv",
    index_col=0,
).reset_index(drop=True)

# Encode the yes/no columns as 1/0.
yes_no_cols = ['morekids', 'afam', 'hispanic', 'other']
kids[yes_no_cols] = kids[yes_no_cols].eq('yes').mul(1)

# Encode the sex of the first two children as 1 = male, 0 = anything else.
sex_cols = ['gender1', 'gender2']
kids[sex_cols] = kids[sex_cols].eq('male').mul(1)

# Check for null values
kids.isnull().sum()

morekids    0
gender1     0
gender2     0
age         0
afam        0
hispanic    0
other       0
work        0
dtype: int64

In [13]:
def gender(x):
    """Label a family by the sexes of its first two children.

    Parameters
    ----------
    x : row-like
        A row exposing ``gender1`` and ``gender2`` entries encoded as
        1 (male) / 0 (female), e.g. a row passed by ``DataFrame.apply(axis=1)``.
        Inputs without those labels (lists, tuples, arrays) fall back to
        positions 1 and 2, which is what the original implementation read.

    Returns
    -------
    str or None
        ``"MM"`` for two boys, ``"FF"`` for two girls, ``"MF"`` for one of
        each (in either birth order), and ``None`` when either value is
        neither 0 nor 1.
    """
    try:
        # Look the columns up by name: positional integer keys on a
        # string-labelled Series are deprecated in pandas and would silently
        # read the wrong fields if the column order ever changed.
        first, second = x['gender1'], x['gender2']
    except (KeyError, TypeError, IndexError):
        first, second = x[1], x[2]

    if first not in (0, 1) or second not in (0, 1):
        return None
    if first != second:
        return "MF"
    return "MM" if first == 1 else "FF"
# Label every row with the sex combination of the first two children
# ("MM", "MF" or "FF") and store it as a new column.
gender_labels = kids.apply(gender, axis=1)
kids['genders'] = gender_labels
kids

Unnamed: 0,morekids,gender1,gender2,age,afam,hispanic,other,work,genders
0,0,1,0,27,0,0,0,0,MF
1,0,0,1,30,0,0,0,30,MF
2,0,1,0,27,0,0,0,0,MF
3,0,1,0,35,1,0,0,0,MF
4,0,0,0,30,0,0,0,22,FF
...,...,...,...,...,...,...,...,...,...
254649,1,0,0,35,0,0,0,0,FF
254650,1,1,1,29,0,0,0,0,MM
254651,1,0,1,34,0,0,0,38,MF
254652,1,0,0,30,0,0,0,26,FF


In [14]:
# Min-max scale the continuous columns to the [0, 1] range.
cont = ['age', 'work']
scaler = MinMaxScaler()
norm_kids = kids.copy()
# NOTE(review): the scaler is fit on every row before the train/validation/test
# split further down, so held-out rows contribute to the min/max. Consider
# fitting on the training split only.
norm_kids[cont] = scaler.fit_transform(norm_kids[cont])

# One-hot encode the sex combination; all three indicator columns are kept
# (drop_first is not used). The dtype is pinned because pandas >= 2.0 returns
# bool indicators by default, which would drop these columns from describe()
# and no longer display as 0/1.
norm_kids = pd.get_dummies(norm_kids, columns=['genders'], dtype='uint8')

# The raw per-child columns are now redundant with the indicators.
norm_kids = norm_kids.drop(columns=['gender1', 'gender2'])
norm_kids

Unnamed: 0,morekids,age,afam,hispanic,other,work,genders_FF,genders_MF,genders_MM
0,0,0.428571,0,0,0,0.000000,0,1,0
1,0,0.642857,0,0,0,0.576923,0,1,0
2,0,0.428571,0,0,0,0.000000,0,1,0
3,0,1.000000,1,0,0,0.000000,0,1,0
4,0,0.642857,0,0,0,0.423077,1,0,0
...,...,...,...,...,...,...,...,...,...
254649,1,1.000000,0,0,0,0.000000,1,0,0
254650,1,0.571429,0,0,0,0.000000,0,0,1
254651,1,0.928571,0,0,0,0.730769,0,1,0
254652,1,0.642857,0,0,0,0.500000,1,0,0


In [15]:
# Summary statistics of the prepared frame; a quick check that the min-max
# scaled columns span 0 to 1.
norm_kids.describe()

Unnamed: 0,morekids,age,afam,hispanic,other,work,genders_FF,genders_MF,genders_MM
count,254654.0,254654.0,254654.0,254654.0,254654.0,254654.0,254654.0,254654.0,254654.0
mean,0.380563,0.670948,0.051662,0.074207,0.056343,0.365737,0.239329,0.494432,0.26624
std,0.485526,0.241889,0.221345,0.262107,0.230584,0.420525,0.426675,0.49997,0.441992
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.714286,0.0,0.0,0.0,0.096154,0.0,0.0,0.0
75%,1.0,0.857143,0.0,0.0,0.0,0.846154,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Data splitting 

In [16]:
# Features are every column except the target; the target is "morekids".
Xk = norm_kids.drop(columns=["morekids"])
yk = norm_kids["morekids"]

# First hold out 30% of the rows, then split that remainder 20/80 into
# validation and test: roughly 70% train, 6% validation, 24% test overall.
Xk_train, Xk_rem, yk_train, yk_rem = train_test_split(Xk, yk, test_size=0.30, random_state=42)
Xk_val, Xk_test, yk_val, yk_test = train_test_split(Xk_rem, yk_rem, test_size=0.8, random_state=42)

# Report each split under its own name (the validation shapes were previously
# printed with the "train" label).
for split_name, split_data in [
    ("Xk_train", Xk_train),
    ("Xk_val", Xk_val),
    ("Xk_test", Xk_test),
    ("yk_train", yk_train),
    ("yk_val", yk_val),
    ("yk_test", yk_test),
]:
    print(f"{split_name} shape: ", split_data.shape)

Xk_train shape:  (178257, 8)
Xk_train shape:  (15279, 8)
Xk_test shape:  (61118, 8)
yk_train shape:  (178257,)
yk_train shape:  (15279,)
yk_test shape:  (61118,)


###	Model construction 

In [7]:
# Baseline: KNN with default hyperparameters, scored on the training and
# validation splits.
KNNk = KNeighborsClassifier().fit(Xk_train, yk_train)
yk_train_pred = KNNk.predict(Xk_train)
yk_val_pred = KNNk.predict(Xk_val)
for actual, predicted in ((yk_train, yk_train_pred), (yk_val, yk_val_pred)):
    print(metrics.accuracy_score(actual, predicted))

0.6033760245039466
0.5915308593494338


### Hyperparameter tuning (choose whatever approach you like)

In [8]:
# Search k = 1..10 with 3-fold cross-validation on the training split.
k_candidates = np.arange(1, 11)
param_grid = {'n_neighbors': k_candidates}
grid_search = GridSearchCV(estimator=KNNk, param_grid=param_grid, cv=3).fit(Xk_train, yk_train)
print(grid_search.best_estimator_)
grid_search.best_score_

KNeighborsClassifier(n_neighbors=10)


0.6123069500776968

In [9]:
# Search k = 11..20 with 3-fold cross-validation on the training split.
k_candidates = np.arange(11, 21)
param_grid = {'n_neighbors': k_candidates}
grid_search = GridSearchCV(estimator=KNNk, param_grid=param_grid, cv=3).fit(Xk_train, yk_train)
print(grid_search.best_estimator_)
grid_search.best_score_

KNeighborsClassifier(n_neighbors=20)


0.6192968579074033

In [10]:
# Search k = 21..51 with 3-fold cross-validation on the training split.
k_candidates = np.arange(21, 52)
param_grid = {'n_neighbors': k_candidates}
grid_search = GridSearchCV(estimator=KNNk, param_grid=param_grid, cv=3).fit(Xk_train, yk_train)
print(grid_search.best_estimator_)
grid_search.best_score_

KNeighborsClassifier(n_neighbors=42)


0.6257257779498141

### Best Model and Evaluation. 

In [11]:
# Refit with k = 42 (the value selected by the last grid search) and report
# accuracy on the training split and on the held-out test split.
BEST_K = 42
KNNk = KNeighborsClassifier(n_neighbors=BEST_K).fit(Xk_train, yk_train)
yk_train_pred = KNNk.predict(Xk_train)
yk_test_pred = KNNk.predict(Xk_test)
for actual, predicted in ((yk_train, yk_train_pred), (yk_test, yk_test_pred)):
    print(metrics.accuracy_score(actual, predicted))

0.632261285671811
0.6234169966294708


### Discussion of overfitting for the model

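There does not appear to be overfitting: the training accuracy (0.632) and the test accuracy (0.623) differ by less than one percentage point. The model is performing slightly worse than Random Forest and Decision Trees. Note that about 62% of the rows have `morekids = 0` (the mean of `morekids` is 0.381), so an accuracy of about 0.62 is close to what always predicting the majority class would achieve.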