In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the adult income CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/adult-income-dataset/adult.csv')
df.head()

# EDA

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Summary of columns, non-null counts and dtypes.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Columns treated as numeric features; every other column is handled as categorical.
numerical = [
    'age',
    'fnlwgt',
    'educational-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Remaining column labels, in their original order.
categorical = df.columns[~df.columns.isin(numerical)]

In [None]:
# Global plot styling: white grid, "deep" palette, slightly larger fonts, 8x5 default figure size.
sns.set(
    style='whitegrid',
    palette='deep',
    font_scale=1.1,
    rc={'figure.figsize': [8, 5]},
)

In [None]:
# One histogram with a KDE overlay per numeric column, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

for column, axis in zip(numerical, ax.ravel()):
    sns.histplot(df[column], kde=True, ax=axis)

In [None]:
# One count plot per categorical column, split by income, laid out on a 3x3 grid.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(25, 25))

for column, axis in zip(categorical, ax.ravel()):
    sns.countplot(data=df, x=column, hue='income', ax=axis)
    # Turn the x tick labels vertical.
    plt.setp(axis.get_xticklabels(), rotation=90)

# Cleanup the Data 

A few columns contain '?' as a value.

In [None]:
# Replace the '?' placeholder in every column with the label 'otheres'.
# This is done as a single frame-level replace followed by re-assignment.
# Calling df[col].replace(..., inplace=True) mutates an intermediate Series:
# that is deprecated chained assignment and leaves df unchanged when pandas
# Copy-on-Write is active.
df = df.replace('?', 'otheres')

# Scaling and Encoding

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Single MinMaxScaler instance; each fit_transform call further down re-fits it.
scaler = MinMaxScaler()

In [None]:
# Work on a copy so the encoded/scaled frame (df1) stays separate from df.
df1 = df.copy()

*One hot encoding for following columns:*
* gender (Gender count has some weightage for both male (1) and female(0))
* race (Races Black (1) and White (1) have considerable count, rest have very low count (0))

*Marital status, Occupation, Relationship, Education and Workclass have considerable population count across all categories*
* marital status 
* occupation 
* relationship
* education
* workclass

*For country, we use count (frequency) encoding: each country is replaced by the number of rows in which it appears, and those counts are then Min-Max scaled*


*For Numerical Data, we will use MinMax scaling*

**Categorical Data Encoding**

In [None]:
def income_one_hot(col):
    """Binary-encode an income label.

    Returns 1 when ``col`` equals '<=50K' and 0 for any other value.
    """
    return 1 if col == '<=50K' else 0
    
# Target column: 1 for rows whose income is '<=50K', 0 otherwise.
df1['if_<=50K'] = df['income'].map(income_one_hot)

In [None]:
def Encoding(col):
    """Indicator for the 'White' race label.

    Returns 1 when ``col`` equals 'White' and 0 for any other value.
    """
    return 1 if col == 'White' else 0
    
def Encoding_black(col):
    """Indicator for the 'Black' race label.

    Returns 1 when ``col`` equals 'Black' and 0 for any other value.
    """
    return 1 if col == 'Black' else 0
    
# Reduce race to two indicator columns (White, Black), then remove the raw column.
df1['if_white'] = df['race'].map(Encoding)
df1['is_black'] = df['race'].map(Encoding_black)
df1.drop(columns=['race'], inplace=True)

In [None]:
# One-hot encode gender and keep a single indicator column (first category dropped).
# The indicator is extracted explicitly as a Series instead of assigning the whole
# dummy DataFrame to one column. The check fails with a clear message if gender
# turns out not to be binary.
gender_dummies = pd.get_dummies(df['gender'], drop_first=True)
assert gender_dummies.shape[1] == 1, (
    f"expected exactly two gender categories, got dummy columns {list(gender_dummies.columns)}"
)
df1['gender'] = gender_dummies.iloc[:, 0]

In [None]:
# Append the marital-status dummies (first category dropped) and remove the source column.
marital_dummies = pd.get_dummies(df['marital-status'], drop_first=True)
df1 = df1.join(marital_dummies)
df1.drop(columns='marital-status', inplace=True)

In [None]:
# One-hot encode occupation (first category dropped as the baseline) and swap the
# dummy columns in for the original column. get_dummies is called once: a second
# get_dummies around an already-encoded frame changes nothing.
occupation_dummies = pd.get_dummies(df['occupation'], drop_first=True)
df1 = df1.join(occupation_dummies).drop(columns='occupation')

In [None]:
# One-hot encode relationship (first category dropped as the baseline) and swap the
# dummy columns in for the original column. get_dummies is called once: a second
# get_dummies around an already-encoded frame changes nothing.
relationship_dummies = pd.get_dummies(df['relationship'], drop_first=True)
df1 = df1.join(relationship_dummies).drop(columns='relationship')

In [None]:
# One-hot encode education (first category dropped as the baseline) and swap the
# dummy columns in for the original column. get_dummies is called once: a second
# get_dummies around an already-encoded frame changes nothing.
education_dummies = pd.get_dummies(df['education'], drop_first=True)
df1 = df1.join(education_dummies).drop(columns='education')

In [None]:
# One-hot encode workclass (first category dropped as the baseline) and swap the
# dummy columns in for the original column. The prefix keeps these columns distinct
# from the other dummy sets; get_dummies adds its own '_' separator after it, so
# the resulting names look like 'workclass__<value>'.
# get_dummies is called once: a second get_dummies around an already-encoded
# frame changes nothing.
workclass_dummies = pd.get_dummies(df['workclass'], prefix='workclass_', drop_first=True)
df1 = df1.join(workclass_dummies).drop(columns='workclass')

In [None]:
# The target is already stored in 'if_<=50K', so the raw income column is removed.
df1.drop(columns='income', inplace=True)

To scale the country column, we first map each country to the number of rows in which it appears in the dataset (count encoding)

In [None]:
# Count encoding: look up how many rows each country has, then substitute that
# count for the country label in df1.
country_counts = df['native-country'].value_counts()
count_incoding_list = dict(country_counts)
df1['native-country'] = df1['native-country'].map(count_incoding_list)

In [None]:
# Min-max scale the count-encoded country column.
# NOTE(review): the scaler is fit on all rows, before the train/test split done later in this notebook.
df1[['native-country']] = scaler.fit_transform(df1[['native-country']])

In [None]:
# Min-max scale the numeric feature columns; this re-fits the shared scaler on those columns.
# NOTE(review): the scaler is fit on all rows, before the train/test split done later in this notebook.
df1[numerical] = scaler.fit_transform(df1[numerical])

In [None]:
# Inspect the dtype of every column after encoding and scaling.
df1.dtypes

We have encoded all the columns

# Model Preparation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import neighbors

In [None]:
# Target is the '<=50K' indicator; features are every other column of df1.
y = df1['if_<=50K']
x = df1.drop(columns='if_<=50K')

In [None]:
# 80/20 train/test split. random_state is fixed so that the split, and with it the
# accuracies and the k chosen from them, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

**Hyperparameter tuning**

In [None]:
# Fit a k-NN classifier for each odd k from 1 to 13 and record train/test accuracy.
# Each score is computed once and the stored value is reused for printing:
# scoring a k-NN model runs a neighbor search for every row, so calling score()
# a second time just to print doubles the cost of the sweep.
acc_train = {}
acc_test = {}
for i in range(1, 15, 2):
    classification_model = neighbors.KNeighborsClassifier(n_neighbors=i)
    classification_model.fit(x_train, y_train)
    acc_train[i] = classification_model.score(x_train, y_train)
    acc_test[i] = classification_model.score(x_test, y_test)

    print('\nk = ', i)
    print('train acc = ', acc_train[i])
    print('test acc  = ', acc_test[i])

*Plotting training and test accuracy against k (a view of the bias–variance trade-off)*

In [None]:
# Train vs. test accuracy as a function of k. Labels, title and legend are set so
# the figure can be read on its own; dict views are converted to lists before plotting.
plt.plot(list(acc_train.keys()), list(acc_train.values()), marker='o', label='train accuracy')
plt.plot(list(acc_test.keys()), list(acc_test.values()), marker='o', label='test accuracy')
plt.xlabel('k (number of neighbors)')
plt.ylabel('accuracy')
plt.title('k-NN accuracy vs. k')
plt.legend()

plt.show()

From the above calculation and plot, the training and test accuracy curves converge around k = 13.

So, we can use K = 13 for further parameter tuning

In [None]:
# Compare distance metrics at the chosen k, recording train/test accuracy per metric.
k = 13

metric_acc_training = {}
metric_acc_testing = {}

# Note: with scikit-learn's default p=2, 'minkowski' is the same distance as
# 'euclidean', so those two entries are expected to give identical scores.
dist_calc = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

for calculation in dist_calc:
    classification_model = neighbors.KNeighborsClassifier(n_neighbors=k, metric=calculation)
    classification_model.fit(x_train, y_train)
    # Score once per split and reuse the stored value when printing; each score()
    # call on a k-NN model repeats the full neighbor search.
    metric_acc_training[calculation] = classification_model.score(x_train, y_train)
    metric_acc_testing[calculation] = classification_model.score(x_test, y_test)

    print('\nCalculation : ', calculation)
    print('Training accuracy :  ', metric_acc_training[calculation])
    print('Testing accuracy : ', metric_acc_testing[calculation])
    

There is not much difference in accuracy between the distance metrics (Minkowski with the default p = 2 is the same as Euclidean), so let's go with Euclidean.

Final Parameters:

k = 13

Distance = Euclidean
