In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn.metrics import silhouette_score
from sklearn.model_selection import cross_val_score,StratifiedKFold, KFold, GridSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install category_encoders
import category_encoders as ce



In [None]:
# Colab-only step: open the upload dialog so the data file can be supplied
# interactively. Presumably this is how 'adult.csv' (read in the next cell)
# reaches the runtime -- confirm when running outside Colab.
from google.colab import files
files.upload()

{}

In [None]:
# Load the dataset from 'adult.csv' and preview its first rows.
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
def summerize_data(df):
    """Print a quick per-column summary of a DataFrame.

    For every column the column name is printed first, followed by
    ``value_counts()`` when the column has ``object`` dtype (treated as
    categorical) or ``describe()`` otherwise, and then a blank separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        All results are written to standard output.
    """
    for column in df.columns:
        print(column)
        # Compare against the builtin ``object``: the ``np.object`` alias was
        # deprecated in NumPy 1.20 and removed in 1.24, where it raises
        # AttributeError.
        if df.dtypes[column] == object:  # Categorical data
            print(df[column].value_counts())
        else:
            print(df[column].describe())

        print('\n')
    
# Print the per-column summary of the loaded frame.
summerize_data(df)

age
count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64


workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64


fnlwgt
count    3.256100e+04
mean     1.897784e+05
std      1.055500e+05
min      1.228500e+04
25%      1.178270e+05
50%      1.783560e+05
75%      2.370510e+05
max      1.484705e+06
Name: fnlwgt, dtype: float64


education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6t

In [None]:
# The summary above shows '?' used as a category value (e.g. 1836 rows of
# workclass); convert it to NaN so it is handled as a missing value.
# Note: this mutates `df` in place.
df.replace('?',np.nan,inplace = True)

In [None]:
# Preview the frame again after the '?' -> NaN replacement.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
# Boolean mask over the columns: True where the column has object dtype.
is_object_column = df.dtypes == object
categorical_features = df.columns[is_object_column]

# Summary (count / unique / top / freq) of the object-dtype columns only.
df.loc[:, categorical_features].describe()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
count,30725,32561,32561,30718,32561,32561,32561,31978,32561
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [None]:
# Feature matrix: everything except the target and two columns left out of
# the model. NOTE(review): 'education' looks redundant with 'education.num'
# in the previewed rows -- confirm that is the reason it is dropped.
excluded_columns = ['fnlwgt', 'education', 'income']
X = df.drop(columns=excluded_columns)

# Binary target: 1 where income is '>50K', 0 otherwise.
earns_over_50k = df['income'] == '>50K'
y = np.where(earns_over_50k, 1, 0)

In [None]:
# Split into train and test sets. No test_size is passed, so the library
# default applies; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 10)

In [None]:
# Preview the training features (the original row index is preserved).
X_train.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
29849,20,Private,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,10,United-States
30319,20,Private,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,55,United-States
1878,47,State-gov,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,0,50,United-States
20686,50,Self-emp-not-inc,10,Divorced,Exec-managerial,Unmarried,Asian-Pac-Islander,Female,0,0,40,
26007,39,Private,10,Never-married,Machine-op-inspct,Other-relative,Asian-Pac-Islander,Male,0,0,40,Philippines


In [None]:
# Column-name lists, grouped by the encoding applied to them.
onehot_encoder = ['relationship', 'race', 'sex']
binary_encoder = ['workclass', 'marital.status', 'occupation', 'native.country']

# Binary-encoded columns: fill missing values with the constant category 'NC'
# first, then binary-encode the resulting categories.
binary_encoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NC')),
    ('binary encoder', ce.BinaryEncoder()),
])

# handle_unknown='ignore': a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising at transform time.
# remainder='passthrough' forwards every column not listed above unchanged.
# NOTE(review): the passed-through numeric columns are not scaled; consider
# scaling them before distance-based or linear models.
col_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_encoder),
    ('binary encoder', binary_encoder_pipeline, binary_encoder),
], remainder='passthrough')


In [None]:
# Learn the encodings from the training split only, then apply the same
# fitted transformer to the test split.
X_train_prep = col_transformer.fit_transform(X_train)
X_test_prep = col_transformer.transform(X_test)

In [None]:
# Baseline classifiers with library-default hyper-parameters.
lr = LogisticRegression()
knn_c = KNeighborsClassifier()
# The forest is stochastic (bootstrap samples, random feature subsets); seed
# it so scores are reproducible across runs. 10 matches the seed used for the
# train/test split.
rf_c = RandomForestClassifier(random_state=10)

In [None]:
# Fit every model on the TRAINING split. Fitting on the test split and then
# scoring on that same data leaks the labels and inflates every metric.
lr.fit(X_train_prep, y_train)
knn_c.fit(X_train_prep, y_train)
rf_c.fit(X_train_prep, y_train)

# Predict on the held-out test split, which the models have not seen.
y_pred_lr = lr.predict(X_test_prep)
y_pred_knn = knn_c.predict(X_test_prep)
y_pred_rf_c = rf_c.predict(X_test_prep)

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions
# against the test labels.
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88      6182
           1       0.67      0.45      0.54      1959

    accuracy                           0.81      8141
   macro avg       0.76      0.69      0.71      8141
weighted avg       0.80      0.81      0.80      8141



In [None]:
# Per-class precision / recall / F1 of the k-nearest-neighbours predictions
# against the test labels.
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      6182
           1       0.79      0.71      0.75      1959

    accuracy                           0.89      8141
   macro avg       0.85      0.82      0.84      8141
weighted avg       0.88      0.89      0.88      8141



In [None]:
# Per-class precision / recall / F1 of the random-forest predictions
# against the test labels.
print(classification_report(y_test,y_pred_rf_c))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6182
           1       0.98      0.96      0.97      1959

    accuracy                           0.99      8141
   macro avg       0.99      0.98      0.98      8141
weighted avg       0.99      0.99      0.99      8141



Cross-validation

In [None]:
# Shuffled, stratified 5-fold splitter with a fixed seed, shared by all three
# models.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

# Mean cross-validated score on the training split, one value per model
# (evaluated in the order lr, knn_c, rf_c).
lr_cv, knn_cv, rf_cv = [
    cross_val_score(candidate, X_train_prep, y_train, cv=folds).mean()
    for candidate in (lr, knn_c, rf_c)
]

In [None]:
# Report the mean cross-validation score of each model, one line per model.
cv_results = (
    ('Logistitic Regression Cross Val :', lr_cv),
    ('Knn Cross Val :', knn_cv),
    ('Random Forrest Cross Val :', rf_cv),
)
for label, mean_score in cv_results:
    print(label, mean_score)

Logistitic Regression Cross Val : 0.8314905814905815
Knn Cross Val : 0.8447993447993447
Random Forrest Cross Val : 0.8533169533169532


In [None]:
# Soft-voting ensemble over the three base models. The step names ('lr',
# 'rf', 'knn') are the prefixes used to address each member's
# hyper-parameters, e.g. 'lr__C'.
ensemble_members = [('lr', lr), ('rf', rf_c), ('knn', knn_c)]
eclf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Hyper-parameter grid for the ensemble, keyed as '<member name>__<parameter>':
# two values of C for the 'lr' member and n_neighbors from 2 to 9 for 'knn'.
params = dict(
    lr__C=[1.0, 100.0],
    knn__n_neighbors=range(2, 10),
)

In [None]:
# Exhaustive search over `params` for the voting ensemble, scored with 2-fold
# cross-validation on the training split.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
grid.fit(X_train_prep,y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=VotingClassifier(estimators=[('lr',
                                                     LogisticRegression(C=1.0,
                                                                        class_weight=None,
                                                                        dual=False,
                                                                        fit_intercept=True,
                                                                        intercept_scaling=1,
                                                                        l1_ratio=None,
                                                                        max_iter=100,
                                                                        multi_class='auto',
                                                                        n_jobs=None,
                                                                        penalty='l2',
                              

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

{'knn__n_neighbors': 8, 'lr__C': 100.0}

In [None]:
# GridSearchCV is created with its default refit=True, so after grid.fit(...)
# `best_estimator_` is already refitted on the full training split with the
# winning parameters. Calling .fit on it again with the same data would only
# repeat that training pass.
best_cv = grid.best_estimator_

In [None]:
# Predict the test split with the tuned ensemble.
y_pred_grid = best_cv.predict(X_test_prep)

In [None]:
# Overall accuracy of the tuned ensemble on the test labels.
print(accuracy_score(y_test,y_pred_grid))

0.8591082176636777


In [None]:
# Per-class precision / recall / F1 of the tuned ensemble on the test labels.
print(classification_report(y_test,y_pred_grid))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      6182
           1       0.76      0.61      0.67      1959

    accuracy                           0.86      8141
   macro avg       0.82      0.77      0.79      8141
weighted avg       0.85      0.86      0.85      8141

