# Predict whether income exceeds $50K/yr

use the dataset in [this](https://archive.ics.uci.edu/ml/datasets/Adult) link to predict whether income exceeds $50k/yr

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
from sklearn.naive_bayes import GaussianNB
from mlxtend.plotting import plot_decision_regions
import scipy.cluster.hierarchy as sch
from sklearn import datasets
from sklearn.neighbors import kneighbors_graph
import matplotlib.gridspec as gridspec
import itertools
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import VotingClassifier
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("../datasets/adult.csv")

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
features = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11]].values
goal = df.iloc[:, -1].values
encoder = LabelEncoder()
features[:, 1] = encoder.fit_transform(features[:, 1])
features[:, 2] = encoder.fit_transform(features[:, 2])
features[:, 4] = encoder.fit_transform(features[:, 4])
features[:, 5] = encoder.fit_transform(features[:, 5])
features[:, 6] = encoder.fit_transform(features[:, 6])
features[:, 7] = encoder.fit_transform(features[:, 7])
features[:, 8] = encoder.fit_transform(features[:, 8])
goal = encoder.fit_transform(goal)
x_train, x_test, y_train, y_test = train_test_split(features, goal, test_size=0.2, random_state=0)

## Votting

In [7]:
logistic = LogisticRegression(random_state=0)
svm = LinearSVC(random_state=0)
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
DecisionTree = DecisionTreeClassifier(random_state=0)
rbf = SVC(kernel='rbf', probability=True)
poly = SVC(kernel='poly', probability=True)
vote = VotingClassifier(estimators=[('lr', logistic), ('svc', svm), ('gnb', gnb), ('knn', knn), ('DecisionTree', DecisionTree)], voting='hard')
for clf, lab in zip([logistic, svm, gnb, knn, DecisionTree, vote], ['Logistic Regression', 'SVC', 'naive Bayes','knn', 'DecisionTree', 'Ensemble']):
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    print(pd.DataFrame(cm))
    print("model {} with acu {:.2f}".format(lab, clf.score(x_test, y_test)))

      0    1
0  4661  257
1   911  684
model Logistic Regression with acu 0.82
      0    1
0  4817  101
1  1210  385
model SVC with acu 0.80
      0    1
0  4689  229
1  1092  503
model naive Bayes with acu 0.80
      0     1
0  4466   452
1   566  1029
model knn with acu 0.84
      0    1
0  4370  548
1   650  945
model DecisionTree with acu 0.82
      0    1
0  4776  142
1   933  662
model Ensemble with acu 0.83


## Bagging

In [8]:
bagging = BaggingClassifier(LogisticRegression(random_state=0))
knn = LogisticRegression(random_state=0)
for clf, lab in zip([knn, bagging], ['KNN', 'Bagging']):
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    print(pd.DataFrame(cm))
    print("model {} with acu {:.2f}".format(lab, clf.score(x_test, y_test)))

      0    1
0  4661  257
1   911  684
model KNN with acu 0.82
      0    1
0  4664  254
1   923  672
model Bagging with acu 0.82


## Random Forest

In [11]:
forest = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
tree = DecisionTreeClassifier(random_state=0)
for clf, lab in zip([tree, forest], ['tree', 'Forest']):
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    print(pd.DataFrame(cm))
    print("model {} with acu {:.2f}".format(lab, clf.score(x_test, y_test)))

      0    1
0  4370  548
1   650  945
model tree with acu 0.82
      0    1
0  4476  442
1   617  978
model Forest with acu 0.84


## Boosting

In [15]:
clf1 = LogisticRegression(random_state=0)
clf2 = LinearSVC(random_state=0)
clf3 = GaussianNB()
eclf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

for clf, lab in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'SVC', 'naive Bayes', 'Gboost']):
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    print(pd.DataFrame(cm))
    print("model {} with acu {:.2f}".format(lab, clf.score(x_test, y_test)))

      0    1
0  4661  257
1   911  684
model Logistic Regression with acu 0.82
      0    1
0  4817  101
1  1210  385
model SVC with acu 0.80
      0    1
0  4689  229
1  1092  503
model naive Bayes with acu 0.80
      0     1
0  4592   326
1   590  1005
model Gboost with acu 0.86


In [20]:
xgb_clf = XGBClassifier()
xgb_clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
cm = confusion_matrix(y_test, y_predict)
print(pd.DataFrame(cm))
print("model {} with acu {:.2f}".format('xgb', clf.score(x_test, y_test)))

      0     1
0  4592   326
1   590  1005
model xgb with acu 0.86
