In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import ClusterCentroids
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

%matplotlib inline

In [None]:
# Load "test.csv" with explicit column names (no row is treated as a header),
# then skip its first row.
# NOTE(review): presumably that first row is a banner/header line rather than a record — confirm.
test = pd.read_csv(
    "test.csv",
    header=None,
    names=[
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ],
).iloc[1:]

In [None]:
# Load "adult.csv"; no row is treated as a header, so column names are supplied here.
train = pd.read_csv(
    "adult.csv",
    names=[
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ],
    header=None,
)

In [None]:
# Stack both files into one frame. ignore_index=True gives every row a unique
# label; without it the labels of the two frames overlap, and any later
# label-based operation (e.g. DataFrame.drop on an index) would hit rows from
# both files at once.
data = pd.concat([train, test], ignore_index=True)
# Coerce every 'age' value to an integer.
data['age'] = data['age'].astype(int)
# Trim surrounding whitespace from every object (text) column.
data = data.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
# Remove '.' characters from the income labels; later cells compare them
# against the exact strings '<=50K' and '>50K'.
data['income'] = data['income'].str.replace('.', '', regex=False)

In [None]:
# Preview the first three rows of the combined frame.
data.head(3)

In [None]:
# (rows, columns) of the combined frame.
data.shape

# Data Cleaning

In [None]:
# Check whether any cell of the frame is missing (NaN/None).
has_missing = data.isnull().values.any()
print(f"Presence of Nan values: {has_missing}")

In [None]:
# Report how many rows carry the '?' placeholder in each of three columns.
for label, column in (
    ("native country", "native-country"),
    ("workclass", "workclass"),
    ("occupation", "occupation"),
):
    n_unknown = (data[column] == '?').sum()
    print(f"number of samples with unknown {label}: {n_unknown}")

In [None]:
# Remove every row whose occupation, native-country or workclass is the '?' placeholder.
# A boolean mask is used instead of dropping by index label: DataFrame.drop removes
# *every* row carrying a given label, so it deletes unrelated rows whenever the
# index holds duplicates (e.g. after concatenating frames without resetting the index).
# Building a new frame also keeps the cell idempotent on re-run.
unknown = data[['occupation', 'native-country', 'workclass']].eq('?').any(axis=1)
data = data[~unknown].copy()

# Exploration

In [None]:
# Summary statistics of the cleaned frame.
data.describe()

In [None]:
# Number of records in each income class; saved as Class_count.png.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x="income", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Class', fontsize=12)

# Write each bar's count just above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

fig.savefig("Class_count.png")
plt.show()

In [None]:
# Distribution of age within each income class; saved as agebox.png.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="age", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

fig.savefig("agebox.png")
plt.show()

In [None]:
# Number of records per occupation, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="occupation", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Occupation', fontsize=12)

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

# Write each bar's count just above the bar. The '.0f' format accepts both
# integer and float heights; the 'd' format code raises ValueError for floats.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

In [None]:
# Number of records per marital status, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="marital-status", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Marital Status', fontsize=12)

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

# Write each bar's count just above the bar. The '.0f' format accepts both
# integer and float heights; the 'd' format code raises ValueError for floats.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

In [None]:
# Number of records per workclass, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="workclass", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Workclass', fontsize=12)

# Write each bar's count just above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

plt.show()

In [None]:
# Number of records per education level, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="education", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Education', fontsize=12)

# Write each bar's count just above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

plt.show()

In [None]:
# Number of records per relationship value, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="relationship", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Relationship', fontsize=12)

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

# Write each bar's count just above the bar. The '.0f' format accepts both
# integer and float heights; the 'd' format code raises ValueError for floats.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

In [None]:
# Number of records per race value, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="race", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Race', fontsize=12)

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

# Write each bar's count just above the bar. The '.0f' format accepts both
# integer and float heights; the 'd' format code raises ValueError for floats.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

In [None]:
# Number of records per sex value, split by income class.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(22, 6))
sns.countplot(x="sex", hue="income", data=data, ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Sex', fontsize=12)

legend = ax.legend()
legend.set_title('Class')
plt.setp(legend.get_title(), fontsize='22')
plt.setp(legend.get_texts(), fontsize='18')

# Write each bar's count just above the bar. The '.0f' format accepts both
# integer and float heights; the 'd' format code raises ValueError for floats.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

In [None]:
# Distribution of weekly working hours within each income class; saved as hoursbox.png.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="hours-per-week", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Hours per Week', fontsize=12)
fig.savefig("hoursbox.png")
plt.show()

In [None]:
# Distribution of the fnlwgt column within each income class; saved as fnlwgtbox.png.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="fnlwgt", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('fnlwgt', fontsize=12)
fig.savefig("fnlwgtbox.png")
plt.show()

In [None]:
# Distribution of education_num within each income class (shown only, not saved).
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="education_num", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Education Num', fontsize=12)

plt.show()

In [None]:
# Distribution of capital-gain within each income class; saved as Gainbox.png.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="capital-gain", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Capital Gain', fontsize=12)
fig.savefig("Gainbox.png")
plt.show()

In [None]:
# Distribution of capital-loss within each income class; saved as Lossbox.png.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(x="income", y="capital-loss", data=data, ax=ax)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Capital Loss', fontsize=12)
fig.savefig("Lossbox.png")
plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Text columns are excluded explicitly: DataFrame.corr() raises on non-numeric
# data in recent pandas releases instead of silently skipping it.
df_corr = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df_corr, annot=True, cmap="Blues", ax=ax)
# Pin the y-limits to the full grid so the first and last rows are neither
# clipped nor padded with an empty half-row, whichever matplotlib is installed.
ax.set_ylim(len(df_corr), 0)

In [None]:
# Pairwise scatter/distribution plots of the columns seaborn can plot, coloured by income class.
sns.pairplot(data, hue='income')

# One Hot Encoding

In [None]:
# Build the binary target: 0 for '<=50K', 1 for '>50K'.
# Series.map returns a new Series, so nothing is written back through a column
# pulled out of `data` (the masked assignment form is chained assignment and
# triggers SettingWithCopyWarning). Any unexpected label becomes NaN, which
# makes the integer cast fail loudly instead of passing through silently.
y = data['income'].map({'<=50K': 0, '>50K': 1}).astype('int')
# From here on `data` holds the feature columns only.
data = data.drop('income', axis=1)

In [None]:
# One-hot encode the categorical columns and store every feature as a float.
# The builtin `float` replaces the `np.float` alias, which was removed from NumPy
# (accessing it raises AttributeError on NumPy >= 1.24).
X = pd.get_dummies(data).astype(float)

In [None]:
# (rows, columns) of the one-hot encoded feature matrix.
X.shape

# Outliers Removal

In [None]:
from scipy.stats import zscore

In [None]:
# Candidate outlier filter #1: z-scores.
# Take the absolute z-score of every column of X and keep only the rows in
# which all columns stay below 3; apply the same row selection to y.
# NOTE(review): X also contains the one-hot columns, so this presumably removes
# rows belonging to rare categories as well — confirm that is intended.
scores = np.abs(zscore(X))
filtered = (scores < 3).all(axis=1)
newX1 = X[filtered]
newy1 = y[filtered]

In [None]:
# Candidate outlier filter #2: IQR fences on the numeric columns of `data`.
# Only numeric columns are used: quantiles are undefined for text columns, and
# recent pandas releases raise on them instead of skipping them.
numeric = data.select_dtypes(include='number')
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
# A row is an outlier when any numeric value falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The mask is computed once and reused for the
# features and the target so both always select the same rows.
is_iqr_outlier = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
newX2 = X[~is_iqr_outlier]
newy2 = y[~is_iqr_outlier]

In [None]:
from sklearn.ensemble import IsolationForest

# Candidate outlier filter #3: Isolation Forest.
# random_state is fixed so the same rows are selected on every run.
clf = IsolationForest(random_state=42)
# fit_predict labels inliers as 1 and outliers as -1, so despite the variable
# name, `is_outliers == 1` selects the rows to KEEP.
is_outliers = clf.fit_predict(X)
newX3 = X.iloc[is_outliers == 1]
newy3 = y.iloc[is_outliers == 1]

In [None]:
# Continue with the IQR-filtered data (newX2 / newy2); X and y are rebound here,
# so later cells see only the filtered rows.
X,y = newX2, newy2

## Scaling

In [None]:
# Rescale every feature to the [0, 1] range.
# A new DataFrame is built instead of assigning through `X[:]`: X is a filtered
# selection of another frame, and writing into it in place triggers
# SettingWithCopyWarning. Column names and row labels are carried over unchanged.
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set statistics leak into preprocessing — consider fitting on the
# training rows only.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with all components and plot the explained-variance ratio, both
# per component and accumulated over components.
pca = PCA().fit(X)
variance_ratio = pca.explained_variance_ratio_
plt.plot(np.cumsum(variance_ratio), label='Cumulative Variance')
plt.plot(variance_ratio, label='Component Variance')
plt.xlabel('number of components')
plt.ylabel('proportion of variance explained')
plt.legend()

In [None]:
# Project X onto its first 40 principal components.
# NOTE(review): 40 is presumably read off the variance plot — confirm.
# NOTE(review): the PCA is fitted on all rows, before the train/test split — confirm this is acceptable.
pca = PCA(n_components = 40)
X_pca = pca.fit_transform(X)

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set, stratified on y, with a fixed seed.
split = train_test_split(
    X_pca,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)
X_train_val, X_test, y_train_val, y_test = split

# Models and Results

In [None]:
from sklearn.metrics import confusion_matrix
def confmat(y_test,pred):
    """Draw the confusion matrix of a binary income classifier as a heatmap.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        The heatmap is drawn on the current matplotlib axes.
    """
    cf_matrix = confusion_matrix(y_test, pred)
    ax = sns.heatmap(cf_matrix, annot=True, fmt='g', cmap='coolwarm',
                     xticklabels=["<=50K", ">50K"], yticklabels=["<=50K", ">50K"])
    # Pin the y-limits to the full grid so the first and last rows are neither
    # clipped nor padded with an empty half-row, whichever matplotlib is installed.
    ax.set_ylim(len(cf_matrix), 0)
    # confusion_matrix puts true classes on rows and predicted classes on
    # columns; the heatmap draws rows along y and columns along x.
    ax.set(xlabel='Predicted Label', ylabel='True Label')

# Decision Tree

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# 10-fold stratified cross-validation of a depth-10 decision tree.
# SMOTE is applied to the training part of each fold only; the validation part
# is scored as-is.
kf = StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # Oversample the training fold.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Fit on the resampled fold; random_state makes the tree reproducible.
    clf = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)
    clf.fit(X_res,y_res)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# results on the test set
# Results on the held-out test set: train on the SMOTE-resampled
# train/validation data, predict X_test, print accuracy, F1, precision, recall.
clf = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the decision tree's test-set predictions.
confmat(y_test,pred)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10-fold stratified cross-validation of the random forest.
# The hyperparameters match the forest evaluated on the test set, so the
# cross-validated score describes the same model; random_state makes it reproducible.
# SMOTE is applied to the training part of each fold only.
kf = StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # Oversample the training fold.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Fit on the resampled fold, predict the untouched validation fold.
    clf = RandomForestClassifier(n_estimators=300, max_depth=10, max_features='sqrt', random_state=42)
    clf.fit(X_res,y_res)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# results on the test set
# Results on the held-out test set: train on the SMOTE-resampled
# train/validation data, predict X_test, print accuracy, F1, precision, recall.
clf = RandomForestClassifier(n_estimators=300, max_depth=10, max_features='sqrt', random_state=42)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the random forest's test-set predictions.
confmat(y_test,pred)

# Logistic Regression

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# 10-fold stratified cross-validation of a default logistic regression.
# SMOTE is applied to the training part of each fold only.
kf = StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # Oversample the training fold.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Fit on the resampled fold, predict the untouched validation fold.
    clf = LogisticRegression()
    clf.fit(X_res,y_res)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the held-out test set: train on the SMOTE-resampled
# train/validation data, predict X_test, print accuracy, F1, precision, recall.
clf = LogisticRegression()
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the logistic regression's test-set predictions.
confmat(y_test,pred)

# Linear SVM (LinearSVC)

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# 10-fold stratified cross-validation of the linear SVM.
# The settings match the LinearSVC evaluated on the test set (C=1,
# max_iter=10000), so the cross-validated score describes the same model;
# random_state makes the solver reproducible.
# SMOTE is applied to the training part of each fold only.
kf = StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # Oversample the training fold.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Fit on the resampled fold, predict the untouched validation fold.
    clf = LinearSVC(C=1, max_iter=10000, random_state=42)
    clf.fit(X_res,y_res)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
from sklearn.svm import LinearSVC

# Results on the held-out test set: train on the SMOTE-resampled
# train/validation data, predict X_test, print accuracy, F1, precision, recall.
clf = LinearSVC(C=1, max_iter=10000, random_state=42)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the LinearSVC test-set predictions.
confmat(y_test,pred)

# Kernel SVM (SVC)

In [None]:
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Results on the held-out test set for SVC(gamma=0.1): train on the
# SMOTE-resampled train/validation data, predict X_test, print accuracy, F1,
# precision, recall. No cross-validation is run for this model.
clf = SVC(gamma=0.1)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the SVC test-set predictions.
confmat(y_test,pred)

# KNN

In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# 10-fold stratified cross-validation of a 10-nearest-neighbours classifier.
# SMOTE is applied to the training part of each fold only.
kf = StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # Oversample the training fold.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Fit on the resampled fold, predict the untouched validation fold.
    clf = KNeighborsClassifier(n_neighbors=10)
    clf.fit(X_res,y_res)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Import the estimator this cell actually uses (the cell previously imported
# LinearSVC, which it never references).
from sklearn.neighbors import KNeighborsClassifier

# Results on the held-out test set: train on the SMOTE-resampled
# train/validation data, predict X_test, print accuracy, F1, precision, recall.
clf = KNeighborsClassifier(n_neighbors=10)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)
pred = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision_score returns the recall and recall_score returns the precision.
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

In [None]:
# Confusion matrix of the k-NN test-set predictions.
confmat(y_test,pred)

In [None]:
# Per-model metrics, entered by hand; position i in each list belongs to the
# i-th model name in `x` below.
# NOTE(review): confirm these numbers were produced with scikit-learn's
# (y_true, y_pred) argument order — if they came from calls with the arguments
# reversed, the precision and recall lists are interchanged.
f1 = [0.6264282165921511,0.6549401197604791,0.6596923076923078,0.6555134937110757,0.6330916623511135]
accuracy = [0.7668036281882317,0.7855647724629816,0.7856422978525467,0.7752538956508257,0.780293045972556]
precision = [0.789358372456964,0.8215962441314554,0.838810641627543,0.8397496087636933,0.7652582159624414]
recall = [0.5192505661931234,0.5444928438083385,0.5436105476673428,0.5374449339207048,0.5398542724663281]
scores = pd.DataFrame({
    'f1': f1,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
})

# Bar chart of the F1 score per model, each bar labelled with its value.
x=["Decision Tree","Random Forest","Logistic Regression","SVM","K-nn"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=x, y=scores['f1'], palette='colorblind', ax=ax)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f"{height:.3f}", (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

In [None]:
# Bar chart of the accuracy per model, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=x, y=scores['accuracy'], palette='colorblind', ax=ax)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f"{height:.3f}", (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

In [None]:
# Bar chart of the precision per model, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=x, y=scores['precision'], palette='colorblind', ax=ax)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f"{height:.3f}", (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

In [None]:
# Bar chart of the recall per model, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=x, y=scores['recall'], palette='colorblind', ax=ax)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f"{height:.3f}", (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()