In [None]:
# Dependencies (uncomment and run once if they are missing from the kernel's environment).
# The `imblearn` module is distributed on PyPI as `imbalanced-learn`.
# %pip install scikit-learn
# %pip install imbalanced-learn

### Goal is to classify for income <=50k or >50K

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the comma-separated census file and preview the first rows.
df = pd.read_csv('../../Datasets/adultcensusincome.csv')
df.head()

In [None]:
df.info()

In [None]:
df['income'].value_counts()

In [None]:
df.isin(['?']).sum() 

In [None]:
df['workclass'].value_counts()

In [None]:
df['occupation'].value_counts()

In [None]:
df['native.country'].value_counts()

In [None]:
df = df.replace('?', np.nan)

In [None]:
# Count missing values per column now that the '?' placeholders have been replaced by NaN.
# DataFrame has no `isnan` method (AttributeError); `isna` is the pandas equivalent.
df.isna().sum()

In [None]:
df['income'].isin(['?']).sum()

Replace

In [None]:
# Number of rows per income class (a Series indexed by the class label).
income = df['income'].value_counts()

income

In [None]:
# Bar chart of the income class counts held in `income`.
sns.barplot(income)

plt.show()

The dataset contains far more people with an income of <=50K than with an income of >50K.

## Plot the histogram of the age data

In [None]:
# Histogram of age with a KDE overlay. `histplot` is used because seaborn has deprecated `distplot`.
sns.histplot(df['age'], bins=25, kde=True)

In [None]:
# Count the rows per education level and draw the counts as horizontal bars.
education = (
    df['education']
    .value_counts()
    .reset_index()
    .set_axis(['index', 'values'], axis=1)
)
sns.barplot(data=education, x='values', y='index', hue='index', palette='rainbow')

### Education Number

In [None]:
# Count the rows per `education.num` value and draw the counts as vertical bars.
education = (
    df['education.num']
    .value_counts()
    .reset_index()
    .set_axis(['index', 'values'], axis=1)
)
sns.barplot(data=education, x='index', y='values', hue='index', palette='rainbow')

## 3.5 Create Pie Chart for Marital Status

In [None]:
# Share of each marital status, drawn as a pie chart labelled by category.
marital_status = df['marital.status'].value_counts()

slice_sizes = marital_status.to_numpy()
slice_labels = marital_status.index
plt.pie(slice_sizes, labels=slice_labels)
plt.show()

### 3.6 Barplot for Sex

In [None]:
# Count the rows per sex and draw the counts as vertical bars.
sex = (
    df['sex']
    .value_counts()
    .reset_index()
    .set_axis(['index', 'values'], axis=1)
)
sns.barplot(data=sex, x='index', y='values', hue='index', palette='rainbow')

The data is skewed towards males. We need to take this into account when building and evaluating our model.

### 3.7 Barplot for Hours per week

In [None]:
df['hours.per.week'].value_counts()

In [None]:
sns.lineplot(df['hours.per.week'])

In [None]:
sns.histplot(df['hours.per.week'], bins=25)

### See the correlation of variables with Income

Convert income variable (categorical) into a numeric variable

In [None]:
# Encode the income labels as integers in a new column so that income can be
# included in the numeric correlation matrix computed in the following cells.
encoder = LabelEncoder()
df['income_numeric'] = encoder.fit_transform(df['income'])

In [None]:
# Pairwise Pearson correlation between all numeric columns of the frame.
numeric_columns = df.select_dtypes(include='number')
corr_matrix = numeric_columns.corr(method='pearson')

In [None]:
sns.heatmap(corr_matrix, annot=True)

# Perform bivariate analysis

## 4.1 Create countplots of income across the columns age, education, marital status, race, sex

In [None]:
# Row counts per age, split by income class.
plt.figure(figsize=(20, 8))
sns.countplot(data=df, x='age', hue='income')

In [None]:
# Row counts per education level, split by income class.
plt.figure(figsize=(20, 8))
sns.countplot(data=df, x='education', hue='income')

In [None]:
# Row counts per marital status, split by income class.
plt.figure(figsize=(20, 8))
sns.countplot(data=df, x='marital.status', hue='income')

In [None]:
# Row counts per race, split by income class.
plt.figure(figsize=(20, 8))
sns.countplot(data=df, x='race', hue='income')

The data is heavily skewed towards White respondents.

In [None]:
# Row counts per sex, split by income class.
plt.figure(figsize=(20, 8))
sns.countplot(data=df, x='sex', hue='income')

# 5 Prepare data for modeling

## 5.1 Label encode all the categorical columns

Fix missing values

In [None]:
df.isna().sum()

In [None]:
# Impute the missing categorical values with each column's most frequent value.
# The result is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), which triggers a FutureWarning in pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is enabled.
for col in ['workclass', 'occupation', 'native.country']:
    most_frequent = df[col].mode().iloc[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
df.isna().sum()

## 5.1 Label encode all the categorical columns to convert all into numerical variables

In [None]:
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns

In [None]:
df.columns

In [None]:
# Remove the helper column that was added earlier for the correlation analysis,
# so the target is not duplicated among the features.
# `errors='ignore'` keeps the cell re-runnable: `del df[...]` raises KeyError
# the second time it is executed.
df = df.drop(columns=['income_numeric'], errors='ignore')

In [None]:
df.columns

In [None]:
# Replace every object-dtype column with integer codes so the whole frame is numeric.
# A single encoder instance is refit for each column in turn.
encoder = LabelEncoder()

object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in object_columns:
    df[name] = encoder.fit_transform(df[name])

In [None]:
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns

## 5.2 Prepare independent and dependent variables

In [None]:
# Features are every column except `income`; the target is the `income` column.
y = df['income']
X = df.drop(columns='income')

## 5.3 Perform feature scaling using StandardScaler and fix the imbalance in the dataset using SMOTE or RandomOverSampler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every feature column independently, fitting a fresh scaler per column.
# NOTE(review): the scalers are fit on the full dataset, i.e. before the
# train/test split performed later in the notebook.
for feature in X.columns:
    scaler = StandardScaler()
    column_2d = X[feature].to_numpy().reshape(-1, 1)
    X[feature] = scaler.fit_transform(column_2d)

In [None]:
X.head()

In [None]:
y.value_counts(normalize=True)

The data is very unbalanced: the majority class is <=50K, and the >50K class is the minority.

Let's fix that

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Random oversampler with a fixed seed so the resampling is reproducible.
ros = RandomOverSampler(random_state=42)
# NOTE(review): this separate fit looks redundant — the resampling cell calls
# `fit_resample`, which fits the sampler again.
ros.fit(X,y)

Do the actual resampling...

In [None]:
X_resampled, y_resampled = ros.fit_resample(X,y)

In [None]:
y_resampled.value_counts(normalize=True)

In [None]:
y_resampled.value_counts()

## 5.4 Perform a train-test split in the ratio 80:20 with random state 42

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the ORIGINAL (not oversampled) data 80:20, stratified on the target, so
# the test set holds only real, unseen rows in the true class proportions.
# Splitting after oversampling puts copies of the same minority rows in both
# train and test, which leaks information and inflates accuracy/F1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training portion only; the test set stays untouched.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

In [None]:
X_train.shape

In [None]:
y_test.shape

# 6. Perform Classification Modeling

## 6.1 Train logistic regression model, KNN classifier model, SVM classifier, naive bayes classifier, decision tree classifier and random forest classifier

Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic_reg  = LogisticRegression(random_state=42)
logistic_reg.fit(X_train, y_train)

In [None]:
preds_logistic_reg = logistic_reg.predict(X_test)

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
preds_knn = knn.predict(X_test)

SVM

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
preds_svc = svc.predict(X_test)

Naive Bayes

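Multinomial Naive Bayes might seem a better fit than Gaussian for the encoded categorical features, but it requires non-negative inputs and therefore cannot be applied to the standardized features used here, so Gaussian Naive Bayes is used.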

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
preds_nb = nb.predict(X_test)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [None]:
preds_tree = tree.predict(X_test)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
preds_rf = rf.predict(X_test)

## 6.2 Perform model evaluation on Accuracy and F1

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
def print_metrics(title, model_preds):
    """Print accuracy and F1 for one model's test-set predictions.

    Args:
        title: Label printed above the scores.
        model_preds: Predicted labels, compared against the notebook-level `y_test`.
    """
    print(title)
    accuracy = accuracy_score(y_test, model_preds)
    print('Accuracy: ', accuracy)
    f1 = f1_score(y_test, model_preds)
    print('F1 score: ', f1)

In [None]:
# Report accuracy and F1 for every trained model, in training order.
model_predictions = [
    ('Log Reg', preds_logistic_reg),
    ('KNN', preds_knn),
    ('SVM Classifier', preds_svc),
    ('Gaussian NB', preds_nb),
    ('Decision Tree', preds_tree),
    ('Random Forest', preds_rf),
]
for model_title, predictions in model_predictions:
    print_metrics(model_title, predictions)