In [None]:
!pip install seaborn --upgrade

In [None]:
!pip install plotly

In [None]:
import numpy as np
import plotly.express as px
from copy import copy
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn import metrics


In [None]:
# Column labels applied to the CSV, in file order.
columns = [
    "Age",
    "Workclass",
    "fnlgwt",
    "Education",
    "Education num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours/week",
    "Native Country",
    "Income",
]
# Supplying `names` already makes pandas treat the file as header-less; header=None
# only spells that default out.
income = pd.read_csv("data.csv", header=None, names=columns)


In [None]:
# Preview the first five rows of the loaded frame.
income.head(n=5)

In [None]:
# Encode the target: 1 represents income > 50k, 0 represents income <= 50k.
# Anything that is not "<=50K" (after trimming whitespace) is labelled 1.
# The numeric-dtype guard keeps this cell idempotent: without it, a second run would
# compare the already-encoded integers against the text label and turn every row into 1.
# Trimming whitespace makes the comparison work whether or not the CSV values carry a
# leading space.
if not pd.api.types.is_numeric_dtype(income["Income"]):
    income["Income"] = (income["Income"].str.strip() != "<=50K").astype(int)

In [None]:
# Display the frame (pandas truncates long frames in the rendered output).
income

In [None]:
# data analysis: summary statistics of the frame, using describe()'s default settings
income.describe()

In [None]:
# Check whether the data set has null values: count the missing entries per column.
income.isna().sum()

In [None]:
# Count how many rows fall into each Workclass value.
income["Workclass"].value_counts()

In [None]:
# Replace the ' ?' placeholder with the most frequent Workclass value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: under pandas Copy-on-Write that
# chained in-place call only modifies a temporary object and leaves `income` unchanged.
income["Workclass"] = income["Workclass"].replace(' ?', income["Workclass"].mode()[0])

In [None]:
# Show the Workclass value counts again to inspect the column after the replacement.
income["Workclass"].value_counts()

In [None]:
# Count how many rows fall into each Native Country value.
income["Native Country"].value_counts()

In [None]:
# Replace the " ?" placeholder with the most frequent Native Country value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: under pandas Copy-on-Write that
# chained in-place call only modifies a temporary object and leaves `income` unchanged.
income["Native Country"] = income["Native Country"].replace(" ?", income["Native Country"].mode()[0])

In [None]:
# data visualization
# count plot for marital status
# The column is named through `x=`: recent seaborn releases accept only `data` as a
# positional argument, so a Series passed positionally is no longer used as the x variable.
plt.figure(figsize = (20, 10))
sns.countplot(data = income, x = "Marital Status")

In [None]:
# count plot for education
# The column is named through `x=`: recent seaborn releases accept only `data` as a
# positional argument, so a Series passed positionally is no longer used as the x variable.
plt.figure(figsize = (20, 10))
sns.countplot(data = income, x = "Education")

In [None]:
# distribution plot for age
# sns.distplot is deprecated; histplot with a KDE overlay and density normalisation is
# the replacement that draws the same kind of figure (histogram + density curve).
plt.figure(figsize = (20, 10))
sns.histplot(x = income["Age"], kde = True, stat = "density")

In [None]:
# pair plot
# pairplot builds its own figure (a grid of axes), so no plt.figure(...) call is made
# here; a preceding plt.figure(...) would only leave an extra empty figure in the output.
sns.pairplot(income)

In [None]:
# use seaborn to see the correlation between the features
# Only numeric columns are correlated. Selecting them explicitly is required on
# pandas >= 2.0, where DataFrame.corr() no longer drops text columns silently and
# raises instead; on older versions the result is unchanged.
cm = income.select_dtypes(include = "number").corr()
sns.heatmap(cm, annot = True)

In [None]:
# Separate the target column from the remaining columns.
y = income["Income"]
X = income.drop(columns = ["Income"])

In [None]:
# One-hot encode the listed text columns, replacing each with indicator columns.
features = [
    "Workclass",
    "Education",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Native Country",
]
X = pd.get_dummies(data = X, columns = features)
X.head(n = 5)

In [None]:
# do the data standalization
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
X = s.fit_transform(X)

In [None]:
# train, test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Fit a logistic-regression classifier (library defaults) on the training split.
from sklearn.linear_model import LogisticRegression
logModel = LogisticRegression()
logModel.fit(X = X_train, y = y_train)

In [None]:
# Predict class labels for the held-out test rows.
prediction = logModel.predict(X = X_test)

In [None]:
# Print the text report sklearn builds from the true and predicted test labels.
from sklearn.metrics import classification_report
print(classification_report(y_true = y_test, y_pred = prediction))

In [None]:
# Build the confusion matrix for the test predictions and draw it as an annotated heatmap.
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_true = y_test, y_pred = prediction)
sns.heatmap(data = confusionMatrix, annot = True, fmt = "0.5g")