In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
df = pd.read_csv("../DSBDAL_Exam_DataSets/Adult/adult.csv")
df

Unnamed: 0,Age,Workclass,FnlWgt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours Per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   FnlWgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education Num   32561 non-null  int64 
 5   Marital Status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital Gain    32561 non-null  int64 
 11  Capital Loss    32561 non-null  int64 
 12  Hours Per Week  32561 non-null  int64 
 13  Native Country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.shape

(32561, 15)

# Data Cleaning

In [5]:
df.isna().sum()

Age               0
Workclass         0
FnlWgt            0
Education         0
Education Num     0
Marital Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital Gain      0
Capital Loss      0
Hours Per Week    0
Native Country    0
Income            0
dtype: int64

In [6]:
df[df == '?'].sum()

Age               0.0
Workclass           0
FnlWgt            0.0
Education           0
Education Num     0.0
Marital Status      0
Occupation          0
Relationship        0
Race                0
Sex                 0
Capital Gain      0.0
Capital Loss      0.0
Hours Per Week    0.0
Native Country      0
Income              0
dtype: object

In [7]:
df = df.replace('?', np.nan)
df.dropna(inplace=True)

# Outliers

In [8]:
df.dtypes

Age                int64
Workclass         object
FnlWgt             int64
Education         object
Education Num      int64
Marital Status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Capital Gain       int64
Capital Loss       int64
Hours Per Week     int64
Native Country    object
Income            object
dtype: object

In [9]:
column1 = ['Age', 'FnlWgt', 'Education Num', 'Capital Gain', 'Capital Loss', 'Hours Per Week']

for column_name in column1:
    df=df[df[column_name]>=0]
df

Unnamed: 0,Age,Workclass,FnlWgt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours Per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [10]:
for column_name in column1:
    out = abs((df[column_name] - df[column_name].mean()) / df[column_name].std())
    lier = out > 3
    df=df[~lier]

df

Unnamed: 0,Age,Workclass,FnlWgt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours Per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# Data Transformation

In [11]:
df.dtypes

Age                int64
Workclass         object
FnlWgt             int64
Education         object
Education Num      int64
Marital Status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Capital Gain       int64
Capital Loss       int64
Hours Per Week     int64
Native Country    object
Income            object
dtype: object

In [12]:
columns2 = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native Country', 'Income']

for column_name in columns2:
    lb = LabelEncoder()
    df[column_name] = lb.fit_transform(df[column_name])

df

Unnamed: 0,Age,Workclass,FnlWgt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours Per Week,Native Country,Income
0,39,7,77516,8,13,4,1,1,4,1,2174,0,40,38,0
1,50,6,83311,8,13,2,4,0,4,1,0,0,13,38,0
2,38,4,215646,10,9,0,6,1,4,1,0,0,40,38,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,38,0
4,28,4,338409,8,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,6,12,2,13,5,4,0,0,0,38,38,0
32557,40,4,154374,10,9,2,7,0,4,1,0,0,40,38,1
32558,58,4,151910,10,9,6,1,4,4,0,0,0,40,38,0
32559,22,4,201490,10,9,4,1,3,4,1,0,0,20,38,0


# Model

In [13]:
X = df[['Age', 'FnlWgt', 'Education Num', 'Capital Gain', 'Capital Loss', 'Hours Per Week']]
Y = df['Income']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Logistic Regression

In [14]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)

In [15]:
pred_lr = lr.predict(X_test)
pred_lr

array([0, 0, 0, ..., 0, 1, 0])

In [16]:
pred_lr_acc = accuracy_score(Y_test, pred_lr)
pred_lr_acc

0.8124371438149514

# Naive Bayes

In [17]:
m_NB = MultinomialNB()
m_NB.fit(X_train, Y_train)

In [18]:
g_NB = GaussianNB()
g_NB.fit(X_train, Y_train)

In [19]:
pred_mnb = m_NB.predict(X_test)
pred_mnb

array([0, 0, 0, ..., 0, 1, 0])

In [20]:
pred_mnb_acc = accuracy_score(Y_test, pred_mnb)
pred_mnb_acc

0.7955078779751927

In [21]:
pred_gnb = g_NB.predict(X_test)
pred_gnb

array([0, 0, 0, ..., 0, 1, 0])

In [22]:
pred_gnb_acc = accuracy_score(Y_test, pred_gnb)
pred_gnb_acc

0.7996982903117666