## Import Libraries

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, OrdinalEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import KBinsDiscretizer

## Load Data

In [4]:
# Keep the file path and the loaded frame under separate names so that `data`
# always refers to a DataFrame and this cell can be re-run safely.
DATA_PATH = r'census_income_fix.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,MatrialStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,Hoursperweek,Country,Target,Outlier,ExtremeValue
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,no,no
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,no,no
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,no,no
3,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,no,no
4,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,no,no


## Cek Missing Value

In [5]:
# Count missing entries per column (isnull is the pandas alias of isna).
data.isnull().sum()

Age              0
Workclass        0
Fnlwgt           0
Education        0
Education-Num    0
MatrialStatus    0
Occupation       0
Relationship     0
Race             0
Sex              0
CapitalGain      0
CapitalLoss      0
Hoursperweek     0
Country          0
Target           0
Outlier          0
ExtremeValue     0
dtype: int64

## Cek Duplicate

In [6]:
# Rows that repeat an earlier row exactly; the first occurrence is not flagged.
repeated_mask = data.duplicated(keep='first')
duplicate = data[repeated_mask]
duplicate.shape

(20, 17)

In [7]:
# Keep only the first occurrence of every repeated row ('first' is the default).
data = data.drop_duplicates()
data.shape

(24281, 17)

## Analisis Data

In [10]:
# Summary statistics for the target column
target_column = data['Target']
target_column.describe()

count     24281
unique        2
top       <=50K
freq      19338
Name: Target, dtype: object

In [3]:
# Convert the values of the target column to integer codes
def string_int(x):
    """Map an income label to its integer code.

    Returns 1 for '<=50K' and 2 for '>50K'. Any other value (including a label
    with surrounding whitespace) matches neither and yields None.
    """
    for label, code in (('<=50K', 1), ('>50K', 2)):
        if x == label:
            return code
    return None
# Apply string_int to each label and store the resulting codes in a new column.
target_codes = data['Target'].apply(string_int)
data['Trans_Target'] = target_codes


## Pemilihan Variabel (kolom)

In [4]:
# Drop the two flag columns that are not used as features.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns=['Outlier', 'ExtremeValue'], errors='ignore')

## Label Encoder

In [5]:
# Encode each nominal column as integer codes in a new '<column>_trans' column.
# A fresh LabelEncoder is fitted per column and stored in `label_encoders`, so
# every fitted mapping stays available (e.g. for inverse_transform) instead of
# being overwritten by the next fit.
categorical_columns = [
    'Workclass',
    'Education',
    'MatrialStatus',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Country',
]
label_encoders = {}
for column in categorical_columns:
    # `labelencoder` ends up bound to the encoder of the last column ('Country'),
    # which is the state the previous one-encoder version left behind.
    labelencoder = LabelEncoder()
    data[column + '_trans'] = labelencoder.fit_transform(data[column])
    label_encoders[column] = labelencoder
data.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,MatrialStatus,Occupation,Relationship,Race,Sex,...,Target,Trans_Target,Workclass_trans,Education_trans,MatrialStatus_trans,Occupation_trans,Relationship_trans,Race_trans,Sex_trans,Country_trans
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,<=50K,1,3,11,0,5,1,4,1,37
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,<=50K,1,3,1,2,5,0,2,1,37
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,<=50K,1,3,9,2,9,5,2,0,4
3,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,<=50K,1,3,12,2,3,5,4,0,37
4,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,>50K,2,5,11,2,3,0,4,1,37


## Discretization

In [6]:
# Discretize Age into 5 equal-width bins, stored as ordinal bin indices.
enc = KBinsDiscretizer(
    strategy='uniform',
    encode='ordinal',
    n_bins=5,
)
age_frame = data[['Age']]
X_binned = enc.fit(age_frame).transform(age_frame)
data['trans_age'] = X_binned
data.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,MatrialStatus,Occupation,Relationship,Race,Sex,...,Trans_Target,Workclass_trans,Education_trans,MatrialStatus_trans,Occupation_trans,Relationship_trans,Race_trans,Sex_trans,Country_trans,trans_age
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,1,3,11,0,5,1,4,1,37,1.0
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,1,3,1,2,5,0,2,1,37,2.0
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,1,3,9,2,9,5,2,0,4,0.0
3,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,1,3,12,2,3,5,4,0,37,1.0
4,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,2,5,11,2,3,0,4,1,37,2.0


In [7]:
# Discretize Fnlwgt into 10 equal-width bins, stored as ordinal bin indices.
enc = KBinsDiscretizer(
    strategy='uniform',
    encode='ordinal',
    n_bins=10,
)
fnlwgt_frame = data[['Fnlwgt']]
X_binned = enc.fit(fnlwgt_frame).transform(fnlwgt_frame)
data['trans_Fnlwgt'] = X_binned
data.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,MatrialStatus,Occupation,Relationship,Race,Sex,...,Workclass_trans,Education_trans,MatrialStatus_trans,Occupation_trans,Relationship_trans,Race_trans,Sex_trans,Country_trans,trans_age,trans_Fnlwgt
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,3,11,0,5,1,4,1,37,1.0,3.0
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,3,1,2,5,0,2,1,37,2.0,3.0
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,3,9,2,9,5,2,0,4,0.0,5.0
3,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,3,12,2,3,5,4,0,37,1.0,4.0
4,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,5,11,2,3,0,4,1,37,2.0,3.0


In [8]:
# Discretize Hoursperweek into 10 equal-width bins, stored as ordinal bin indices.
enc = KBinsDiscretizer(
    strategy='uniform',
    encode='ordinal',
    n_bins=10,
)
hours_frame = data[['Hoursperweek']]
X_binned = enc.fit(hours_frame).transform(hours_frame)
data['trans_Hoursperweek'] = X_binned
data.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,MatrialStatus,Occupation,Relationship,Race,Sex,...,Education_trans,MatrialStatus_trans,Occupation_trans,Relationship_trans,Race_trans,Sex_trans,Country_trans,trans_age,trans_Fnlwgt,trans_Hoursperweek
0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,11,0,5,1,4,1,37,1.0,3.0,4.0
1,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,1,2,5,0,2,1,37,2.0,3.0,4.0
2,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,9,2,9,5,2,0,4,0.0,5.0,4.0
3,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,12,2,3,5,4,0,37,1.0,4.0,4.0
4,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,11,2,3,0,4,1,37,2.0,3.0,5.0


## Pembagian Data

In [9]:
# Encoded / discretized columns used as model inputs.
feature_columns = [
    'trans_age',
    'Workclass_trans',
    'trans_Fnlwgt',
    'Education_trans',
    'Education-Num',
    'MatrialStatus_trans',
    'Occupation_trans',
    'Relationship_trans',
    'Race_trans',
    'Sex_trans',
    'trans_Hoursperweek',
    'Country_trans',
]
X = data[feature_columns]
# The target is kept as a one-column DataFrame holding the original string labels.
y = data[['Target']]


In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = splits
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(19440, 12)
(19440, 1)
(4861, 12)
(4861, 1)


## Klasifikasi

In [11]:
# Flatten the one-column target frame into the 1-D label array that fit() expects.
train_labels = np.ravel(y_train, order='c')
# NOTE(review): MultinomialNB is documented for count-like features, while the
# inputs here are ordinal category codes -- consider whether CategoricalNB is a
# better fit; TODO confirm.
NaiveBayes = MultinomialNB()
NaiveBayes.fit(X_train, train_labels)
print(NaiveBayes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [23]:
# Predict labels for the held-out split and print them.
prediction = NaiveBayes.predict(X_test)
print(prediction)

# Per-class precision / recall / F1 against the true test labels.
# classification_report comes from the import cell at the top of the notebook,
# so the redundant mid-notebook import has been removed.
print(classification_report(y_test, prediction))


['<=50K' '<=50K' '<=50K' ... '>50K' '<=50K' '<=50K']
              precision    recall  f1-score   support

       <=50K       0.87      0.90      0.88     15449
        >50K       0.54      0.48      0.51      3991

    accuracy                           0.81     19440
   macro avg       0.71      0.69      0.70     19440
weighted avg       0.80      0.81      0.81     19440

