In [1]:
import pandas as pd
import numpy as np
import pickle 
import os

In [2]:
# Read the Adult dataset CSV from the project-relative data directory.
url = 'data/adult/adult.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Print the column labels as a quick check that the header row was parsed.
print(df.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')


In [3]:
col_names = df.columns

# Treat the '?' placeholder as a missing value in every column.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)


def _fill_with_most_frequent(column):
    """Return `column` with missing entries replaced by its most frequent value.

    A column that has no non-missing value at all is returned unchanged,
    because there is no most-frequent value to fill it with.
    """
    counts = column.value_counts()  # descending frequency, missing values excluded
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


# Impute each column independently with its own most frequent value.
df = df.apply(_fill_with_most_frequent)

# Collapse the seven raw marital-status labels into three coarser groups.
# The replacement is limited to the 'marital-status' column so that an equal
# string in any other column can never be rewritten by accident.
marital_status_groups = {
    'Divorced': 'divorced',
    'Married-AF-spouse': 'married',
    'Married-civ-spouse': 'married',
    'Married-spouse-absent': 'married',
    'Never-married': 'not married',
    'Separated': 'not married',
    'Widowed': 'not married',
}
df['marital-status'] = df['marital-status'].replace(marital_status_groups)

In [4]:
# Label encoding
from sklearn import preprocessing
category_col = ['workclass', 'race', 'education', 'marital-status', 'occupation',
                'relationship', 'gender', 'native-country', 'income']
labelEncoder = preprocessing.LabelEncoder()

# mapping_dict[column][original label] -> integer code written into df[column].
# NOTE: run this cell once on the string-valued columns; running it again on
# already-encoded columns rebuilds the mapping from the integer codes instead
# of the original labels.
mapping_dict = {}
for col in category_col:
    df[col] = labelEncoder.fit_transform(df[col])
    # The encoded value of a label is its position in `classes_`, so the
    # mapping can be read off directly without a second `transform` call.
    # Plain Python ints keep the printed mapping free of NumPy scalar
    # wrappers (e.g. `np.int64(0)` on NumPy >= 2) and make it serialisable.
    mapping_dict[col] = {
        label: int(code) for code, label in enumerate(labelEncoder.classes_)
    }
print(mapping_dict)

{'workclass': {'Federal-gov': 0, 'Local-gov': 1, 'Never-worked': 2, 'Private': 3, 'Self-emp-inc': 4, 'Self-emp-not-inc': 5, 'State-gov': 6, 'Without-pay': 7}, 'race': {'Amer-Indian-Eskimo': 0, 'Asian-Pac-Islander': 1, 'Black': 2, 'Other': 3, 'White': 4}, 'education': {'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}, 'marital-status': {'divorced': 0, 'married': 1, 'not married': 2}, 'occupation': {'Adm-clerical': 0, 'Armed-Forces': 1, 'Craft-repair': 2, 'Exec-managerial': 3, 'Farming-fishing': 4, 'Handlers-cleaners': 5, 'Machine-op-inspct': 6, 'Other-service': 7, 'Priv-house-serv': 8, 'Prof-specialty': 9, 'Protective-serv': 10, 'Sales': 11, 'Tech-support': 12, 'Transport-moving': 13}, 'relationship': {'Husband': 0, 'Not-in-family': 1, 'Other-relative': 2, 'Own-child': 3, 'Unmarried': 4, 'Wife': 5}, 'g

In [5]:
# Drop the two columns this notebook treats as redundant.
df = df.drop(columns=['fnlwgt', 'educational-num'])

In [6]:
# Preview only the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,3,1,2,6,3,2,1,0,0,40,38,0
1,38,3,11,1,4,0,4,1,0,0,50,38,0
2,28,1,7,1,10,0,4,1,0,0,40,38,1
3,44,3,15,1,6,0,2,1,7688,0,40,38,1
4,18,3,15,2,9,3,4,0,0,0,30,38,0
5,34,3,0,2,7,1,4,1,0,0,30,38,0
6,29,3,11,2,9,4,2,1,0,0,40,38,0
7,63,5,14,1,9,0,4,1,3103,0,32,38,1
8,24,3,15,2,7,4,4,0,0,0,40,38,0
9,55,3,5,1,2,0,4,1,0,0,10,38,0


In [7]:
# Select the target by column name rather than by position, so the label is
# always 'income' no matter how many feature columns the frame currently has.
target_col = 'income'
X = df.drop(columns=target_col).values  # every remaining column is a feature
Y = df[target_col].values

In [9]:
# Split the data, fit a decision tree, and report accuracy on the held-out part.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 70% of the rows train the model, 30% are held out; the fixed seed keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

# `fit` returns the estimator itself, so `cfl` is the fitted classifier.
cfl = DecisionTreeClassifier(random_state=100).fit(X_train, y_train)

# Score the predictions for the held-out rows; the value is the cell output.
y_pred = cfl.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8145089742714803

In [12]:
# save model checkpoint
model_path = "models/model.pkl"

# Create the output directory when it is missing so the dump does not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The context manager closes (and therefore flushes) the file even if
# pickling raises; the bare `open(...)` previously left the handle open.
# NOTE: only unpickle this file from a trusted location, since loading a
# pickle can execute arbitrary code.
with open(model_path, "wb") as model_file:
    pickle.dump(cfl, model_file)