In [2]:
import pandas 
import numpy 
from sklearn import preprocessing 
  
# Load adult.csv (path is relative to the working directory) and preview
# the first rows to check the columns that were read.
df = pandas.read_csv('adult.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,12359,49,Private,180532,Masters,14,Married-spouse-absent,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K
1,11680,35,Local-gov,308945,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
2,13682,28,Private,47907,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
3,8807,32,Private,198068,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Male,0,0,60,United-States,<=50K
4,16644,23,Private,192978,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K


In [3]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV —
# confirm against the data source). Reassigning instead of inplace=True, plus
# errors="ignore", keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Drop two columns that are not used as model features. errors="ignore" lets
# the cell be re-run without raising once the columns are already gone.
df = df.drop(columns=['fnlwgt', 'education.num'], errors='ignore')

col_names = df.columns  # retained: the original cell exposed this name

# "?" is the missing-value marker replaced here; convert it to NaN across the
# whole frame in one pass. numpy.nan is used because the numpy.NaN alias was
# removed in NumPy 2.0, and the former per-column loop only repeated this same
# whole-frame replace once per column.
df = df.replace("?", numpy.nan)

# Impute every column with its most frequent value. A column with no non-null
# values has no mode, so it is returned unchanged instead of raising IndexError.
df = df.apply(
    lambda x: x.fillna(x.value_counts().index[0]) if x.notna().any() else x
)

In [5]:
# Check which columns remain after the drops in the previous cells.
df.columns

Index(['age', 'workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
       'hours.per.week', 'native.country', 'income'],
      dtype='object')

In [6]:
# List the distinct labels present in the marital.status column.
df["marital.status"].unique()

array(['Married-spouse-absent', 'Married-civ-spouse', 'Never-married',
       'Divorced', 'Separated', 'Widowed', 'Married-AF-spouse'],
      dtype=object)

In [7]:
# Summarise per-column dtypes and non-null counts to verify the imputation
# left no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22792 entries, 0 to 22791
Data columns (total 13 columns):
age               22792 non-null int64
workclass         22792 non-null object
education         22792 non-null object
marital.status    22792 non-null object
occupation        22792 non-null object
relationship      22792 non-null object
race              22792 non-null object
sex               22792 non-null object
capital.gain      22792 non-null int64
capital.loss      22792 non-null int64
hours.per.week    22792 non-null int64
native.country    22792 non-null object
income            22792 non-null object
dtypes: int64(4), object(9)
memory usage: 2.3+ MB


In [8]:
# Display the race column as it stands before label encoding.
df["race"]

0        White
1        White
2        White
3        White
4        White
         ...  
22787    White
22788    Black
22789    White
22790    White
22791    White
Name: race, Length: 22792, dtype: object

In [9]:
# Columns holding string categories that must become integer codes.
category_col = [
    'workclass', 'race', 'education', 'marital.status', 'occupation',
    'relationship', 'sex', 'native.country', 'income',
]

# One encoder instance is re-fitted for each column in turn, so the
# label -> code table is captured right after each fit, before the next
# fit overwrites classes_.
labelEncoder = preprocessing.LabelEncoder()
mapping_dict = {}

for col in category_col:
    # Replace the string column with its integer codes.
    encoded_values = labelEncoder.fit_transform(df[col])
    df[col] = encoded_values

    # Record which original label corresponds to which code for this column.
    known_labels = labelEncoder.classes_
    label_codes = labelEncoder.transform(known_labels)
    le_name_mapping = dict(zip(known_labels, label_codes))
    mapping_dict[col] = le_name_mapping

print(mapping_dict)

{'workclass': {'Federal-gov': 0, 'Local-gov': 1, 'Never-worked': 2, 'Private': 3, 'Self-emp-inc': 4, 'Self-emp-not-inc': 5, 'State-gov': 6, 'Without-pay': 7}, 'race': {'Amer-Indian-Eskimo': 0, 'Asian-Pac-Islander': 1, 'Black': 2, 'Other': 3, 'White': 4}, 'education': {'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}, 'marital.status': {'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}, 'occupation': {'Adm-clerical': 0, 'Armed-Forces': 1, 'Craft-repair': 2, 'Exec-managerial': 3, 'Farming-fishing': 4, 'Handlers-cleaners': 5, 'Machine-op-inspct': 6, 'Other-service': 7, 'Priv-house-serv': 8, 'Prof-specialty': 9, 'Protective-serv': 10, 'Sales': 11, 'Tech-support': 12, 'Transport-moving': 13}, 'relationship': {'Husba

In [10]:
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
  
# Materialise the frame as one array, then slice it: the first 12 columns are
# the features and column 12 (the last one, `income`) is the target.
encoded_matrix = df.values
X = encoded_matrix[:, :12]
Y = encoded_matrix[:, 12]

In [13]:
# Display the frame after label encoding (the rendered output is truncated
# by pandas for long frames).
df

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,49,3,12,3,9,1,4,1,0,0,40,37,0
1,35,1,11,2,2,0,4,1,0,0,40,37,1
2,28,3,9,4,0,1,4,0,0,0,40,37,0
3,32,3,11,4,13,1,4,1,0,0,60,37,0
4,23,3,11,2,5,0,4,1,0,0,40,37,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22787,43,3,9,2,7,5,4,0,0,0,35,37,1
22788,34,3,11,2,0,0,2,1,0,0,40,37,0
22789,37,3,11,2,0,5,4,0,0,0,60,37,0
22790,50,1,9,2,9,0,4,1,0,0,45,37,1


In [35]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100
)

# Gini-criterion tree capped at depth 5 with at least 5 samples per leaf.
tree_settings = dict(
    criterion="gini",
    random_state=100,
    max_depth=5,
    min_samples_leaf=5,
)
dt_clf_gini = DecisionTreeClassifier(**tree_settings)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_gini = dt_clf_gini.fit(X_train, y_train).predict(X_test)

# Report hold-out accuracy as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred_gini) * 100
print("Desicion Tree using Gini Index\nAccuracy is ", accuracy_pct)

Desicion Tree using Gini Index
Accuracy is  83.16759286341036


In [36]:
# Serialize the decision tree fitted in the previous cell to model.pkl so it
# can be loaded later without retraining.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
import pickle

# The context manager closes the file even if dump() raises; previously the
# handle returned by open() was never closed.
with open("model.pkl", "wb") as model_file:
    pickle.dump(dt_clf_gini, model_file)