In [1]:
import pandas as pd
import numpy as np

In [2]:
with open("adult.names") as file:
    print(file.read())

| This data was extracted from the census bureau database found at
| http://www.census.gov/ftp/pub/DES/www/welcome.html
| Donor: Ronny Kohavi and Barry Becker,
|        Data Mining and Visualization
|        Silicon Graphics.
|        e-mail: ronnyk@sgi.com for questions.
| Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
| 48842 instances, mix of continuous and discrete    (train=32561, test=16281)
| 45222 if instances with unknown values are removed (train=30162, test=15060)
| Duplicate or conflicting instances : 6
| Class probabilities for adult.all file
| Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
| Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
|
| Extraction was done by Barry Becker from the 1994 Census database.  A set of
|   reasonably clean records was extracted using the following conditions:
|   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
|
| Prediction task is to determine whether a person makes over

In [3]:
column_names = ['age','workclass','fnlwgt','education','education_num','marital_status','occupation',
            'relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country',
            'target']

In [4]:
df = pd.read_csv(r'adult.data', sep = ',', names = column_names)

In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df_test = pd.read_csv(r'adult.test', sep = ',', names = column_names)

In [7]:
df_test.drop(0, axis = 0, inplace = True)
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [8]:
df['target'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [9]:
df_test['target'].unique()

array([' <=50K.', ' >50K.'], dtype=object)

Let's encode target variable as integer in train and test datasets

In [10]:
df['target'] = df.target.apply(lambda x: 0 if x == ' <=50K' else 1)
df_test['target'] = df_test.target.apply(lambda x: 0 if x == ' <=50K.' else 1)

In [11]:
df.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,32561,,,,38.5816,13.6404,17.0,28.0,37.0,48.0,90.0
workclass,32561,9.0,Private,22696.0,,,,,,,
fnlwgt,32561,,,,189778.0,105550.0,12285.0,117827.0,178356.0,237051.0,1484700.0
education,32561,16.0,HS-grad,10501.0,,,,,,,
education_num,32561,,,,10.0807,2.57272,1.0,9.0,10.0,12.0,16.0
marital_status,32561,7.0,Married-civ-spouse,14976.0,,,,,,,
occupation,32561,15.0,Prof-specialty,4140.0,,,,,,,
relationship,32561,6.0,Husband,13193.0,,,,,,,
race,32561,5.0,White,27816.0,,,,,,,
sex,32561,2.0,Male,21790.0,,,,,,,


In [12]:
df['target'].value_counts(normalize = True)

0    0.75919
1    0.24081
Name: target, dtype: float64

In [13]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
target             int64
dtype: object

In [14]:
df_test.dtypes

age                object
workclass          object
fnlwgt            float64
education          object
education_num     float64
marital_status     object
occupation         object
relationship       object
race               object
sex                object
capital_gain      float64
capital_loss      float64
hours_per_week    float64
native_country     object
target              int64
dtype: object

In [15]:
df_test['age'] = df_test['age'].astype(int)
df_test['fnlwgt'] = df_test['fnlwgt'].astype(int)
df_test['education_num'] = df_test['education_num'].astype(int)
df_test['capital_gain'] = df_test['capital_gain'].astype(int)
df_test['capital_loss'] = df_test['capital_loss'].astype(int)
df_test['hours_per_week'] = df_test['hours_per_week'].astype(int)

In [16]:
df_test['target'].value_counts(normalize = True)

0    0.763774
1    0.236226
Name: target, dtype: float64

Let's find na, null and irrelevant values that are to be replaced by the mode or median if the column is categorical or numerical respectively.

In [17]:
df_na = df.isna()
df_na.agg(['unique']).T

Unnamed: 0,unique
age,[False]
workclass,[False]
fnlwgt,[False]
education,[False]
education_num,[False]
marital_status,[False]
occupation,[False]
relationship,[False]
race,[False]
sex,[False]


In [18]:
df_null = df.isnull()
df_null.agg(['unique'])

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
unique,[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False]


In [19]:
df_test_na = df_test.isna()
df_test_na.agg(['unique'])

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
unique,[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False]


In [20]:
df_test_null = df_test.isnull()
df_test_null.agg(['unique'])

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
unique,[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False],[False]


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  target          32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16281 entries, 1 to 16281
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int32 
 1   workclass       16281 non-null  object
 2   fnlwgt          16281 non-null  int32 
 3   education       16281 non-null  object
 4   education_num   16281 non-null  int32 
 5   marital_status  16281 non-null  object
 6   occupation      16281 non-null  object
 7   relationship    16281 non-null  object
 8   race            16281 non-null  object
 9   sex             16281 non-null  object
 10  capital_gain    16281 non-null  int32 
 11  capital_loss    16281 non-null  int32 
 12  hours_per_week  16281 non-null  int32 
 13  native_country  16281 non-null  object
 14  target          16281 non-null  int64 
dtypes: int32(6), int64(1), object(8)
memory usage: 1.6+ MB


In [23]:
df['native_country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [24]:
df.replace(' ?', np.nan, inplace = True)
df_test.replace(' ?', np.nan, inplace = True)

In [25]:
categorical_columns = [cat for cat in df.columns if df[cat].dtype.name == 'object']
numerical_columns = [cat for cat in df.columns if df[cat].dtype.name != 'object']
print(categorical_columns, '\n', numerical_columns)

['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'] 
 ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'target']


In [26]:
for e in categorical_columns:
    df[e].fillna(df[e].mode()[0], inplace = True)
    df_test[e].fillna(df[e].mode()[0], inplace = True)
    
for e in numerical_columns:
    df[e].fillna(df[e].median(), inplace = True)
    df_test[e].fillna(df[e].median(), inplace = True)

In [27]:
set(df.columns) - set(df_test.columns)

set()

In [28]:
for e in categorical_columns:
    if len(set(df.loc[:, e]) - set(df_test.loc[:, e])) > 0:
        print(e, ":", set(df.loc[:, e]) - set(df_test.loc[:, e]), '\n')
    else: continue

native_country : {' Holand-Netherlands'} 



In [29]:
df = pd.concat([df[numerical_columns], pd.get_dummies(df[categorical_columns])], axis = 1)
df_test = pd.concat([df_test[numerical_columns], pd.get_dummies(df_test[categorical_columns])], axis = 1)
df_test['native_country_  Holand-Netherlands'] = 0

In [30]:
X_train = df.drop(['target'], axis = 1)
y_train = df['target']
X_test = df_test.drop(['target'], axis = 1)
y_test = df_test['target']

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [32]:
tree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, random_state = 42)

In [33]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [34]:
predictions = tree.predict(X_test)

In [35]:
accuracy_score(y_test, predictions)

0.8447884036607088

So the accuracy of our tree is 0.84. Let's try to use GridSearch to find out the best parameters for the model and whether we can improve the Decision tree

In [36]:
tree_depths = {'max_depth': range(1,20)}
best_tree = GridSearchCV(DecisionTreeClassifier(random_state = 42),
                                                tree_depths, cv = 5)
best_tree.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(1, 20)}, pre_dispatch='2*n_jobs',
  

In [37]:
best_tree.best_params_

{'max_depth': 9}

In [38]:
best_tree.best_score_

0.8566691696931217

In [39]:
tuned_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 9, random_state = 42)
tuned_tree.fit(X_train, y_train)

tuned_predictions = tuned_tree.predict(X_test)
accuracy_score(tuned_predictions, y_test)

0.8477366255144033