In [23]:
import numpy as np
import pandas as pd

In [24]:
# Read the training data from train.csv and preview its first rows
incomes_df = pd.read_csv("train.csv")
incomes_df.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,2,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,3,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,4,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,5,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [25]:
# Let's check out the summary statistics of our numeric columns
# (ID, Age and Hours.Per.Week; note that ID is only a row identifier)
incomes_df.describe()

Unnamed: 0,ID,Age,Hours.Per.Week
count,32561.0,32561.0,32561.0
mean,16281.0,38.581647,40.437456
std,9399.695394,13.640433,12.347429
min,1.0,17.0,1.0
25%,8141.0,28.0,40.0
50%,16281.0,37.0,40.0
75%,24421.0,48.0,45.0
max,32561.0,90.0,99.0


In [26]:
# Let's check for missing values: a column whose non-null count is below
# the total number of entries contains missing data
incomes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
ID                32561 non-null int64
Age               32561 non-null int64
Workclass         30725 non-null object
Education         32561 non-null object
Marital.Status    32561 non-null object
Occupation        30718 non-null object
Relationship      32561 non-null object
Race              32561 non-null object
Sex               32561 non-null object
Hours.Per.Week    32561 non-null int64
Native.Country    31978 non-null object
Income.Group      32561 non-null object
dtypes: int64(3), object(9)
memory usage: 3.0+ MB


Here we have missing values in 3 variables:

1. Workclass (categorical)
2. Occupation (categorical)
3. Native.Country (categorical)

Since all three are categorical, we can simply impute each of them with its mode, i.e. the most frequent value in that column.

In [27]:
from scipy.stats import mode

In [28]:
# Fill the gaps in each categorical column with that column's most frequent value.
# The result is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on an
# intermediate object, which pandas deprecates and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
cols_to_impute = ["Workclass", "Occupation", "Native.Country"]
for col in cols_to_impute:
    most_frequent = incomes_df[col].mode()[0]
    incomes_df[col] = incomes_df[col].fillna(most_frequent)

In [29]:
# Confirm that no column has missing values after the imputation
incomes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
ID                32561 non-null int64
Age               32561 non-null int64
Workclass         32561 non-null object
Education         32561 non-null object
Marital.Status    32561 non-null object
Occupation        32561 non-null object
Relationship      32561 non-null object
Race              32561 non-null object
Sex               32561 non-null object
Hours.Per.Week    32561 non-null int64
Native.Country    32561 non-null object
Income.Group      32561 non-null object
dtypes: int64(3), object(9)
memory usage: 3.0+ MB


In [30]:
# Share of all rows that falls into each Workclass category
incomes_df["Workclass"].value_counts() / len(incomes_df)

Private             0.753417
Self-emp-not-inc    0.078038
Local-gov           0.064279
State-gov           0.039864
Self-emp-inc        0.034274
Federal-gov         0.029483
Without-pay         0.000430
Never-worked        0.000215
Name: Workclass, dtype: float64

We can combine the Workclass categories that each account for less than 5% of the rows into a single category, "Others". The same is done for every categorical column.

In [31]:
# Names of the columns stored as text (object dtype)
column_dtypes = incomes_df.dtypes
categorical_variables = column_dtypes[column_dtypes == "object"].index
categorical_variables

Index(['Workclass', 'Education', 'Marital.Status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'Native.Country', 'Income.Group'],
      dtype='object')

In [32]:
# Run a loop over these values and combine
for column in categorical_variables:
    # Determine the categories to combine
    frq = incomes_df[column].value_counts()/incomes_df.shape[0]
    categories_to_combine = frq.loc[frq.values < 0.05].index
    
    # Loop over all the categories and combine them as others
    for cat in categories_to_combine:
        incomes_df[column].replace({cat: 'Others'}, inplace=True)
        incomes_df[column].replace({cat: 'Others'}, inplace=True)

In [33]:
# Workclass shares again, now that the rare categories have been merged
incomes_df["Workclass"].value_counts() / len(incomes_df)

Private             0.753417
Others              0.104266
Self-emp-not-inc    0.078038
Local-gov           0.064279
Name: Workclass, dtype: float64

scikit-learn estimators accept only numeric data, so we have to convert the text categories to numbers. We can use scikit-learn's `LabelEncoder` to do this.

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
# Encode every categorical column as integer codes. A separate LabelEncoder is
# fitted and stored per column so that each text-to-number mapping stays
# available afterwards, e.g. label_encoders["Income.Group"].inverse_transform(...)
# to decode predictions, or .transform(...) to encode new data the same way.
label_encoders = {}
for var in categorical_variables:
    le = LabelEncoder()
    incomes_df[var] = le.fit_transform(incomes_df[var])
    label_encoders[var] = le
# After the loop, `le` is the encoder fitted on the last column in
# categorical_variables.

In [36]:
# Verify that every column is numeric after the encoding
incomes_df.dtypes

ID                int64
Age               int64
Workclass         int64
Education         int64
Marital.Status    int64
Occupation        int64
Relationship      int64
Race              int64
Sex               int64
Hours.Per.Week    int64
Native.Country    int64
Income.Group      int64
dtype: object

In [37]:
# Predictor columns: every column except the row identifier (ID) and the target
features = [
    "Age",
    "Workclass",
    "Education",
    "Marital.Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Hours.Per.Week",
    "Native.Country",
]
# Column the models are trained to predict
target = "Income.Group"

In [38]:
# Separate the frame into the feature matrix (X) and the target vector (Y)
X, Y = incomes_df[features], incomes_df[target]

In [39]:
# Let's split our dataframe into train and test sets.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, so it is
# used only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# To test our models' accuracy score
from sklearn.metrics import accuracy_score

In [40]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [41]:
# Let's use our RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: a random forest draws bootstrap samples and random feature
# subsets, so without random_state the accuracy changes from run to run.
# The value matches the seed used for the train/test split.
RFmodel = RandomForestClassifier(random_state=23)
RFmodel.fit(X_train, Y_train)
RFprediction = RFmodel.predict(X_test)
# Fraction of held-out rows that were classified correctly
RF_acc = accuracy_score(Y_test, RFprediction)
RF_acc

0.79441117764471059

In [42]:
# Let's use our DecisionTree classifier
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: scikit-learn's tree builder is randomized (see the
# random_state parameter in its documentation), so without a seed the fitted
# tree and its accuracy can differ between runs.
# The value matches the seed used for the train/test split.
DTmodel = DecisionTreeClassifier(random_state=23)
DTmodel.fit(X_train, Y_train)
DTprediction = DTmodel.predict(X_test)
# Fraction of held-out rows that were classified correctly
DT_acc = accuracy_score(Y_test, DTprediction)
DT_acc

0.76877015200368493