In [73]:
%matplotlib inline
import numpy as np
import pandas as pd

Here we build a model that predicts the income level of Americans from a set of demographic attributes.

The data is available [here](https://archive.ics.uci.edu/ml/datasets/adult)

The data is labeled with binary values: zero for an income of at most 50k (`<=50K`), one otherwise.

# Data preparation

In [2]:
# Read the census income table and preview its first rows.
df = pd.read_csv('annualincome.csv')
df.head(5)

Unnamed: 0,RECORD_ID,AGE,WORKCLASS,FNLWGHT,EDUCATION,EDUCATION_NUM,MARITAL_STATUS,OCCUPATION,RELATIONSHIP,RACE,SEX,CAPITAL_GAIN,CAPITAL_LOSS,HOURS_PER_WEEK,COUNTRY,PROXY,TARGET
0,1,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,2,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,3,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,4,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,5,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


In [170]:
# Number of records in the data set.
df.shape[0]

32561

In [5]:
# Column names of the freshly loaded table.
df.columns

Index(['RECORD_ID', 'AGE', 'WORKCLASS', 'FNLWGHT', 'EDUCATION',
       'EDUCATION_NUM', 'MARITAL_STATUS', 'OCCUPATION', 'RELATIONSHIP', 'RACE',
       'SEX', 'CAPITAL_GAIN', 'CAPITAL_LOSS', 'HOURS_PER_WEEK', 'COUNTRY',
       'PROXY', 'TARGET'],
      dtype='object')

What are the unique labels of the `WORKCLASS` category?

In [6]:
# Distinct WORKCLASS labels, in order of first appearance.
pd.unique(df["WORKCLASS"])

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', 'Self-emp-inc', '?', 'Without-pay', 'Never-worked'], dtype=object)

What are the unique labels of the `MARITAL_STATUS` category?

In [7]:
# Distinct MARITAL_STATUS labels, in order of first appearance.
pd.unique(df["MARITAL_STATUS"])

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed'], dtype=object)

Replace `SEX` values with 0 and 1.

Remove `EDUCATION` (the numeric value is more convenient), `PROXY`, and `RECORD_ID`.

In [8]:
# Encode SEX as a binary indicator (Male -> 0, Female -> 1).
df["SEX"] = df["SEX"].replace({"Male":0, "Female":1})

# Remove the columns that are not used as features:
#   EDUCATION - the numeric EDUCATION_NUM is used instead,
#   PROXY     - appears to be the textual form of the label (e.g. "<=50K"),
#   RECORD_ID - row identifier.
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError the way `del df[...]` did.
df = df.drop(["EDUCATION", "PROXY", "RECORD_ID"], axis="columns", errors="ignore")

Decision trees in `sklearn` cannot work with categorical variables. Convert the table so that each value of a categorical variable becomes a binary feature.

In [9]:
# One-hot encode every categorical column; each (column, prefix) pair below
# produces indicator columns named "<prefix>_<category>".
categorical_prefixes = [
    ("WORKCLASS", "work"),
    ("MARITAL_STATUS", "marital"),
    ("OCCUPATION", "occ"),
    ("RELATIONSHIP", "rel"),
    ("RACE", "race"),
    ("COUNTRY", "ctry"),
]
df = pd.get_dummies(
    df,
    columns=[column for column, _ in categorical_prefixes],
    prefix=[prefix for _, prefix in categorical_prefixes],
)

As a result, we have a lot more columns.

In [10]:
# Column names after one-hot encoding.
df.columns

Index(['AGE', 'FNLWGHT', 'EDUCATION_NUM', 'SEX', 'CAPITAL_GAIN',
       'CAPITAL_LOSS', 'HOURS_PER_WEEK', 'TARGET', 'work_?',
       'work_Federal-gov', 'work_Local-gov', 'work_Never-worked',
       'work_Private', 'work_Self-emp-inc', 'work_Self-emp-not-inc',
       'work_State-gov', 'work_Without-pay', 'marital_Divorced',
       'marital_Married-AF-spouse', 'marital_Married-civ-spouse',
       'marital_Married-spouse-absent', 'marital_Never-married',
       'marital_Separated', 'marital_Widowed', 'occ_?', 'occ_Adm-clerical',
       'occ_Armed-Forces', 'occ_Craft-repair', 'occ_Exec-managerial',
       'occ_Farming-fishing', 'occ_Handlers-cleaners', 'occ_Machine-op-inspct',
       'occ_Other-service', 'occ_Priv-house-serv', 'occ_Prof-specialty',
       'occ_Protective-serv', 'occ_Sales', 'occ_Tech-support',
       'occ_Transport-moving', 'rel_Husband', 'rel_Not-in-family',
       'rel_Other-relative', 'rel_Own-child', 'rel_Unmarried', 'rel_Wife',
       'race_Amer-Indian-Eskimo', 'race

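Check whether we have any empty (NaN) values. Note that entries recorded as `?` are not detected by this check; they were encoded as categories of their own (e.g. `work_?`, `occ_?`).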

In [11]:
# True if at least one cell anywhere in the frame is null.
df.isnull().any().any()

False

In [12]:
# Preview the encoded table.
df.head(5)

Unnamed: 0,AGE,FNLWGHT,EDUCATION_NUM,SEX,CAPITAL_GAIN,CAPITAL_LOSS,HOURS_PER_WEEK,TARGET,work_?,work_Federal-gov,...,ctry_Portugal,ctry_Puerto-Rico,ctry_Scotland,ctry_South,ctry_Taiwan,ctry_Thailand,ctry_Trinadad&Tobago,ctry_United-States,ctry_Vietnam,ctry_Yugoslavia
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Generate numpy arrays to feed into scikit-learn.

In [13]:
# Split the frame into the feature matrix X and the label vector y.
features = df.drop("TARGET", axis="columns")
X = np.array(features)
y = np.array(df["TARGET"])

X.shape, y.shape

((32561, 91), (32561,))

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 25% of the rows for evaluation. The fixed seed makes the split, and
# every score computed from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Decision tree algorithm

In [18]:
from sklearn import tree

In [30]:
# Shallow decision tree (at most 5 levels). random_state is fixed so that
# repeated runs on the same training data build the same tree.
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
clf = clf.fit(X_train, y_train)

Evaluate the score (accuracy) on the test set.

In [31]:
# Accuracy of the fitted tree on the held-out rows.
clf.score(X_test, y_test)

0.85444048642672887

The following produces a graphical representation of the decision tree.

In [None]:
# Write the fitted tree to tree.dot in Graphviz DOT format.
# The result of export_graphviz is deliberately not assigned: the original
# rebound `f`, the open file handle, to that return value inside the block.
with open("tree.dot", 'w') as dot_file:
    tree.export_graphviz(clf,
                         feature_names=df.drop("TARGET",axis=1).columns,
                         out_file=dot_file)

List the features with their importances according to the decision tree.

In [22]:
# Pair every feature name with the importance assigned by the fitted model.
feature_names = df.drop("TARGET", axis=1).columns
for name, importance in zip(feature_names, clf.feature_importances_):
    print(name, importance)

AGE 0.00473830890157
FNLWGHT 0.0
EDUCATION_NUM 0.237445774022
SEX 0.0
CAPITAL_GAIN 0.212488124786
CAPITAL_LOSS 0.0529007579552
HOURS_PER_WEEK 0.0194206363234
work_? 0.00069455383336
work_Federal-gov 0.0
work_Local-gov 0.0
work_Never-worked 0.0
work_Private 0.00124736272941
work_Self-emp-inc 0.0
work_Self-emp-not-inc 0.000392688266667
work_State-gov 0.0
work_Without-pay 0.0
marital_Divorced 0.0
marital_Married-AF-spouse 0.0
marital_Married-civ-spouse 0.470543968886
marital_Married-spouse-absent 0.0
marital_Never-married 0.0
marital_Separated 0.0
marital_Widowed 0.0
occ_? 0.0
occ_Adm-clerical 0.0
occ_Armed-Forces 0.0
occ_Craft-repair 0.0
occ_Exec-managerial 0.0
occ_Farming-fishing 0.000127824296615
occ_Handlers-cleaners 0.0
occ_Machine-op-inspct 0.0
occ_Other-service 0.0
occ_Priv-house-serv 0.0
occ_Prof-specialty 0.0
occ_Protective-serv 0.0
occ_Sales 0.0
occ_Tech-support 0.0
occ_Transport-moving 0.0
rel_Husband 0.0
rel_Not-in-family 0.0
rel_Other-relative 0.0
rel_Own-child 0.0
rel_Unmarr

In [23]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [32]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[5925  309]
 [ 876 1031]]
Classification report
             precision    recall  f1-score   support

          0       0.87      0.95      0.91      6234
          1       0.77      0.54      0.64      1907

avg / total       0.85      0.85      0.84      8141



# Random Forest algorithm

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
# Forest of 100 trees. random_state is fixed so the bootstrap samples and
# feature sub-sampling, and hence the scores below, are reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf = clf.fit(X_train, y_train)

In [34]:
# Accuracy of the random forest on the held-out rows.
clf.score(X_test,y_test)

0.85849404250092132

In [35]:
# Show only the features that carry more than 1% of the total importance.
feature_names = df.drop("TARGET", axis=1).columns
for name, importance in zip(feature_names, clf.feature_importances_):
    if importance > 0.01:
        print(name, importance*100,"percent")

AGE 14.9282789562 percent
FNLWGHT 16.1516423846 percent
EDUCATION_NUM 10.8614732302 percent
SEX 1.26811061212 percent
CAPITAL_GAIN 9.94011811443 percent
CAPITAL_LOSS 3.36613534858 percent
HOURS_PER_WEEK 8.46363045597 percent
marital_Married-civ-spouse 5.37259885657 percent
marital_Never-married 2.86863145948 percent
occ_Exec-managerial 2.02255835413 percent
occ_Prof-specialty 1.74470148608 percent
rel_Husband 4.62714851852 percent
rel_Not-in-family 1.41192462036 percent


In [36]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[5780  454]
 [ 698 1209]]
Classification report
             precision    recall  f1-score   support

          0       0.89      0.93      0.91      6234
          1       0.73      0.63      0.68      1907

avg / total       0.85      0.86      0.86      8141



# AdaBoost algorithm

In [37]:
from sklearn.ensemble import AdaBoostClassifier

In [38]:
# AdaBoost ensemble with 100 estimators; random_state is fixed for reproducible results.
clf = AdaBoostClassifier(n_estimators=100, random_state=42)
clf = clf.fit(X_train, y_train)

In [39]:
# Accuracy of the AdaBoost model on the held-out rows.
clf.score(X_test,y_test)

0.86795234000737009

In [40]:
# Show only the features whose importance exceeds 0.01.
feature_names = df.drop("TARGET", axis=1).columns
for name, importance in zip(feature_names, clf.feature_importances_):
    if importance > 0.01:
        print(name, importance)

AGE 0.13
FNLWGHT 0.05
EDUCATION_NUM 0.09
SEX 0.03
CAPITAL_GAIN 0.22
CAPITAL_LOSS 0.14
HOURS_PER_WEEK 0.04
rel_Husband 0.02
rel_Wife 0.02


In [41]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[5856  378]
 [ 697 1210]]
Classification report
             precision    recall  f1-score   support

          0       0.89      0.94      0.92      6234
          1       0.76      0.63      0.69      1907

avg / total       0.86      0.87      0.86      8141



# Gradient Boosting algorithm

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

In [43]:
# Gradient boosting with 100 stages; random_state is fixed for reproducible results.
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf = clf.fit(X_train,y_train)

In [44]:
# Accuracy of the gradient boosting model on the held-out rows.
clf.score(X_test,y_test)

0.87040904065839575

In [45]:
# Show only the features whose importance exceeds 0.01.
feature_names = df.drop("TARGET", axis=1).columns
for name, importance in zip(feature_names, clf.feature_importances_):
    if importance > 0.01:
        print(name, importance)

AGE 0.134555457818
FNLWGHT 0.0337250059606
EDUCATION_NUM 0.10677104892
SEX 0.0189195617662
CAPITAL_GAIN 0.118222267744
CAPITAL_LOSS 0.129163799669
HOURS_PER_WEEK 0.0711553177061
work_Federal-gov 0.0150988984744
work_Self-emp-not-inc 0.0159094912641
marital_Married-civ-spouse 0.103676868866
occ_Exec-managerial 0.0294722163642
occ_Farming-fishing 0.02203913761
occ_Other-service 0.0188427695009
occ_Prof-specialty 0.0144819874234
occ_Protective-serv 0.0101553156761
occ_Tech-support 0.0142565044234
rel_Husband 0.0102364180271
rel_Wife 0.0283733509428
race_White 0.0103255208702


In [46]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[5897  337]
 [ 718 1189]]
Classification report
             precision    recall  f1-score   support

          0       0.89      0.95      0.92      6234
          1       0.78      0.62      0.69      1907

avg / total       0.87      0.87      0.87      8141



# Learning with balanced data

The precision and recall of all the algorithms are significantly lower for the >50k class. This may be because most of the records belong to the <=50k class. Let us select an equal number of records from each class and repeat the learning.

In [49]:
# Total number of rows, rows labelled 1 (>50k) and rows labelled 0.
is_rich = df["TARGET"] == 1
is_poor = df["TARGET"] == 0
len(df), int(is_rich.sum()), int(is_poor.sum())

(32561, 7841, 24720)

In [171]:
# Keep every row labelled 1 and draw an equally sized random subset of the rows
# labelled 0. random_state fixes the draw so the balanced set is reproducible.
rich = df[ df["TARGET"]==1 ]
poor = df[ df["TARGET"]==0 ].sample(len(rich), random_state=42)

In [172]:
# Stack the two equally sized groups into one balanced frame.
df_bal = pd.concat([rich, poor])

In [173]:
# Number of records in the balanced data set.
df_bal.shape[0]

15682

In [174]:
# Feature matrix and label vector of the balanced data set.
features = df_bal.drop("TARGET", axis="columns")
X = np.array(features)
y = np.array(df_bal["TARGET"])

X.shape, y.shape

((15682, 96), (15682,))

In [179]:
# Hold out 25% of the balanced rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [180]:
# Gradient boosting on the balanced training set; random_state is fixed for reproducible results.
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf = clf.fit(X_train,y_train)

In [181]:
# Accuracy on the held-out part of the balanced data set.
clf.score(X_test,y_test)

0.83295077786279015

In [182]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[1567  404]
 [ 251 1699]]
Classification report
             precision    recall  f1-score   support

          0       0.86      0.80      0.83      1971
          1       0.81      0.87      0.84      1950

avg / total       0.84      0.83      0.83      3921



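Even though the overall accuracy is down a little, we have increased both the precision and the recall of the >50k class. Note that the test set is now balanced as well, so these figures are not directly comparable with the ones obtained on the original class distribution.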

In [185]:
# Take the feature names from df_bal, the frame X was built from, so the names
# stay aligned with clf.feature_importances_ even when df itself gains columns.
for featname, featimp in zip(df_bal.drop("TARGET",axis=1).columns, clf.feature_importances_):
    if featimp > 0.02:
        print(featname, featimp)

AGE 0.120689554242
FNLWGHT 0.0371659753372
EDUCATION_NUM 0.0878571922715
CAPITAL_GAIN 0.114161868711
CAPITAL_LOSS 0.0980555319702
HOURS_PER_WEEK 0.085559878969
marital_Married-civ-spouse 0.101439488292
occ_Exec-managerial 0.0291078732585
occ_Prof-specialty 0.0275814329182
rel_Wife 0.0370934125303


# Naive Bayes

## Data preparation

We apply `BernoulliNB`, which requires binary features. So we transform the raw data as follows:
* Discretize `AGE`, `CAPITAL_GAIN`, `CAPITAL_LOSS`, `HOURS_PER_WEEK`, `FNLWGHT`. For every feature except `HOURS_PER_WEEK`, the value becomes 0 for the first 20-percentile, 1 for the second, and so on.
* For most people `CAPITAL_GAIN` and `CAPITAL_LOSS` are zero. So apply the discretization to values above 0.
* `HOURS_PER_WEEK` has a very narrow distribution. Most people report a 40-hour week. The 20-percentiles attain the same value. We discretize it with only two values: 0 (<=40 hours) and 1 (>40 hours).
* Apply `pandas.get_dummies()` to convert them to binary feature vectors.

Threshold values for numeric variables:

In [113]:
# Cut points at the 20th/40th/60th/80th percentiles of each numeric feature,
# computed over the strictly positive values only.
percentile_levels = [20,40,60,80]
for feat in ["AGE", "FNLWGHT", "CAPITAL_GAIN","CAPITAL_LOSS"]:
    positive_values = df.loc[df[feat] > 0, feat]
    print(feat)
    print(np.percentile(positive_values, percentile_levels))

AGE
[ 26.  33.  41.  50.]
FNLWGHT
[ 106648.  158662.  196338.  259873.]
CAPITAL_GAIN
[  3103.   5013.   7688.  15024.]
CAPITAL_LOSS
[ 1617.  1876.  1902.  2001.]


A utility function that takes a value x and a list of thresholds, and returns the index of the interval where x belongs.

In [105]:
def conv(x,thresholds):
    """Return the index of the interval of `thresholds` that `x` falls into.

    The thresholds are scanned from left to right and the position of the
    first one that is strictly greater than `x` is returned; when there is no
    such threshold the result is `len(thresholds)`. For ascending thresholds
    t0 < t1 < ... this maps x < t0 to 0, t0 <= x < t1 to 1, and so on.
    """
    first_above = (pos for pos, bound in enumerate(thresholds) if x < bound)
    return next(first_above, len(thresholds))

Add new fields to the data frame with this grouping.

In [137]:
# Add a "<feature>_G" column holding the percentile bin (0-4) of each value.
# The cut points are computed over the strictly positive values only.
for feat in ["AGE", "FNLWGHT", "CAPITAL_GAIN","CAPITAL_LOSS"]:
    lst = np.percentile(df[df[feat]>0][feat], [20,40,60,80])
    # Build the Series on df's own index: column assignment aligns on index
    # labels, so a default RangeIndex would produce NaN or misplaced bins if
    # df were ever filtered, sampled or re-indexed before this cell runs.
    df[feat+"_G"] = pd.Series([conv(x, lst) for x in df[feat]], index=df.index)

We split the working hours per week at the median only: values up to and including the median become 0, larger values become 1.

In [143]:
# Single cut point at median + 1: hours up to and including the median map to 0,
# longer weeks map to 1.
hrs_med = [df["HOURS_PER_WEEK"].median()+1]
# Build the Series on df's own index so the assignment cannot misalign rows
# when df does not carry a default RangeIndex.
df["HOURS_PER_WEEK_G"] = pd.Series([conv(x, hrs_med ) for x in df["HOURS_PER_WEEK"]], index=df.index)

In [144]:
# Preview the table with the new *_G bin columns.
df.head(5)

Unnamed: 0,AGE,FNLWGHT,EDUCATION_NUM,SEX,CAPITAL_GAIN,CAPITAL_LOSS,HOURS_PER_WEEK,TARGET,work_?,work_Federal-gov,...,ctry_Thailand,ctry_Trinadad&Tobago,ctry_United-States,ctry_Vietnam,ctry_Yugoslavia,AGE_G,FNLWGHT_G,CAPITAL_GAIN_G,CAPITAL_LOSS_G,HOURS_PER_WEEK_G
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,1,0,0,2,0,0,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,1,0,0,4,0,0,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,1,0,0,2,3,0,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,1,0,0,4,3,0,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,1,4,0,0,0


Now remove the numerical columns

In [147]:
# Keep only the binned versions of the numeric features.
raw_numeric_columns = ["AGE", "FNLWGHT", "CAPITAL_GAIN","CAPITAL_LOSS","HOURS_PER_WEEK"]
df2 = df.drop(raw_numeric_columns, axis="columns")

In [149]:
# Preview the table without the raw numeric columns.
df2.head(5)

Unnamed: 0,EDUCATION_NUM,SEX,TARGET,work_?,work_Federal-gov,work_Local-gov,work_Never-worked,work_Private,work_Self-emp-inc,work_Self-emp-not-inc,...,ctry_Thailand,ctry_Trinadad&Tobago,ctry_United-States,ctry_Vietnam,ctry_Yugoslavia,AGE_G,FNLWGHT_G,CAPITAL_GAIN_G,CAPITAL_LOSS_G,HOURS_PER_WEEK_G
0,13,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,0,0,0,0
1,13,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,4,0,0,0,0
2,9,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,2,3,0,0,0
3,7,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,4,3,0,0,0
4,13,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,4,0,0,0


Now we binarize the categories.

In [150]:
# One-hot encode the multi-valued bin columns and the education level; each
# (column, prefix) pair produces indicator columns named "<prefix>_<value>".
binned_prefixes = [
    ("AGE_G", "age"),
    ("FNLWGHT_G", "fnlwght"),
    ("CAPITAL_GAIN_G", "cgain"),
    ("CAPITAL_LOSS_G", "closs"),
    ("EDUCATION_NUM", "edu"),
]
df2 = pd.get_dummies(
    df2,
    columns=[column for column, _ in binned_prefixes],
    prefix=[prefix for _, prefix in binned_prefixes],
)

In [151]:
# Preview the fully binarized table.
df2.head(5)

Unnamed: 0,SEX,TARGET,work_?,work_Federal-gov,work_Local-gov,work_Never-worked,work_Private,work_Self-emp-inc,work_Self-emp-not-inc,work_State-gov,...,edu_7,edu_8,edu_9,edu_10,edu_11,edu_12,edu_13,edu_14,edu_15,edu_16
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


Now we have a binary data set. We can apply Bernoulli Naive Bayes.

## Bernoulli Naive Bayes

In [168]:
# Feature matrix and label vector of the binarized data set.
X = np.array(df2.drop("TARGET",axis=1))
y = np.array(df2["TARGET"])
# Hold out 25% for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [155]:
from sklearn.naive_bayes import BernoulliNB

In [169]:
# binarize=None: the features are already 0/1, so no thresholding is applied.
clf = BernoulliNB(binarize=None).fit(X_train, y_train)
clf.score(X_test, y_test)

0.79253163002088201

In [163]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[4867 1358]
 [ 360 1556]]
Classification report
             precision    recall  f1-score   support

          0       0.93      0.78      0.85      6225
          1       0.53      0.81      0.64      1916

avg / total       0.84      0.79      0.80      8141



## Bernoulli Naive Bayes with balancing

In [164]:
# Keep every row labelled 1 and draw an equally sized random subset of the rows
# labelled 0. random_state fixes the draw so the balanced set is reproducible.
rich = df2[ df2["TARGET"]==1 ]
poor = df2[ df2["TARGET"]==0 ].sample(len(rich), random_state=42)
df2_bal = pd.concat((rich,poor))

In [165]:
# Feature matrix and label vector of the balanced, binarized data set.
X = np.array(df2_bal.drop("TARGET",axis=1))
y = np.array(df2_bal["TARGET"])
# Hold out 25% for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [166]:
# binarize=None: the features are already 0/1, so no thresholding is applied.
clf = BernoulliNB(binarize=None).fit(X_train, y_train)
clf.score(X_test, y_test)

0.79290997194593216

In [167]:
# Predict once and reuse the labels for both summaries.
y_pred = clf.predict(X_test)
print("Confusion matrix")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Classification report")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion matrix
[[1422  530]
 [ 282 1687]]
Classification report
             precision    recall  f1-score   support

          0       0.83      0.73      0.78      1952
          1       0.76      0.86      0.81      1969

avg / total       0.80      0.79      0.79      3921



Again, balancing gives a better estimate for the richer group, but the overall accuracy of Naive Bayes is lower than that of the tree-based methods.