In [1]:
from lib.data.wrangler import PreProcess, FeatureSelector
import pandas as pd

In [2]:
# turns marital-status into boolean with-spouse and w/o spouse
def reducemarriage(coldict, marriage):
    spouses = {'Married-AF-spouse','Married-civ-spouse'}
    withspouse = {v for k,v in coldict['marital-status'].iteritems() if k.strip() in spouses}
    return marriage.isin(withspouse).astype("int")

In [3]:
# turns race into boolean white-nonwhite
def reducerace(coldict, race):
    white = None
    for k,v in coldict.get('race').iteritems():
        if k.strip() == 'White':
            white = v
    return (race==white).astype("int")

In [4]:
# turns native country into boolean developed vs under-developed
def reducenativecountry(coldict, native):
    rdev = set()
    devlist = ["Canada", "England", "France", "Germany", "Italy",
               "Ireland", "Japan", "Portugal", "Taiwan", "India",
               "Holand-Netherlands", "China", "United-States"]
    rdev = {v for k,v in coldict.get('native-country').iteritems() if k.strip() in devlist}
    return native.isin(rdev).astype("int")

In [5]:
# reduces edu levels to 4 categories Nohighschool, HighSchool, Associate/Vocational, Graduate
def reducedu(coldict, education):
    noHS  = {"10th", "11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "Preschool"}
    HS = {"HS-grad", "Some-college"}
    AS = {"Prof-school", "Assoc-acdm", "Assoc-voc"}
    nhs = set()
    hs = set()
    ass = set()
    for k,v in coldict['education'].iteritems():
        if k.strip() in noHS:
            nhs.add(v)
        elif k.strip() in HS:
            hs.add(v)
        elif k.strip() in AS:
            ass.add(v)
    edulevel = [1 if x in nhs else 2 
                      if x in hs else 3
                      if x in ass else 4
                      for x in education.values]
    return edulevel

In [6]:
# Reduces Workclass to 3 categories gov-employee, self-employed, private-employee
def reduceworkclass(coldict,workclass):
    govemp = {"Federal-gov", "Local-gov", "State-gov"}
    selfemp = {"Self-emp-inc", "Self-emp-not-inc"}
    govnum = set()
    selfnum = set()
    privnum = set()

    for k,v in coldict["workclass"].iteritems():
        if k.strip() in govemp:
            govnum.add(v)
        elif k.strip in selfemp:
            selfnum.add(v)
        else:
            privnum.add(v)
    worktype = [1 if x in govnum else 2
                if x in selfnum else 3
                for x in workclass.values]
    return worktype

In [7]:
# reduces Occupation to boolean highly paid and lower-pay'
def reduceoccup(coldict,occupation):
    whitecols = {"Exec-managerial", "Prof-specialty", "Tech-support", "Sales"}
    highsals = {v for k,v in coldict["occupation"].iteritems() if k.strip() in whitecols}
    return occupation.isin(highsals).astype("int")

In [8]:
# transforms the pre-preocessed data to get derived features
def transformer(prepdata,coldict):
    dervcols = ["age","hours-per-week","sex","income"]
    dervdata = prepdata[dervcols]
    dervdata["marital-status"] = reducemarriage(coldict, prepdata["marital-status"])
    dervdata["netcapita"] = prepdata["capital-gain"] - prepdata["capital-loss"]
    dervdata["race"] = reducerace(coldict, prepdata["race"])
    dervdata["native-country"] = reducenativecountry(coldict, prepdata["native-country"])
    dervdata["education-num"] = reducedu(coldict, prepdata["education-num"])
    dervdata["workclass"] = reduceworkclass(coldict,prepdata["workclass"])
    dervdata["occupation"] = reduceoccup(coldict, prepdata["occupation"])
    dervdata = dervdata[["age","netcapita","education-num",
                         "hours-per-week","marital-status","native-country",
                         "occupation","race","sex","workclass","income"]]
    return dervdata

In [9]:
#trains an baseline classsifier and returns the score (Accuracy)
def trainclf(clf, *args):
    Xtrain,ytrain,Xtest,ytest = args
    clf.fit(Xtrain,ytrain)
    return clf.score(Xtest,ytest)

In [10]:
# Gets train and CV data for the classifier
def getdata(selector):
    Xtrain, ytrain = prepselector.features, prepselector.labels
    Xtest, ytest = prepselector.testfeatures, prepselector.testlabels
    return Xtrain, ytrain, Xtest, ytest

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# gets a performance dictionary for the types of data
def comparedata(prepselector, dervselector, dumdervselector):
    clfs = [DecisionTreeClassifier(), RandomForestClassifier(),
            ExtraTreesClassifier(), LogisticRegression(),
            GaussianNB(), SVC()]
    prepdata = getdata(prepselector)
    dervdata = getdata(dervselector)
    dumdervdata = getdata(dumdervselector)
    perf_dict = {}
    for clf in clfs:
        key = clf.__class__.__name__
        perf_dict[key] = {"prep":trainclf(clf,*prepdata),
                          "derv":trainclf(clf,*dervdata),
                          "dumderv": trainclf(clf,*dumdervdata) }
    return perf_dict

In [12]:
# Creating a pre-process object.
data = PreProcess()
# this is the pre-processed data with encoded categories instead of strings
prepdata = data.__getprep__()
# a dictionay mapping the strings in raw to the encoded categories in prepdata
coldict = data.col_map
# this is the raw data
rawdata = data.__getraw__()
# add education mapping to coldict since there is no mapping for that column
edudict = pd.Series(rawdata["education-num"].values,index=rawdata["education"]).to_dict()
coldict['education'] = edudict
# This is the derived data after doing some compression on the data
dervdata = transformer(prepdata,coldict)

# A temporary df containing only to categorical features for one-hot-encoding
catFrame = dervdata[["education-num","marital-status","native-country",
                     "occupation","race","sex","workclass"]]
frames = [pd.get_dummies(catFrame[col], prefix=col) for col in catFrame.columns]

# The one-hot-encoded features
dumdervdata = pd.concat(frames, axis=1)
dumdervdata = dumdervdata.merge(dervdata[["age","netcapita","hours-per-week","income"]], left_index=True, right_index=True)

In [13]:
# Defining the featureselector objects for the 3 types of features
prepselector = FeatureSelector(prepdata)
dervselector = FeatureSelector(dervdata)
dumdervselector = FeatureSelector(dumdervdata)

In [14]:
# final comparision - prep - pre-processed data, derv - Compressed data, dumderv - compressed and one-hot encoded data
comparedata(prepselector, dervselector, dumdervselector)

{'DecisionTreeClassifier': {'derv': 0.8150271266250384,
  'dumderv': 0.81205855256423376,
  'prep': 0.8112396355819429},
 'ExtraTreesClassifier': {'derv': 0.83672842665574776,
  'dumderv': 0.83539768655952507,
  'prep': 0.83335039410379774},
 'GaussianNB': {'derv': 0.79752277612856992,
  'dumderv': 0.79752277612856992,
  'prep': 0.79752277612856992},
 'LogisticRegression': {'derv': 0.81963353465042477,
  'dumderv': 0.81963353465042477,
  'prep': 0.81963353465042477},
 'RandomForestClassifier': {'derv': 0.84389395025079328,
  'dumderv': 0.84399631487357973,
  'prep': 0.84266557477735693},
 'SVC': {'derv': 0.81052308322243838,
  'dumderv': 0.81052308322243838,
  'prep': 0.81052308322243838}}