In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.linear_model import (LogisticRegression, SGDClassifier, 
                                  SGDRegressor, LinearRegression)
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.testing import all_estimators
import time

import warnings
warnings.filterwarnings('ignore')

## Functions

In [2]:
def printScore(y1, y2, n):
    print("Score: {:.3f}".format(sum(y1 == y2)/n))
    
def predict_and_save(clf, X_test, title):
    categories = clf.predict(X_test)
    ids = range(16281)
    
    pd.DataFrame(data={"Id": ids, "Category": categories}). \
        to_csv("submission_{}.csv".format(title), index=False)
    
    return categories

def get_best_classifiers(X_train, y_train, X_valid, y_valid):
    estimators = all_estimators()
    best_clf = {}

    for name, est in estimators:

        start_time = time.time()
        try:
            if hasattr(est, 'predict'):
                print(name)
                clf = est().fit(X_train, y_train)
                y_hat = clf.predict(X_valid)
                score = printScore(y_valid, y_hat, y_hat.shape[0])
                if score >= 0.7:
                    best_clf[name] = est
        except Exception as e:
            print(e)

        print('Time taken: {}\n'.format(time.time() - start_time))
        
    
    return best_clf

## Load Data

In [3]:
attributes = {
    "age": None,
    "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", 
                  "Federal-gov", "Local-gov", "State-gov", 
                  "Without-pay", "Never-worked"],
    "fnlwgt": None,
    "education": ["Bachelors", "Some-college", "11th", "HS-grad",
                  "Prof-school", "Assoc-acdm", "Assoc-voc", "9th",
                  "7th-8th", "12th", "Masters", "1st-4th", "10th",
                  "Doctorate", "5th-6th", "Preschool"],
    "education-num": None,
    "marital-status": ["Married-civ-spouse", "Divorced", "Never-married",
                       "Separated", "Widowed", "Married-spouse-absent",
                       "Married-AF-spouse"],
    "occupation": ["Tech-support", "Craft-repair", "Other-service",
                   "Sales", "Exec-managerial", "Prof-specialty", 
                   "Handlers-cleaners", "Machine-op-inspct", 
                   "Adm-clerical", "Farming-fishing", "Transport-moving",
                   "Priv-house-serv", "Protective-serv", "Armed-Forces"],
    "relationship": ["Wife", "Own-child", "Husband", "Not-in-family",
                     "Other-relative", "Unmarried"],
    "race": ["White", "Asian-Pac-Islander", 'Amer-Indian-Eskimo',
             "Other", "Black"],
    "sex": ["Female", "Male"],
    "capital-gain": None,
    "capital-loss": None,
    "hours-per-week": None,
    "native-country": ["United-States", "Cambodia", "England", "Puerto-Rico", 
                       "Canada", "Germany", "Outlying-US(Guam-USVI-etc)",
                       "India", "Japan", "Greece", "South", "China",
                       "Cuba", "Iran", "Honduras", "Philippines", "Italy",
                       "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal",
                       "Ireland", "France", "Dominican-Republic", "Laos",
                       "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary",
                       "Guatemala", "Nicaragua", "Scotland", "Thailand",
                       "Yugoslavia", "El-Salvador", "Trinadad&Tobago",
                       "Peru", "Hong", "Holand-Netherlands"],
    "income": None #Binary (0 means <=50K, 1 means >50K)
}
cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
       "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
       "hours-per-week", "native-country", "income"]
indices = [i for i in range(15)]
columns = {i: j for i, j in zip(indices, cols)}


In [4]:
train_data = pd.read_csv("data/train.data", header=None)
train_data = train_data.rename(columns=columns)
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
train_data = train_data.replace('?', np.nan)



In [5]:
print(train_data.shape)
train_data.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [6]:
test_data = pd.read_csv("data/test.data", header=None)
test_data = test_data.rename(columns=columns)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
test_data = test_data.replace('?', np.nan)



In [7]:
print(test_data.shape)
test_data.head()

(16281, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


#### Map strings to ints

In [8]:
string_cols = [i for i in train_data.dtypes.index if train_data.dtypes[i] != 'int64']
map_dict = {}
for i in string_cols:
    map_dict[i] = {}
    values = attributes[i]
    index = 0
    for val in values:
        map_dict[i][val] = index
        index += 1

In [9]:
train_data = train_data.replace(map_dict)
train_data.fillna(train_data.mean(), inplace=True)

test_data = test_data.replace(map_dict)
test_data.fillna(test_data.mean(), inplace=True)


In [10]:
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,0.0,226802,2,7,2,7.0,1,4,1,0,0,40,0.0
1,38,0.0,89814,3,9,0,9.0,2,0,1,0,0,50,0.0
2,28,4.0,336951,5,12,0,12.0,2,0,1,0,0,40,0.0
3,44,0.0,160323,1,10,0,7.0,2,4,1,7688,0,40,0.0
4,18,0.753688,103497,1,10,2,4.721123,1,0,0,0,0,30,0.0


#### Divide data

In [11]:
cols.remove('income')

In [12]:
X = train_data[cols]
y = train_data['income']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

X_test = test_data[:]


## Logistic regression

In [13]:
log_reg = LogisticRegression().fit(X_train, y_train)

In [14]:
y_hat = log_reg.predict(X_valid)
printScore(y_valid, y_hat, y_hat.shape[0])


Score: 0.794


In [15]:
y_log_reg = predict_and_save(log_reg, X_test, "log_reg")

## Linear regression

In [16]:
lin_reg = LinearRegression().fit(X_train, y_train)

In [17]:
y_hat = lin_reg.predict(X_valid)
printScore(y_valid, y_hat, y_hat.shape[0])

Score: 0.000


In [18]:
y_lin_reg = predict_and_save(lin_reg, X_test, "lin_reg")

In [19]:
y_lin_reg

array([-0.03202923,  0.28807492,  0.36970819, ...,  0.49882043,
        0.44684741,  0.51048605])

## SGD

In [20]:
sgd = SGDClassifier().fit(X_train, y_train)

In [21]:
y_hat = sgd.predict(X_valid)
printScore(y_valid, y_hat, y_hat.shape[0])

Score: 0.789


In [22]:
y_sgd = predict_and_save(sgd, X_test, "sgd")

## Random Forest

In [23]:
forest = RandomForestClassifier().fit(X_train, y_train)

In [24]:
y_hat = forest.predict(X_valid)
printScore(y_valid, y_hat, y_hat.shape[0])

Score: 0.846


In [25]:
forest = predict_and_save(forest, X_test, 'rand_forest')

## Try all estimators

In [None]:
best_clf = get_best_classifiers(X_train, y_train, X_valid, y_valid)

ARDRegression


In [None]:
best_clf

In [None]:
for clf_name in best_clf:
    start_time = time.time()
    print(clf_name)
    clf = best_clf[clf_name]().fit(X_train, y_train)
    y_hat = clf.predict(X_valid)
    printScore(y_valid, y_hat, y_hat.shape[0])
    
#     y_test = predict_and_save(clf, X_test, train_indices, clf_name)
    print("Time taken:", time.time()-start_time)
    print()