In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.model_selection import GridSearchCV
import time

import warnings
warnings.filterwarnings('ignore')

## Functions

In [2]:
def printScore(y1, y2, n):
    """Print the fraction of matching labels and return it.

    Parameters
    ----------
    y1, y2 : array-like
        Label sequences compared element by element (by position).
    n : int
        Denominator of the score, i.e. the number of samples.

    Returns
    -------
    float
        ``(number of positions where y1 == y2) / n``. The value is returned
        (not only printed) so callers can threshold on it.
    """
    # Converting to arrays makes the comparison positional and lets plain
    # Python lists work too (list == list would yield a single bool).
    matches = np.asarray(y1) == np.asarray(y2)
    score = float(matches.sum() / n)
    print("Score: {:.3f}".format(score))
    return score
    
def predict_and_save(clf, X_test, title):
    """Predict with ``clf`` on ``X_test`` and write the result as a CSV file.

    Parameters
    ----------
    clf : object
        Fitted model exposing ``predict``.
    X_test : array-like
        Feature rows to predict on.
    title : str
        Inserted into the output file name ``submission_<title>.csv``.

    Returns
    -------
    The predictions returned by ``clf.predict(X_test)``.
    """
    categories = clf.predict(X_test)
    # One sequential Id per prediction; derived from the predictions rather
    # than hard-coded so any test-set size works.
    ids = range(len(categories))

    submission = pd.DataFrame(data={"Id": ids, "Category": categories})
    submission.to_csv("submission_{}.csv".format(title), index=False)

    return categories

def get_best_classifiers(X_train, y_train, X_valid, y_valid):
    """Fit every scikit-learn estimator that has ``predict`` and keep the good ones.

    Each candidate is instantiated with its default arguments, fitted on
    ``(X_train, y_train)`` and scored on ``(X_valid, y_valid)`` as the fraction
    of exactly matching predictions. The estimator name, its score and the
    elapsed time are printed as the loop runs.

    Returns
    -------
    dict
        Maps estimator name to estimator *class* (not the fitted instance) for
        every candidate whose validation score is at least 0.7.

    Notes
    -----
    Any exception raised while constructing, fitting or predicting is printed
    and the candidate is skipped, so one failing estimator does not abort the
    whole sweep.
    """
    # Imported here because the notebook's import cell does not bring
    # all_estimators into scope.
    from sklearn.utils import all_estimators

    best_clf = {}

    for name, est in all_estimators():
        start_time = time.time()
        try:
            if hasattr(est, 'predict'):
                print(name)
                clf = est().fit(X_train, y_train)
                y_hat = clf.predict(X_valid)
                # Compute the score here: the comparison below needs a number,
                # not the result of a print-only helper.
                score = float(np.mean(np.asarray(y_valid) == np.asarray(y_hat)))
                print("Score: {:.3f}".format(score))
                if score >= 0.7:
                    best_clf[name] = est
        except Exception as e:
            print(e)

        print('Time taken: {}\n'.format(time.time() - start_time))

    return best_clf

## Load Data

In [3]:
# Schema of the raw data: each column name maps to its list of category
# labels, or to None when the column has no category list.
attributes = {
    "age": None,
    "workclass": [
        "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
        "Local-gov", "State-gov", "Without-pay", "Never-worked",
    ],
    "fnlwgt": None,
    "education": [
        "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
        "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
        "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool",
    ],
    "education-num": None,
    "marital-status": [
        "Married-civ-spouse", "Divorced", "Never-married", "Separated",
        "Widowed", "Married-spouse-absent", "Married-AF-spouse",
    ],
    "occupation": [
        "Tech-support", "Craft-repair", "Other-service", "Sales",
        "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
        "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
        "Transport-moving", "Priv-house-serv", "Protective-serv",
        "Armed-Forces",
    ],
    "relationship": [
        "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
        "Unmarried",
    ],
    "race": [
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
    ],
    "sex": ["Female", "Male"],
    "capital-gain": None,
    "capital-loss": None,
    "hours-per-week": None,
    "native-country": [
        "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
        "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
        "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
        "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
        "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
        "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland",
        "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru",
        "Hong", "Holand-Netherlands",
    ],
    "income": None,  # Binary (0 means <=50K, 1 means >50K)
}

# Column names in positional order.
cols = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]
indices = list(range(15))
# Positional index -> column name, used to rename the header-less frames.
columns = dict(zip(indices, cols))


In [4]:
# Load the header-less training file, name its columns, trim whitespace in
# text columns, turn '?' into NaN, and one-hot encode via get_dummies.
train_data = (
    pd.read_csv("data/train.data", header=None)
    .rename(columns=columns)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .replace('?', np.nan)
    .pipe(pd.get_dummies)
)


In [5]:
# Sanity check of the encoded training frame: print its shape, then show
# the first rows as the cell output.
print(train_data.shape)
train_data.head()

(32561, 106)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Same preparation as the training frame, applied to the test file:
# name columns, trim text, '?' -> NaN, one-hot encode via get_dummies.
test_data = (
    pd.read_csv("data/test.data", header=None)
    .rename(columns=columns)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .replace('?', np.nan)
    .pipe(pd.get_dummies)
)



In [7]:
# Sanity check of the encoded test frame: print its shape, then show the
# first rows as the cell output.
print(test_data.shape)
test_data.head()

(16281, 104)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Divide data

In [8]:
# NOTE: rebinds `cols` (previously the list of raw column names) to the
# encoded test-frame columns; these are used below to select features.
cols = test_data.columns

In [9]:
# Features are the training columns that also exist in the test frame
# (`cols` holds the test-frame columns); the target is the 'income' column.
X = train_data[cols]
y = train_data['income']
# Hold out 25% for validation. The seed is fixed so the split, and every
# score computed from it, is the same on each run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

X_test = test_data[:]


## KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Baseline: k-nearest-neighbours with its default constructor arguments,
# fitted on the training split.
neigh = KNeighborsClassifier()
clf = neigh.fit(X_train, y_train)

In [19]:
# Score the baseline KNN on the held-out validation split.
y_hat = clf.predict(X_valid)
printScore(y_hat, y_valid, y_valid.shape[0])

Score: 0.775


In [12]:
# Hyper-parameter grid for the KNN search.
# NOTE(review): `algorithm` and `leaf_size` are documented as affecting
# speed/memory rather than predictions; consider dropping them to shrink
# the grid. Confirm before changing.
params_knn = dict(
    n_neighbors=[5, 7, 10, 13, 15, 17, 20],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=list(range(20, 51, 5)),
    p=[1, 2],
)

In [21]:
# Grid search over `params_knn`; no cv/scoring/n_jobs arguments are passed,
# so GridSearchCV's defaults apply.
grid_knn = GridSearchCV(KNeighborsClassifier(), params_knn)

In [22]:
# Run the KNN search on the training split. This rebinds `clf`, which
# previously held the baseline KNN model.
clf = grid_knn.fit(X_train, y_train)

In [23]:
# Score the tuned KNN search result on the held-out validation split.
y_hat = clf.predict(X_valid)
printScore(y_hat, y_valid, y_valid.shape[0])

Score: 0.799


## SGD

In [33]:
from sklearn.linear_model import SGDClassifier

In [34]:
# Baseline: SGDClassifier with its default constructor arguments, fitted
# on the training split. Rebinds `clf` (previously the KNN search).
sgd = SGDClassifier()
clf = sgd.fit(X_train, y_train)

In [35]:
# Score the baseline SGD model on the held-out validation split.
y_hat = clf.predict(X_valid)
printScore(y_hat, y_valid, y_valid.shape[0])

Score: 0.781


In [36]:
# Hyper-parameter grid for the SGD search: loss function, regularisation
# penalty and regularisation strength (alpha).
params_sgd = dict(
    loss=[
        'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
        'squared_loss', 'huber', 'epsilon_insensitive',
        'squared_epsilon_insensitive',
    ],
    penalty=['none', 'l2', 'l1', 'elasticnet'],
    alpha=[1e-6, 1e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2],
)

In [37]:
# Grid search over `params_sgd`; no cv/scoring/n_jobs arguments are passed,
# so GridSearchCV's defaults apply.
grid_sgd = GridSearchCV(SGDClassifier(), params_sgd)

In [None]:
# Run the SGD search on the training split. This rebinds `clf`, which
# previously held the baseline SGD model.
clf = grid_sgd.fit(X_train, y_train)

In [None]:
# Score the tuned SGD search result on the held-out validation split.
y_hat = clf.predict(X_valid)
printScore(y_hat, y_valid, y_valid.shape[0])

In [None]:
# Predict on the test frame with the current `clf` and write the result to
# submission_sgd.csv.
predict_and_save(clf, X_test, "sgd")