In [None]:
import time
import warnings

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import (LogisticRegression, SGDClassifier, 
                                  SGDRegressor, LinearRegression)
from sklearn.model_selection import train_test_split

try:
    # Public location of all_estimators in current scikit-learn releases.
    from sklearn.utils import all_estimators
except ImportError:
    # Fallback for old releases that only expose it from the testing module.
    from sklearn.utils.testing import all_estimators

warnings.filterwarnings('ignore')

## Functions

In [None]:
def printScore(y1, y2, n):
    """Print and return the fraction of positions where ``y1`` equals ``y2``.

    Args:
        y1: Array-like of reference labels.
        y2: Array-like of predicted labels, compared element-wise with ``y1``.
        n: Denominator of the ratio (callers pass the number of samples).

    Returns:
        ``sum(y1 == y2) / n``. The value is returned (not only printed) so
        that callers can compare it against a threshold.
    """
    score = sum(y1 == y2) / n
    print("Score: {:.3f}".format(score))
    return score
    
def predict_and_save(clf, X_test, title):
    """Predict labels for ``X_test`` and write them to a submission CSV.

    The file is named ``submission_<title>.csv`` and has two columns:
    ``Id`` (0-based row number) and ``Category`` (the prediction).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``clf.predict``.
        title: Suffix used in the output file name.

    Returns:
        The predictions returned by ``clf.predict(X_test)``.
    """
    categories = clf.predict(X_test)
    # One Id per prediction; a fixed row count only works for one specific
    # test set and raises a length-mismatch error for any other.
    ids = range(len(categories))

    submission = pd.DataFrame(data={"Id": ids, "Category": categories})
    submission.to_csv("submission_{}.csv".format(title), index=False)

    return categories

def get_best_classifiers(X_train, y_train, X_valid, y_valid):
    """Fit every scikit-learn estimator with default arguments and keep the good ones.

    Each estimator class returned by ``all_estimators()`` that has a
    ``predict`` attribute is instantiated with no arguments, fitted on the
    training split and scored on the validation split as the fraction of
    exactly matching predictions.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        dict mapping estimator name to estimator class (not an instance) for
        every estimator whose validation score is at least 0.7.
    """
    best_clf = {}

    for name, est in all_estimators():
        start_time = time.time()
        try:
            if hasattr(est, 'predict'):
                print(name)
                clf = est().fit(X_train, y_train)
                y_hat = clf.predict(X_valid)
                # Compute the score here instead of relying on the return
                # value of a printing helper; a None score makes the
                # threshold comparison raise and nothing is ever selected.
                score = sum(y_valid == y_hat) / y_hat.shape[0]
                print("Score: {:.3f}".format(score))
                if score >= 0.7:
                    best_clf[name] = est
        except Exception as e:
            # Deliberate best-effort sweep: many estimators cannot be built
            # without arguments or do not fit this data; report and move on.
            print(e)

        print('Time taken: {}\n'.format(time.time() - start_time))

    return best_clf

## Load Data

In [None]:
# Schema of the data set: one entry per column, in file order.
# A value of None marks a numeric column; a list gives the allowed category
# labels of a text column (list position is later used as the integer code).
attributes = {
    "age": None,
    "workclass": [
        "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
        "Local-gov", "State-gov", "Without-pay", "Never-worked",
    ],
    "fnlwgt": None,
    "education": [
        "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
        "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
        "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool",
    ],
    "education-num": None,
    "marital-status": [
        "Married-civ-spouse", "Divorced", "Never-married", "Separated",
        "Widowed", "Married-spouse-absent", "Married-AF-spouse",
    ],
    "occupation": [
        "Tech-support", "Craft-repair", "Other-service", "Sales",
        "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
        "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
        "Transport-moving", "Priv-house-serv", "Protective-serv",
        "Armed-Forces",
    ],
    "relationship": [
        "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
        "Unmarried",
    ],
    "race": [
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
    ],
    "sex": ["Female", "Male"],
    "capital-gain": None,
    "capital-loss": None,
    "hours-per-week": None,
    "native-country": [
        "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
        "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
        "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
        "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
        "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
        "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland",
        "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru",
        "Hong", "Holand-Netherlands",
    ],
    # Binary target: 0 means <=50K, 1 means >50K.
    "income": None,
}
# Column names in the order they appear in the header-less data files.
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
# Positional labels 0..14, as produced by read_csv(header=None).
indices = list(range(15))
# Rename map: positional label -> column name.
columns = dict(zip(indices, cols))


In [None]:
# Load the header-less training file, name the columns, trim whitespace
# around text values and turn the '?' placeholder into NaN.
train_data = (
    pd.read_csv("data/train.data", header=None)
    .rename(columns=columns)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .replace('?', np.nan)
)



In [None]:
# Sanity check: print (rows, columns), then display the first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Load the header-less test file with the same cleaning steps as the
# training file: name columns, trim text values, '?' -> NaN.
test_data = (
    pd.read_csv("data/test.data", header=None)
    .rename(columns=columns)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .replace('?', np.nan)
)



In [None]:
# Sanity check: print (rows, columns), then display the first rows.
print(test_data.shape)
test_data.head()

#### Map strings to ints

In [None]:
# Columns whose dtype is not int64 are treated as text (categorical) columns.
string_cols = [col for col in train_data.dtypes.index
               if train_data.dtypes[col] != 'int64']

# {column: {category label: integer code}}; codes follow the label order in
# `attributes`. Columns without a category list (attributes value is None,
# e.g. a numeric column that was read as float) are skipped rather than
# crashing on iteration over None.
map_dict = {
    col: {label: code for code, label in enumerate(attributes[col])}
    for col in string_cols
    if attributes.get(col) is not None
}

In [None]:
# Encode category labels as integer codes, then fill every remaining NaN
# with the mean of its column (each frame uses its own column means).
train_data = train_data.replace(map_dict)
train_data = train_data.fillna(train_data.mean())

test_data = test_data.replace(map_dict)
test_data = test_data.fillna(test_data.mean())


In [None]:
# Display the first rows of the test frame after encoding and imputation.
test_data.head()

#### Divide data

In [None]:
# Keep only the feature columns. Rebuilding the list (instead of calling
# list.remove) makes this cell safe to run more than once.
cols = [name for name in cols if name != 'income']

In [None]:
# Features / target from the labelled data.
X = train_data[cols]
y = train_data['income']
# Fixed seed so the 75/25 split (and every score below) is reproducible
# across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Select exactly the feature columns, in the same order as the training
# matrix, so the models never see an extra or reordered column.
X_test = test_data[cols]


## Logistic regression

In [None]:
# Logistic regression with default settings, fitted on the training split.
log_reg = LogisticRegression()
log_reg = log_reg.fit(X_train, y_train)

In [None]:
# Validation accuracy of the logistic regression model.
y_hat = log_reg.predict(X_valid)
printScore(y_valid, y_hat, len(y_hat))


In [None]:
# Predict the test set and write submission_log_reg.csv.
y_log_reg = predict_and_save(clf=log_reg, X_test=X_test, title="log_reg")

## Linear regression

In [None]:
# Ordinary least squares on the 0/1 target, fitted on the training split.
lin_reg = LinearRegression()
lin_reg = lin_reg.fit(X_train, y_train)

In [None]:
# LinearRegression outputs continuous values, which almost never equal the
# 0/1 labels exactly. Threshold at 0.5 to obtain class labels so the
# equality-based score is a real accuracy.
y_hat = (lin_reg.predict(X_valid) >= 0.5).astype(int)
printScore(y_valid, y_hat, y_hat.shape[0])

In [None]:
# The submission's Category column must hold 0/1 labels, but LinearRegression
# predicts continuous values: threshold at 0.5 before writing the file.
y_lin_reg = (lin_reg.predict(X_test) >= 0.5).astype(int)
pd.DataFrame(data={"Id": range(len(y_lin_reg)), "Category": y_lin_reg}) \
    .to_csv("submission_{}.csv".format("lin_reg"), index=False)

In [None]:
# Display the linear-regression predictions for the test set.
y_lin_reg

## SGD

In [None]:
# SGD shuffles the data randomly; fix the seed so the fitted model and its
# score are reproducible across runs.
sgd = SGDClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Validation accuracy of the SGD classifier.
y_hat = sgd.predict(X_valid)
printScore(y_valid, y_hat, len(y_hat))

In [None]:
# Predict the test set and write submission_sgd.csv.
y_sgd = predict_and_save(clf=sgd, X_test=X_test, title="sgd")

## Random Forest

In [None]:
# Random forests draw bootstrap samples and feature subsets at random; fix
# the seed so the fitted model and its score are reproducible across runs.
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Validation accuracy of the random forest.
y_hat = forest.predict(X_valid)
printScore(y_valid, y_hat, len(y_hat))

In [None]:
# Store the predictions under their own name (matching y_log_reg / y_sgd).
# Rebinding `forest` to the predictions array would discard the fitted
# classifier and break any re-run of cells that call forest.predict.
y_forest = predict_and_save(forest, X_test, 'rand_forest')

## Optimal parameters

In [None]:
# Candidate hyper-parameter values per estimator, keyed by scikit-learn
# class name. Each inner dict maps a constructor argument name to the list
# of values to try.
# NOTE(review): presumably consumed by a grid/randomized search further
# down; the consumer is not visible in this section, so confirm.
# NOTE(review): some combinations and values may be rejected by the
# installed scikit-learn version (e.g. LinearSVC penalty='l1' with
# loss='hinge'); verify before running an exhaustive search.
params = {
    'AdaBoostClassifier': {
        'n_estimators': [30, 40, 50, 60, 70, 80, 90],
        'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 0.8, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
    },
    
    'BaggingClassifier': {
        'n_estimators': [5, 10, 15, 20, 30, 40, 50],
        'max_samples': [0.2, 0.4, 0.6, 0.8, 1.0],
        'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
        'bootstrap': [True, False],
        'bootstrap_features': [True, False]
    },
    
    'BayesianGaussianMixture': {
        'n_components': [1, 2, 4, 8, 10, 15, 20],
        'covariance_type': ['full', 'tied', 'diag', 'spherical'],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'reg_covar': [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
        'init_params': ['kmeans', 'random']
    },
    
    'BernoulliNB': {
        'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 1.0],
        'fit_prior': [True, False]
    },
    
    'CalibratedClassifierCV': {
        'method': ['sigmoid', 'isotonic'],
        'cv': [None, 2, 3, 4, 5, 6]
    },
    
    'ExtraTreeRegressor': {
        'splitter': ['random', 'best'],
    },
    
    'ExtraTreesClassifier': {
        'n_estimators': [5, 10, 15, 20, 30, 40, 50],
        'criterion': ['gini', 'entropy']
    },
    
    'GaussianMixture': {
        'n_components': [1, 2, 4, 8, 10, 15, 20],
        'covariance_type': ['full', 'tied', 'diag', 'spherical'],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'reg_covar': [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
        'init_params': ['kmeans', 'random']
    },
    
    'GaussianProcessRegressor': {
        'alpha': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 5e-3],
        'normalize_y': [True, False]
    },
    
    'GradientBoostingClassifier': {
        'loss': ['deviance', 'exponential'],
        'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 0.8, 1.0],
        'n_estimators': [50, 70, 90, 100, 120, 140, 160],
        'criterion': ['friedman_mse', 'mse', 'mae']
    },
    
    'KNeighborsClassifier': {
        'n_neighbors': [3, 4, 5, 7, 9, 11, 14, 16, 20],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [10, 15, 20, 30, 40, 50, 60, 70],
        'p' : [1, 2]
    },
    
    'LabelPropagation': {
        'kernel': ['knn', 'rbf'],
        'gamma': [0.01, 0.05, 0.1, 0.5, 1.0],
        'n_neighbors': [3, 4, 5, 7, 10, 13, 15, 18]
    },
    
    'LabelSpreading': {
        'kernel': ['knn', 'rbf'],
        'gamma': [0.01, 0.05, 0.1, 0.5, 1.0],
        'n_neighbors': [3, 4, 5, 7, 10, 13, 15, 18],
        'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 0.6, 0.8]
    },
    
    'LinearSVC': {
        'penalty': ['l1', 'l2'],
        'loss': ['hinge', 'squared_hinge'],
        'dual': [True, False],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'C': [0.1, 0.5, 1.0, 1.3, 1.5, 2.0],
        'multi_class': ['ovr', 'crammer_singer'],
        'fit_intercept': [True, False]
    },
    
    'LogisticRegressionCV': {
        'Cs': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 5000, 8000],
        'fit_intercept': [True, False],
        'cv': [2, 3, 4, 5, 6, 7],
        'dual': [True, False],
        'penalty': ['l1', 'l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]
    },
    
    'MLPClassifier': {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': [0.000001, 0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 0.5],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'learning_rate_init': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 0.8, 1.0]
    },
    
    'MultinomialNB': {
        'alpha': [0.01, 0.1, 0.5, 1.0, 1.5, 1.8, 2.1],
        'fit_prior': [True, False]
    },
    
    'PassiveAggressiveClassifier': {
        'C': [0.5, 1.0, 1.4, 1.8, 2.0, 2.5],
        'fit_intercept': [True, False],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'early_stopping': [True, False]
    },
    
    'Perceptron': {
        'penalty': [None, 'l1', 'l2', 'elasticnet'],
        'alpha': [0.000001, 0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 0.5],
        'fit_intercept': [True, False],
        'tol': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]
    },
    
    'RandomForestClassifier': {
        'n_estimators': [3, 5, 10, 15, 20, 25, 30],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2', None],
        'min_impurity_split': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
        'bootstrap': [True, False],
        'oob_score': [True, False]
    },
    

    
}