# Adults - Linear Classification

# Exercise
Load and preprocess the adult data as before,
including dummy encoding and scaling.
Learn a logistic regression model and visualize the coefficients.
Then grid-search the regularization parameter C.
Compare the coefficients of the best model with the coefficients of a model with more regularization.

## Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# make_blobs is imported from the public sklearn.datasets namespace; the
# private sklearn.datasets.samples_generator module no longer exists in
# recent scikit-learn releases and importing from it raises ImportError.
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Visualization Style
%matplotlib inline
#plt.style.use('seaborn-white')
plt.style.use('fivethirtyeight')
plt.tight_layout()

## Import Dataset

In [None]:
# Read the adult dataset; the first CSV column is used as the row index.
data = pd.read_csv(filepath_or_buffer="../datasets/adult.csv", index_col=0)
# Show the first five rows as a sanity check.
data.head(n=5)

In [None]:
# Separate the target column ("income") from the remaining feature columns.
income = data["income"]
data_features = data.drop(columns=["income"])

In [None]:
# Preview the first rows of the feature table (the "income" column is removed).
data_features.head()

## Data Encoding: using dummies

In [None]:
# Dummy-encode the feature table with pandas.
data_encoded = pd.get_dummies(data=data_features)
# Show the first five rows of the encoded table.
data_encoded.head(n=5)

## Data Splitting

In [None]:
# Split the encoded features and the income target into train and test parts
# (default split sizes). The fixed seed makes the split -- and therefore every
# score and the selected C further down -- reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_encoded, income, random_state=0)

## Data Preprocessing

In [None]:
# Fit the scaler on the training split only, then transform that same split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Cross-Validation with default parameters

In [None]:
# 5-fold cross-validation of a default LogisticRegression on the *scaled*
# training features. The unscaled X_train was used here before, which ignored
# the StandardScaler output computed in the preprocessing step.
scores = cross_val_score(LogisticRegression(), X_train_scaled, y_train, cv=5)

In [None]:
# Average of the five cross-validation scores.
scores.mean()

## Grid-Search

In [None]:
# Candidate values for the regularization parameter C: seven points spaced
# evenly on a log scale from 1e-3 to 1e3.
c_candidates = np.logspace(start=-3, stop=3, num=7)
param_grid = {'C': c_candidates}
param_grid

In [None]:
# Grid-search C with 5-fold cross-validation, keeping the training scores so
# they can be compared with the validation scores afterwards.
grid = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=5,
                    return_train_score=True)
# Fit on the scaled training features; the unscaled X_train was used before,
# which ignored the StandardScaler output computed in the preprocessing step.
grid.fit(X_train_scaled, y_train)


In [None]:
# Report the winning parameter setting and its mean cross-validation score.
for label, value in (("Best Params: ", grid.best_params_),
                     ("Best Score: ", grid.best_score_)):
    print(label, value)

## Score Visualization

In [None]:
# Put the grid-search results into a DataFrame for inspection and plotting.
res = pd.DataFrame(data=grid.cv_results_)


In [None]:
# List the columns available in the grid-search results table.
res.columns

In [None]:
# Show each candidate C next to its mean validation score.
res.loc[:, ['param_C', 'mean_test_score']]

In [None]:
# Plot mean validation and training score against C itself (log-scaled x-axis,
# since the grid is log-spaced) instead of against the row index, and label
# both curves so they can be told apart.
c_values = res['param_C'].astype(float)
plt.semilogx(c_values, res['mean_test_score'], label='mean_test_score')
plt.semilogx(c_values, res['mean_train_score'], label='mean_train_score')
plt.xlabel('C')
plt.ylabel('mean CV score')
plt.legend()