# Data Preprocessing
# Exercise: Adult Data

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#from IPython.display import display

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression

## Data Loading

In [None]:
# Read the adult income CSV from the sibling "datasets" directory (path is
# relative to the working directory). index_col=0 uses the first CSV column
# as the DataFrame index instead of treating it as a feature.
df = pd.read_csv("../datasets/adult.csv", index_col=0)

## Exercise 1

Load the "adult" dataset, consisting of income data from the census,
including information on whether someone has a salary of less than 50k or more. Look at the data using the head method. Our final goal in Exercise 4 will be to classify entries into those making less than 50k and those that make more.

In [None]:
# Show the first rows of the frame to get a feel for the columns and values.
df.head()

## Exercise 2
Experiment with visualizing the data. Can you find out which features influence the income the most?

In [None]:
# Histogram of the "age" column to inspect its distribution.
df["age"].hist()

### Plot Income by Gender

In [None]:
# Boolean helper column: True where the income label is " >50K" (the label
# carries a leading space). Averaging it within a group yields that group's
# share of rows flagged as high income.
df["income_bin"] = df["income"].eq(" >50K")

plt.figure()
plt.title("By gender")
grouped = df.groupby("gender")
# Horizontal bar chart of the high-income share per gender.
grouped["income_bin"].mean().plot.barh()

### Plot Income by Education

In [None]:
plt.figure()
plt.title("By education")
# Mean of the boolean income_bin column per education level, sorted ascending
# so the bars are ordered by the share of rows flagged as high income.
(
    df.groupby("education")["income_bin"]
    .mean()
    .sort_values()
    .plot.barh()
)

### Plot Income by Race

In [None]:
plt.figure()
plt.title("By race")
# Mean of the boolean income_bin column per race, sorted ascending so the
# bars are ordered by the share of rows flagged as high income.
(
    df.groupby("race")["income_bin"]
    .mean()
    .sort_values()
    .plot.barh()
)

## Exercise 3
Separate the target variable from the features.
Split the data into training and test set.
Apply dummy encoding and scaling.
How did this change the number of variables?

In [None]:
# The target variable is the raw "income" label column.
y = df['income']
# Preview the first few target values.
y.head()

In [None]:
# Build the feature matrix by removing the target column "income".
# The helper column "income_bin" (added to df earlier for the income plots)
# is computed directly from "income"; leaving it among the features would
# leak the target into the model and make the scores meaningless, so it is
# dropped as well. errors="ignore" keeps this cell working when the plotting
# cell that creates "income_bin" has not been run.
X = df.drop("income", axis=1).drop(columns="income_bin", errors="ignore")
# Preview the first few feature rows.
X.head()

### Encoding Categorical Variables

In [None]:
# Dummy-encode the feature frame with pandas and compare the shapes before
# and after encoding to see how the number of columns changed.
X_encoded = pd.get_dummies(X)
# The first shape printed is that of the un-encoded frame X (no train/test
# split has been made at this point), so it is labelled "X", not "X_train".
print("X Shape: ", X.shape, "X_encoded Shape: ", X_encoded.shape)

### Data Splitting

In [None]:
# Split the encoded features and the target into training and test parts.
# random_state=0 fixes the shuffling so the split is reproducible; no
# test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=0)

### Data Scaling

In [None]:
# Inspect the first training rows before scaling.
X_train.head()

In [None]:
# Learn the scaling parameters from the training data only, then apply them
# to the training data.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Exercise 4
Build and evaluate a LogisticRegression model on the data.

In [None]:
# Train a logistic regression classifier (C=0.1) on the scaled training data.
# fit() returns the estimator itself, so the chained call binds the fitted
# model to `logreg`.
logreg = LogisticRegression(C=0.1).fit(X_train_scaled, y_train)
# Bare expression so the notebook still displays the fitted estimator.
logreg

In [None]:
# Score of the fitted model on the same scaled data it was trained on
# (for scikit-learn classifiers, score() is the mean accuracy).
print("Training score:", logreg.score(X_train_scaled, y_train))

In [None]:
# Transform the test features with the scaler that was fitted on the training
# data (it is not re-fitted here), then score the model on the held-out set.
X_test_scaled = scaler.transform(X_test)
print("Test score:", logreg.score(X_test_scaled, y_test))

In [None]:
# Share of training labels equal to " <=50K" (note the leading space in the
# label). This is the accuracy that always predicting " <=50K" would reach on
# the training set, which gives a baseline for the scores above.
print("Fraction <= 50k", np.mean(y_train.values == " <=50K"))