## Introduction

In [None]:
from IPython.display import Image
# Pass the path explicitly as `filename=` so that a missing file raises an
# error right away instead of the string being treated as raw image data.
Image(filename="DS_Pipeline.png")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn import preprocessing
from collections import Counter

## Defining the problem - loading the data:

In [None]:
# Read the raw dataset; the relative path is resolved against the kernel's
# current working directory.
full_adult_df = pd.read_csv(filepath_or_buffer="adult.csv")

## Problem description

At the beginning of a data analysis process, it is crucial to understand the problem that needs to be solved: identifying the variables (columns), their data types, and which of them are inputs and which are outputs.

For this dataset, the goal is to predict whether a person's salary (the output) is higher than $50K, relying only on other personal information (the inputs). Since the output has two possible values, this is a traditional binary (2-class) classification problem. Our inputs are as follows:
* age (integer)
* workclass (categorical)
* fnlwgt (numeric)
* education (categorical)
* education-num (number of years of education - integer)
* marital-status (categorical)
* occupation (categorical)
* relationship (categorical)
* race (categorical)
* sex (categorical)
* capital-gain (numeric)
* capital-loss (numeric)
* hours-per-week (numeric)
* native-country (categorical)

The output is binary, as mentioned before:
* salary (binary: true if higher than $50K)



In [None]:
# Column roles used throughout the notebook.
# NOTE(review): "fnlwgt" and "education-num" are listed as inputs in the
# problem description but are not part of numeric_inputs — confirm that
# leaving them out is intended.
numeric_inputs = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
categoric_inputs = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
outputs = ["salary"]

# Show the dtype pandas inferred for every column of the raw frame.
print(full_adult_df.dtypes)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
full_adult_df.head()

## EDA (Exploratory Data Analysis)

In [None]:
%matplotlib inline
seaborn.pairplot(full_adult_df[numeric_inputs])

In [None]:
# Count how often each value occurs in every categorical column.
categoric_count = {
    column: Counter(full_adult_df[column]) for column in categoric_inputs
}

# One pie chart per categorical column on a 3x3 grid.
fig, ax = plt.subplots(3, 3, figsize=(14, 6))
axes = ax.ravel()

for axis, column in zip(axes, categoric_inputs):
    counts = categoric_count[column]
    # Materialise the dict views as lists: pie() converts its data with numpy,
    # which expects a real sequence rather than a dict view.
    axis.pie(list(counts.values()), labels=list(counts.keys()))
    # Without a title the pies cannot be told apart.
    axis.set_title(column)

# The grid has more cells than there are categorical columns; hide the spare
# axes so they do not render as empty frames.
for axis in axes[len(categoric_inputs):]:
    axis.axis("off")

fig.tight_layout()
plt.show()


    





## Preprocessing

In [None]:
# Rescale every numeric input with a min-max scaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split done later in the notebook, so test-set statistics leak
# into the features. Consider fitting on the training rows only.
normalizer = preprocessing.MinMaxScaler()
normalized_values = normalizer.fit_transform(full_adult_df[numeric_inputs])

# Build the frame with its column names and the source index in one step so
# that rows stay aligned with full_adult_df.
normalized_df = pd.DataFrame(
    normalized_values,
    columns=numeric_inputs,
    index=full_adult_df.index,
)
normalized_df.head()

In [None]:
label_encoder = preprocessing.LabelEncoder()
# Keep the encoder's default (sparse) output and densify with .toarray()
# below. The `sparse=False` keyword was renamed to `sparse_output` in newer
# scikit-learn releases, so passing it breaks on current versions.
one_hot_encoder = preprocessing.OneHotEncoder()

# Start from the scaled numeric inputs and collect one block of indicator
# columns per categorical input; everything is concatenated once at the end.
encoded_frames = [normalized_df]

for column in categoric_inputs:
    # Map each category to an integer code, then expand the codes into one
    # indicator column per category.
    label_encoded = label_encoder.fit_transform(full_adult_df[column])
    one_hot_values = one_hot_encoder.fit_transform(
        label_encoded.reshape(-1, 1)
    ).toarray()

    # Name the columns from the encoder's own class order so the labels are
    # guaranteed to match the positions of the indicator columns.
    col_names = [column + str(value) for value in label_encoder.classes_]

    encoded_frames.append(
        pd.DataFrame(one_hot_values, columns=col_names, index=normalized_df.index)
    )

full_processed_df = pd.concat(encoded_frames, axis=1)
    


In [None]:
# Use a fresh encoder for the target column, then fit it and encode the
# salary labels as integer class codes.
label_encoder = preprocessing.LabelEncoder()
output_df = label_encoder.fit(full_adult_df["salary"]).transform(
    full_adult_df["salary"]
)

## Splitting training and test

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest for testing.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    full_processed_df,
    full_adult_df["salary"],
    train_size=0.80,
    random_state=42,
)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import scikitplot as skplt
import mlflow

experiment_name = "demo_adult"

# Reuse the MLflow experiment when it already exists; otherwise create it.
# Either way, experiment_id ends up holding the id to attach runs to.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)



## Training the models

### First attempt: KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier and record the run in MLflow.
with mlflow.start_run(experiment_id=experiment_id):
    number_of_neighbours = 7
    knn_classifier = KNeighborsClassifier(n_neighbors=number_of_neighbours)
    knn_classifier.fit(x_train, y_train)

    # Predict on both splits so training and test accuracy can be compared.
    y_train_predict = knn_classifier.predict(x_train)
    y_predict = knn_classifier.predict(x_test)

    mlflow.log_param("Approach", "KNN")
    mlflow.log_param("Number of Neighbours", number_of_neighbours)

    training_accuracy = accuracy_score(y_train, y_train_predict)
    mlflow.log_metric("Training accuracy", training_accuracy)
    test_accuracy = accuracy_score(y_test, y_predict)
    mlflow.log_metric("Accuracy", test_accuracy)

    skplt.metrics.plot_confusion_matrix(y_test, y_predict)

In [None]:
# Per-class precision, recall, F-score and support for the current y_predict.
precision_recall_fscore_support(y_true=y_test, y_pred=y_predict)

### Second attempt: Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a depth-limited decision tree and record the run in MLflow.
with mlflow.start_run(experiment_id=experiment_id):
    depth = 6
    # Fix random_state: ties between equally good splits are broken randomly,
    # so an unseeded tree can differ from one run to the next.
    dt_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)

    dt_classifier.fit(x_train, y_train)

    # Predict on both splits so training and test accuracy can be compared.
    y_train_predict = dt_classifier.predict(x_train)
    y_predict = dt_classifier.predict(x_test)

    mlflow.log_param("Approach", "Decision Tree")
    mlflow.log_param("Tree Height", depth)
    mlflow.log_metric("Training accuracy", accuracy_score(y_train, y_train_predict))
    mlflow.log_metric("Accuracy", accuracy_score(y_test, y_predict))
    skplt.metrics.plot_confusion_matrix(y_test, y_predict)


In [None]:
# Per-class precision, recall, F-score and support for the current y_predict.
precision_recall_fscore_support(y_true=y_test, y_pred=y_predict)

### Third attempt: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Track this attempt in MLflow the same way as the KNN and decision-tree
# attempts, so that all approaches can be compared side by side.
with mlflow.start_run(experiment_id=experiment_id):
    number_of_trees = 10
    # Fix random_state: the forest is built from random bootstrap samples and
    # random feature subsets, so an unseeded run is not reproducible.
    rf_classifier = RandomForestClassifier(
        n_estimators=number_of_trees,
        random_state=42,
    )
    rf_classifier.fit(x_train, y_train)

    # Predict on both splits so training and test accuracy can be compared.
    y_train_predict = rf_classifier.predict(x_train)
    y_predict = rf_classifier.predict(x_test)

    test_accuracy = accuracy_score(y_test, y_predict)
    mlflow.log_param("Approach", "Random Forest")
    mlflow.log_param("Number of Trees", number_of_trees)
    mlflow.log_metric("Training accuracy", accuracy_score(y_train, y_train_predict))
    mlflow.log_metric("Accuracy", test_accuracy)

    skplt.metrics.plot_confusion_matrix(y_test, y_predict)
    print(test_accuracy)

In [None]:
# Per-class precision, recall, F-score and support for the current y_predict.
precision_recall_fscore_support(y_true=y_test, y_pred=y_predict)

### Fourth attempt: Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with scikit-learn's default settings and
# print its accuracy on the held-out test rows.
svm_classifier = SVC()
svm_classifier.fit(X=x_train, y=y_train)
y_predict = svm_classifier.predict(x_test)
svm_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(svm_test_accuracy)