# Path of the data

In [None]:
# Mount Google Drive under /content/drive so the CSV referenced below can be read (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the input CSV inside the mounted Drive; adjust it when running from another account or machine.
data_path = "/content/drive/MyDrive/Courses/Data Quality/Project/data.csv"

# Data importing

## Download Packages

In [None]:
!pip install sweetviz

## Import Packages

In [None]:
import sweetviz as sv
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import scipy.sparse
from scipy.stats import zscore
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from scipy.spatial.distance import cosine
import matplotlib.gridspec as gridspec

## Import Data

We start importing the original dataset.

In [None]:
# Load the complete dataset; later cells compare the imputed datasets against it.
originaldata = pd.read_csv(data_path)
originaldata

We define the function that creates 5 different datasets from the original data: the first with 50% of missing values, the second with 40%, the third with 30%, the fourth with 20% and the fifth with 10%.

In [None]:
def check_datatypes(df):
    """Turn every boolean column of ``df`` into an object column of strings.

    The conversion is done in place (``True`` becomes ``"True"``, ``False``
    becomes ``"False"``) and the same DataFrame is returned. Columns of any
    other dtype are left untouched.
    """
    bool_columns = [name for name in df.columns if df[name].dtype == "bool"]
    for name in bool_columns:
        # go through the string dtype first so the values become text, then store them as plain objects
        df[name] = df[name].astype('string').astype('object')
    return df

def injection(df_pandas, seed, name, name_class):
    """Return five copies of ``df_pandas`` with values removed at random.

    Parameters
    ----------
    df_pandas : pandas.DataFrame
        Dataset in which the missing values are injected. It is not modified.
    seed : int
        Seed for NumPy's global random generator; the same seed always
        produces the same injection.
    name : str
        Label used only in the progress message.
    name_class : str
        Name of the target column (the class to be predicted). This column is
        never touched by the injection.

    Returns
    -------
    list of pandas.DataFrame
        Five datasets, in this order: 50%, 40%, 30%, 20% and 10% of missing
        values in every column except ``name_class``. Each value is dropped
        independently with that probability.
    """
    np.random.seed(seed)

    df_list = []

    perc = [0.50, 0.40, 0.30, 0.20, 0.10]
    for p in perc:
        df_dirt = check_datatypes(df_pandas.copy())
        comp = [p, 1 - p]  # probabilities of (drop the value, keep the value)

        for col in df_dirt.columns:
            if col == name_class:
                continue

            rand = np.random.choice([True, False], size=df_dirt.shape[0], p=comp)

            # mask() upcasts integer columns to float explicitly; writing NaN
            # through .loc into an int column is deprecated silent upcasting
            # in recent pandas versions.
            df_dirt[col] = df_dirt[col].mask(rand)

        df_list.append(df_dirt)
        print("saved {}-completeness{}%".format(name, round((1-p)*100)))
    return df_list

In [None]:
# Build the five datasets with injected missing values; seed=2 fixes the injection and 'income' is excluded from it.
datasets_no_imputed = injection(df_pandas=originaldata, seed=2, name='data', name_class='income')

In [None]:
# Working list for the profiling cells; list.copy() is shallow, so the DataFrames are shared with datasets_no_imputed.
datasets = datasets_no_imputed.copy()

# Data profiling

Data profiling is the set of activities and processes designed to determine the metadata of a given dataset. Data profiling helps understand and prepare data for subsequent cleansing, integration, and analysis.

## Basic

In [None]:
# (number of rows, number of columns) of the original dataset
originaldata.shape

In [None]:
# Column labels of the original dataset
cols = originaldata.columns
cols

In [None]:
# dtype that pandas assigned to each column
originaldata.dtypes

In [None]:
# Row count of the original dataset; used as the denominator of the uniqueness ratio further down.
n_rows = len(originaldata)

## Cardinalities

In [None]:
# Value counts of every column, for each of the 5 datasets with injected missing values
for position, dataset in enumerate(datasets):
  missing_pct = 50 - 10 * position  # the list is ordered from 50% down to 10% of missing values
  print("\n\n\nDataset with %d of missing values\n" %(missing_pct))
  for col in dataset.columns:
    print('\n' + col + '\n')
    print(dataset[col].value_counts())

In [None]:
# Value counts of every column of the original dataset
print("\n\n\nOriginal dataset\n")
for col, values in originaldata.items():
    print('\n' + col + '\n')
    print(values.value_counts())

* Age, fnlwgt, education-num, hours-per-week can be treated as a numeric value
* Workclass, education, maritial-status, occupation, relationship, race, sex are categorical, with the values that have not so different frequencies.
* Capital-gain and Capital-loss are numeric, but it is good to treat it as a categorical variable, since it seems that they have a particular value that is way more frequent than the others.
* Native country is a categorical variable that have a particular value that is way more frequent than the others.

It is important to note that, after the injection of missing values, some categorical variables lose some of their distinct values (e.g. Workclass originally has 7 distinct values, while in the 50% dataset it has 6).

## Uniqueness

In [None]:
# Uniqueness of each column of the original dataset: distinct values divided by the total number of rows
for col in originaldata.columns:
  uniqueness = originaldata[col].nunique() / n_rows
  print("\nUniqueness variable", col,": ", uniqueness, "\n")

Since the attribute `fnlwgt` seems to be unique for each row, we can remove it.

In [None]:
# Remove fnlwgt, judged above to be (nearly) unique per row; errors="ignore" keeps the cell re-runnable once the column is gone.
originaldata = originaldata.drop(columns="fnlwgt", errors="ignore")

In [None]:
# Remove fnlwgt from every dataset with injected missing values.
# errors="ignore" keeps the cell re-runnable once the column is gone.
datasets_no_imputed = [
    dataset.drop(columns="fnlwgt", errors="ignore") for dataset in datasets_no_imputed
]

In [None]:
# Refresh the working list after the column drop; list.copy() is shallow, so the DataFrames are shared with datasets_no_imputed.
datasets = datasets_no_imputed.copy()

## Distinctness

In [None]:
# Distinctness of each column of the original dataset: distinct values divided by the number of non-missing values
for col in originaldata.columns:
  column = originaldata[col]
  distinctness = column.nunique() / column.count()
  print("\nDistinctness variable", col,": ",  distinctness, "\n")

## Profiling with the Sweetviz library

#### Original data

In [None]:
# Sweetviz profiling report of the original dataset, displayed inside the notebook
sweet_report = sv.analyze(originaldata)
sweet_report.show_notebook()

#### Dataset with 50% of missing values

In [None]:
# Sweetviz profiling report of datasets[0] (50% of missing values), displayed inside the notebook
sweet_report50 = sv.analyze(datasets[0])
sweet_report50.show_notebook()

#### Dataset with 40% of missing values

In [None]:
# Sweetviz profiling report of datasets[1] (40% of missing values), displayed inside the notebook
sweet_report40 = sv.analyze(datasets[1])
sweet_report40.show_notebook()

#### Dataset with 30% of missing values

In [None]:
# Sweetviz profiling report of datasets[2] (30% of missing values), displayed inside the notebook
sweet_report30 = sv.analyze(datasets[2])
sweet_report30.show_notebook()

#### Dataset with 20% of missing values

In [None]:
# Sweetviz profiling report of datasets[3] (20% of missing values), displayed inside the notebook
sweet_report20 = sv.analyze(datasets[3])
sweet_report20.show_notebook()

#### Dataset with 10% of missing values

In [None]:
# Sweetviz profiling report of datasets[4] (10% of missing values), displayed inside the notebook
sweet_report10 = sv.analyze(datasets[4])
sweet_report10.show_notebook()

# Classification algorithms

## Preparation

To feed an ML algorithm, the data must have a specific structure. The first thing
to notice is that the data consists of both numerical and categorical variables. Since the ML algorithms that I will use are Logistic Regression and SVM, they need the data to be
numerical. Thus, I decided to one-hot encode the categorical variables. Moreover, I
performed a standardization using the z-score: every feature is standardized using its
mean and standard deviation.

In [None]:
def preparation(data, categorical_variables, numerical_variables, target_column, splitting_ratio):
  """Encode, standardize and split ``data`` for the classifiers.

  Parameters
  ----------
  data : pandas.DataFrame
      Dataset holding the features and the target column.
  categorical_variables : list of str
      Columns to one-hot encode.
  numerical_variables : list of str
      Columns to standardize with ``StandardScaler``.
  target_column : str
      Name of the column to predict.
  splitting_ratio : float
      Fraction of the rows assigned to the test set.

  Returns
  -------
  X_train, X_test, y_train, y_test
      Feature frames and target series of the train/test split.

  Notes
  -----
  The encoder and the scalers are fitted on the whole dataset before the
  split, so the test rows contribute to the fitted statistics.
  """
  # Select the features from `data` itself, not from a notebook-level frame,
  # so the function does not depend on hidden global state.
  X = data.loc[:, data.columns != target_column]
  t = data[target_column]

  preprocessor = ColumnTransformer([
      ('one-hot-encoder', OneHotEncoder(handle_unknown="ignore"), categorical_variables), # transformer for categorical columns
      ('standard_scaler', StandardScaler(), numerical_variables)]) # transformer for numerical columns

  features = preprocessor.fit_transform(X)
  # ColumnTransformer may return either a sparse matrix or a dense array; densify only when needed.
  if scipy.sparse.issparse(features):
    features = features.toarray()

  # z-score every resulting column, including the one-hot indicator columns
  X = pd.DataFrame(zscore(features))
  X, t = shuffle(X, t, random_state=0)

  X_train, X_test, y_train, y_test = train_test_split(X, t, test_size=splitting_ratio, random_state=0)
  return X_train, X_test, y_train, y_test

## Performances metrics definitions

To evaluate the performances of the chosen method, we need to compute the *confusion matrix* which tells us the number of points which have been correctly classified and those which have been misclassified.

 <table style="width:100%">
  <tr>
    <th>
    <center> <img src='https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg' width=300 /> </center>
    </th>
    <th>\begin{array}{|l|l|l|}
	\hline
	&\text{Actual Class: 1}	& \text{Actual Class: 0}\\
	\hline
	\text{Predicted Class: 1}	& tp	& fp\\
	\hline
	\text{Predicted Class: 0}	& fn	& tn\\
	\hline
\end{array}
</th>
  </tr>

</table> 

Based on this matrix we can evaluate:
- Accuracy: $Acc = \frac{tp + tn}{N}$ fraction of the samples correctly classified in the dataset;
- Precision $Pre = \frac{tp}{tp + fp}$ fraction of samples correctly classified in the positive class among the ones classified in the positive class;
- Recall: $Rec = \frac{tp}{tp + fn}$ fraction of samples correctly classified in the positive class among the ones belonging to the positive class;
- F1 score: $F1 = \frac{2 \cdot Pre \cdot Rec}{Pre + Rec}$ harmonic mean of the precision and recall;

where $tp$ is the number of true positives, $fp$ is the number of false positives, $fn$ is the number of false negatives and $tn$ is the number of true negatives.
Equivalently, we can look at the meaning of Precision and Recall by looking at the figure above.

Remember that:
- The higher these figures of merits the better the algorithm is performing.
- These performance measures are **not** symmetric: they depend on the class we select as positive.
- Depending on the **application** one might switch the classes to have measures which better evaluate the predictive power of the classifier.

In [None]:
def conf_matrix(y_test, pred_test):
    """Plot the confusion matrix of ``pred_test`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    pred_test : array-like
        Predicted class labels, in the same order as ``y_test``.

    Rows of the heatmap are the actual classes and columns are the predicted
    classes. Nothing is returned; the figure is drawn on a new matplotlib
    figure.
    """
    # Sorted union of the labels seen in either input, so the matrix is sized
    # by the classes actually present instead of a hard-coded 2x2 shape.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(pred_test)]))

    # Creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test, labels=labels)
    con_mat = pd.DataFrame(con_mat, index=labels, columns=labels)

    # Plotting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5)
    ax = sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("Actual class")

## Logistic regression

In [None]:
def logistic_regression_classifier(X_tr, X_te, y_tr, y_te):
  """Fit a class-balanced logistic regression and evaluate it on the test split.

  Prints the classification report, the accuracy and the confusion matrix,
  and returns a dict with the accuracy plus precision, recall and F1 computed
  once with '>50K' and once with '<=50K' as the positive class.
  """
  # no penalty argument is passed, so the estimator's default regularization applies
  model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
  y_pred = model.fit(X_tr, y_tr).predict(X_te)

  def per_class(scorer):
    # score with each income class taken in turn as the positive label
    return scorer(y_te, y_pred, pos_label='>50K'), scorer(y_te, y_pred, pos_label='<=50K')

  accuracy = accuracy_score(y_te, y_pred)
  precision_hi, precision_lo = per_class(precision_score)
  recall_hi, recall_lo = per_class(recall_score)
  f1_hi, f1_lo = per_class(f1_score)

  metrics = {
      'accuracy': accuracy,
      'precision_greater': precision_hi,
      'precision_smaller': precision_lo,
      'recall_greater': recall_hi,
      'recall_smaller': recall_lo,
      'f1_greater': f1_hi,
      'f1_smaller': f1_lo,
  }

  print("\nMetrics:\n")
  print(classification_report(y_te, y_pred))
  print("\nAccuracy: ", accuracy, "\n\n")
  print("\nConfusion Matrix:\n")
  conf_matrix(y_te, y_pred)

  return metrics

## Support vector machine

In [None]:
def SVM_classifier(X_tr, X_te, y_tr, y_te, kernel):
  """Fit a class-balanced support vector classifier and evaluate it on the test split.

  ``kernel`` is forwarded to ``svm.SVC``. Prints the classification report,
  the accuracy and the confusion matrix, and returns a dict with the accuracy
  plus precision, recall and F1 computed once with '>50K' and once with
  '<=50K' as the positive class.
  """
  model = svm.SVC(C=1.0, random_state=1, kernel=kernel, class_weight="balanced")
  y_pred = model.fit(X_tr, y_tr).predict(X_te)

  def per_class(scorer):
    # score with each income class taken in turn as the positive label
    return scorer(y_te, y_pred, pos_label='>50K'), scorer(y_te, y_pred, pos_label='<=50K')

  accuracy = accuracy_score(y_te, y_pred)
  precision_hi, precision_lo = per_class(precision_score)
  recall_hi, recall_lo = per_class(recall_score)
  f1_hi, f1_lo = per_class(f1_score)

  metrics = {
      'accuracy': accuracy,
      'precision_greater': precision_hi,
      'precision_smaller': precision_lo,
      'recall_greater': recall_hi,
      'recall_smaller': recall_lo,
      'f1_greater': f1_hi,
      'f1_smaller': f1_lo,
  }

  print("\nMetrics:\n")
  print(classification_report(y_te, y_pred))
  print("\nAccuracy: ", accuracy, "\n\n")
  print("\nConfusion Matrix:\n")
  conf_matrix(y_te, y_pred)

  return metrics

# ML on original data

## Logistic Regression

In [None]:
# Columns to one-hot encode and columns to standardize
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# 80/20 train/test split of the original dataset, with "income" as the target
X_train, X_test, y_train, y_test = preparation(originaldata, categorical_columns, numerical_columns, "income", 0.2)

In [None]:
# Baseline logistic-regression metrics on the original dataset
metrics_lr_original = logistic_regression_classifier(X_train, X_test, y_train, y_test)

## Support Vector Machine

In [None]:
# Columns to one-hot encode and columns to standardize (same lists as in the logistic-regression cell)
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# 80/20 train/test split of the original dataset, with "income" as the target
X_train, X_test, y_train, y_test = preparation(originaldata, categorical_columns, numerical_columns, "income", 0.2)

In [None]:
# Baseline linear-kernel SVM metrics on the original dataset
metrics_svm_original = SVM_classifier(X_train, X_test, y_train, y_test, 'linear')

# Advanced Imputation

## Imputation

Looking at the values that each attribute assumes, and at the cardinality of each value, we can choose how to treat each attribute for the imputation:
* Age, education-num, hours-per-week, Capital-gain and Capital-loss can be treated as a numeric value, and thus ***ML-based Imputation using KNN*** imputation will be used
* Workclass, education, maritial-status, occupation, relationship, race, sex, Native country are categorical. We will use ***Decision Trees*** method

The imputation will be done using MICE - Multiple Imputation by Chained Equations.

Since there are few reference on MICE with both categorical and numerical variables, a pure custom implementation has been made.

In [None]:
# Start the advanced imputation from the datasets with injected missing values (shallow copy: only the list is new).
datasets = datasets_no_imputed.copy()

In [None]:
# Columns imputed with decision trees (categorical) and with KNN (numerical)
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]
# The chained imputation below uses this count to slice the leading numerical columns of each dataset
number_of_num_columns = len(numerical_columns)

We remove the target from the datasets.

In [None]:
# Drop the target from every dataset, so it is not used as a feature during imputation
datasets = [dataset.drop(columns="income") for dataset in datasets]

We start by imputing only values in the numerical variables.

In [None]:
# Keep a handle on the full datasets (numerical and categorical variables, 13 in total) ...
datasets_cat = datasets.copy()
# ... and continue with independent copies restricted to the numerical columns
datasets_aux = [dataset[numerical_columns].copy() for dataset in datasets]
datasets = datasets_aux.copy() # datasets with only the numerical variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

# Iterative (chained) imputation of the numerical columns, with a 10-neighbour KNN regressor as estimator
knn_imputer = IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=10), max_iter=100)
multivariate_impute_pipe = ColumnTransformer([
    ("impute_num", knn_imputer, datasets[0].columns)
])

# The pipeline is re-fitted on each dataset; the output is wrapped back into a DataFrame with the original column names
datasets_aux = [
    pd.DataFrame(multivariate_impute_pipe.fit_transform(dataset), columns=dataset.columns)
    for dataset in datasets
]

In [None]:
# From here on `datasets` holds only the numerical columns, already imputed by the KNN-based iterative imputer
datasets = datasets_aux.copy() # dataset with only numerical imputed using KNN

Now, we consider one categorical column at a time as the target of a Decision Tree classifier.

In [None]:
# Chained imputation of the categorical columns: each column in turn is predicted by a decision tree
# trained on the imputed numerical columns plus the categorical columns imputed in earlier passes.
datasets_advanced_imputation = list(range(5)) # placeholders, overwritten below with the fully imputed datasets
i = 0 # position of the dataset being processed; also used to pick the matching frame in datasets_cat
for dataset in datasets:
  col_count = 0 # number of categorical columns already imputed and appended to `dataset`

  for col in categorical_columns: # `col` is the categorical column whose missing values are predicted in this pass
    dataset_to_replace = dataset.copy() # snapshot with categorical columns still as raw labels; the column imputed in this pass is appended to it

    # from the second pass on, `dataset` also holds categorical columns: one-hot encode them to build the features
    if col_count!=0:
      no_numerical_variables = number_of_num_columns # NOTE(review): never read afterwards in this cell
      cat = dataset.iloc[: , -(originaldata.shape[1]-number_of_num_columns):] # NOTE(review): never read afterwards in this cell

      columns_names = list(dataset.columns.values)

      cat_columns = columns_names[-(dataset.shape[1]-number_of_num_columns):] # the trailing columns are the categorical ones; their number grows by one at each pass

      categorical_preprocessor = OneHotEncoder(handle_unknown="ignore") # processors for categorical columns

      preprocessor = ColumnTransformer([
          ('one-hot-encoder', categorical_preprocessor, cat_columns)])
      
      encoded = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(dataset))
      dataset = pd.concat([dataset.iloc[:,:number_of_num_columns], encoded], axis=1) # numerical columns followed by the one-hot encoded categorical ones

    X = dataset.copy()
    y = datasets_cat[i][col] # target column, still containing the injected missing values

    # index labels of the rows where the target is missing / present (NaN is the only value for which y != y)
    nans = y.index[y!=y]
    no_nans = y.index[y==y]

    X = pd.DataFrame(zscore(X.values))

    aux = X.assign(target=y)
    # NOTE(review): index labels are used as positions here; this assumes a default RangeIndex — confirm
    train = aux.iloc[no_nans]
    test = aux.iloc[nans]
    X_train = train.iloc[:,:-1]
    X_test = test.iloc[:,:-1]
    y_train = train.iloc[:,-1]
    y_test = test.iloc[:,-1]

    # NOTE(review): no random_state is passed to the tree; consider fixing one for reproducibility
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)

    final_column = pd.DataFrame(y)
    final_column.loc[nans, list(final_column.columns.values)[0]] = y_pred # replace the missing entries of the target column with the predicted labels

    predicted_dataset = dataset_to_replace.assign(target=final_column) # append the imputed column to the snapshot that keeps raw categorical labels
    predicted_dataset.rename(columns = {'target':final_column.columns.values[-1]}, inplace = True)

    dataset = predicted_dataset.copy()
    col_count=col_count+1

  datasets_advanced_imputation[i] = dataset.copy()
  i = i + 1


In [None]:
# Re-attach the target column (aligned on the index) to every imputed dataset
datasets_advanced_imputation = [
    dataset.assign(income=originaldata["income"]) for dataset in datasets_advanced_imputation
]

We finally change the order of the columns as in the original dataset.

In [None]:
# Put the columns of every imputed dataset in the same order as in the original dataset
column_order = list(originaldata.columns.values)
datasets_advanced_imputation = [
    dataset.loc[:, column_order] for dataset in datasets_advanced_imputation
]

## Imputation accuracy

* The accuracy for the categorical variables will be the number of the correct imputed values over the number of elements of that variable.
* The accuracy for the numerical variables will be calculated as the cosine similarity between the imputed column and the original one, i.e. 1 - the cosine distance returned by SciPy.

In [None]:
# One dict {column: accuracy} per imputed dataset
accuracy_advanced_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_advanced_imputation):
  feature_columns = dataset.columns[:-1]  # every column except the last one (the target)

  # init
  variables_accuracies = dict.fromkeys(feature_columns, 0)

  # fill in one score per column
  for col in feature_columns:
    if isinstance(dataset[col].iat[0], str):
      # categorical column: share of values equal to the original ones
      correct_values = sum(
          1 for imputed, original in zip(dataset[col].values, originaldata[col].values)
          if imputed == original)
      variables_accuracies[col] = correct_values / dataset[col].count()
    else:
      # numerical column: 1 - cosine distance between the imputed and the original column
      variables_accuracies[col] = 1 - cosine(dataset[col], originaldata[col])
  accuracy_advanced_imputation[i] = variables_accuracies

## Logistic regression

In [None]:
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# Logistic-regression metrics for each dataset produced by the advanced imputation
metrics_lr_advanced_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_advanced_imputation):
  X_train, X_test, y_train, y_test = preparation(dataset, categorical_columns, numerical_columns, "income", 0.2)
  metrics_lr_advanced_imputation[i] = logistic_regression_classifier(X_train, X_test, y_train, y_test)

## SVM

In [None]:
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# Linear-kernel SVM metrics for each dataset produced by the advanced imputation
metrics_svm_advanced_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_advanced_imputation):
  X_train, X_test, y_train, y_test = preparation(dataset, categorical_columns, numerical_columns, "income", 0.2)
  metrics_svm_advanced_imputation[i] = SVM_classifier(X_train, X_test, y_train, y_test, 'linear')

# Standard Imputation

Looking at the values that each attribute assumes, and at the cardinality of each value, we can choose how to treat each attribute for the imputation:
* Age, education-num, hours-per-week can be treated as a numeric value, and thus ***median*** imputation will be used (keeping it without decimal values)
* Workclass, education, maritial-status, occupation, relationship, race, sex are categorical. Bfill (or ffill) seems a reasonable choice when we have many missing values, since it is a random imputation. The mode, in fact, could change the distribution of the data a lot when the data has lots of missing values. Thus, with 10% and 20% of missing values the mode imputation seems a good choice, while with 30% or more missing values a method such as bfill or ffill, which does not change the distribution, is more reasonable. For simplicity we will use the ***bfill*** method
* Capital-gain and Capital-loss are numeric, but it is good to treat it as a categorical variable because they can assume few values. Since it seems that they have a particular value that is way more frequent than the others, a ***mode*** imputation seems reasonable for these variables.
* Native country is a categorical variable that have a particular value that is way more frequent than the others. A ***mode*** imputation seems reasonable for this variable.

## Imputation

In [None]:
# Copy every DataFrame, not only the list: the cells below assign columns in place,
# and a shallow list copy would overwrite the frames held in datasets_no_imputed.
datasets = [dataset.copy() for dataset in datasets_no_imputed]

In [None]:
# capital-gain and capital-loss are treated as categorical from here on: switch their dtype in every dataset
for dataset in datasets:
  for col in ('capital-gain', 'capital-loss'):
    dataset[col] = dataset[col].astype('category')

In [None]:
# Imputation strategy per column
median_columns = ["age","fnlwgt","education-num","hours-per-week"]
bfill_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex"]
mode_columns = ["capital-gain","capital-loss","native-country"]

for dataset in datasets:
  for col in dataset.columns:
    if col in median_columns:
      dataset[col] = dataset[col].fillna(dataset[col].median())
    elif col in bfill_columns:
      # .bfill() replaces the deprecated fillna(method="bfill"); the trailing .ffill()
      # covers missing values at the end of the column, which have no later value to copy
      dataset[col] = dataset[col].bfill().ffill()
    elif col in mode_columns:
      dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
datasets_standard_imputation = datasets.copy()

## Imputation accuracy

* The accuracy for the categorical variables will be the number of the correct imputed values over the number of elements of that variable.
* The accuracy for the numerical variables will be calculated as the cosine similarity between the imputed column and the original one, i.e. 1 - the cosine distance returned by SciPy.

In [None]:
# One dict {column: accuracy} per imputed dataset
accuracy_standard_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_standard_imputation):
  feature_columns = dataset.columns[:-1]  # every column except the last one (the target)

  # init
  variables_accuracies = dict.fromkeys(feature_columns, 0)

  # fill in one score per column
  for col in feature_columns:
    if isinstance(dataset[col].iat[0], str):
      # categorical column: share of values equal to the original ones
      correct_values = sum(
          1 for imputed, original in zip(dataset[col].values, originaldata[col].values)
          if imputed == original)
      variables_accuracies[col] = correct_values / dataset[col].count()
    else:
      # numerical column: 1 - cosine distance between the imputed and the original column
      variables_accuracies[col] = 1 - cosine(dataset[col], originaldata[col])
  accuracy_standard_imputation[i] = variables_accuracies

## Logistic regression

In [None]:
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# Logistic-regression metrics for each dataset produced by the standard imputation
metrics_lr_standard_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_standard_imputation):
  X_train, X_test, y_train, y_test = preparation(dataset, categorical_columns, numerical_columns, "income", 0.2)
  metrics_lr_standard_imputation[i] = logistic_regression_classifier(X_train, X_test, y_train, y_test)

## SVM

In [None]:
categorical_columns = ["workclass","education","maritial-status","occupation","relationship","race","sex","native-country"]
numerical_columns = ["age","education-num","hours-per-week","capital-gain","capital-loss"]

# Linear-kernel SVM metrics for each dataset produced by the standard imputation
metrics_svm_standard_imputation = np.empty(5, dtype=object)
for i, dataset in enumerate(datasets_standard_imputation):
  X_train, X_test, y_train, y_test = preparation(dataset, categorical_columns, numerical_columns, "income", 0.2)
  metrics_svm_standard_imputation[i] = SVM_classifier(X_train, X_test, y_train, y_test, 'linear')

# Conclusion

## Imputed methods accuracy

First, it is possible to note that the imputation on numerical variables is much more
precise than the one in the categorical variables. 

This can be due to the nature of the
variable: age, education number and hours per week are variables for which, even with a
lot of missing value, it is possible to obtain imputed data that have more or less a similar
distribution. For capital gain and capital loss that have a single value as the most frequent,
imputation obtains the majority of value close to the most frequent one.

It is different for the categorical variables. As I said before, if the data has lots of missing
values, it can happen that a specific value of that variable loses all its observations,
a loss of information that cannot be recovered by imputation. This problem is mitigated for the numerical variables, because the predictions can be close to the real value, and also
because the accuracy is measured using a similarity measure, the cosine similarity.

The only exception is native country, since it is a variable that has a value that is way the
most frequent one, and also because during MICE it was the last categorical variable to be
predicted, so it has all the other variables as features.


In [None]:
plt.figure(figsize=(18, 18), dpi=80)

x = list(originaldata.columns.values[:-1])
x_axis = np.arange(len(x))

# One row of per-variable accuracies for each dataset (50% ... 10% of missing values)
y1 = [[dataset_measure[col] for col in x] for dataset_measure in accuracy_standard_imputation]
y2 = [[dataset_measure[col] for col in x] for dataset_measure in accuracy_advanced_imputation]

bar_labels = ['50% imputed dataset', '40% imputed dataset', '30% imputed dataset',
              '20% imputed dataset', '10% imputed dataset']
panels = [
    (1, y1, "Accuracy of the standard imputated datasets"),
    (2, y2, "Accuracy of the advanced imputated datasets"),
]

# Same layout for both subplots: one group of five bars per variable
for position, heights, title in panels:
  plt.subplot(2, 1, position)
  for k, bar_label in enumerate(bar_labels):
    plt.bar(x_axis + 0.10*(k + 1), heights[k], width=0.1, label = bar_label)
  plt.ylim(0.4, 1)

  plt.xticks(x_axis,x, rotation = 45)
  plt.title(title)
  plt.ylabel("Accuracy")
  plt.xlabel("Variables")

  plt.legend(loc='lower right')

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.5)

plt.show()

Another factor for the accuracy of the imputation can be the number of distinct values in
the categorical variables.

The less the distinct values, the closer are the imputed values to the original one. For
example, occupation has more distinct values than the other categorical variables and the
performance is worse than the others.
Regarding the two imputation methods, there is an important aspect. In general, the
advanced imputation performs better than the standard, especially in education. We can
see that in the capital gain and in the capital loss, the accuracy in the 50% missing dataset
is lower than the standard one, because in the advanced the variable has been predicted
numerically using the KNN, while in the standard we used the mode, and as a result, we
kept the imputed values equal to the most frequent one.
In general, the trend is that the less missing values are present in the dataset, the better is
the similarity between imputed and original variables.

In [None]:
plt.figure(figsize=(18, 9), dpi=80)

x = list(originaldata.columns.values[:-1])
x_axis = np.arange(len(x))

# Uniqueness (distinct values / number of rows) of every variable, for each dataset.
# The figure is titled and labelled as the standard-imputed data, so read that list
# explicitly instead of relying on datasets_no_imputed sharing the same DataFrames.
y1 = [
    [dataset[col].nunique() / n_rows for col in x]
    for dataset in datasets_standard_imputation
]

bar_labels = ['50% imputed dataset', '40% imputed dataset', '30% imputed dataset',
              '20% imputed dataset', '10% imputed dataset']

# Draw first subplot

plt.subplot(2, 1, 1)
for k, bar_label in enumerate(bar_labels):
  plt.bar(x_axis + 0.10*(k + 1), y1[k], width=0.1, label = bar_label)
plt.ylim(0, 0.03)

plt.xticks(x_axis,x, rotation = 45)
plt.title("Uniqueness of the standard imputated datasets")
plt.ylabel("Uniqueness")
plt.xlabel("Variables")

plt.legend(loc='upper center')

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.5)

plt.show()

## ML algorithms performances

The first observation is that logistic regression performs well, and better than the SVM.
There can be several reasons for this, and they depend on how the two models are tuned and
optimized. Another aspect is that both logistic regression and the SVM perform well with the
standard imputation even when some values are missing. The difference between standard and
advanced is that the standard imputation is less stable than the advanced one, since
the latter shows a cleaner trend.

In [None]:
# Line plots comparing logistic regression and SVM on every dataset.
# Layout: one row of panels per metric; the left column shows the standard imputation,
# the right column the advanced imputation. The last point of every line is the score
# obtained on the original dataset.
plt.figure(figsize=(18, 75), dpi=80)

# One entry per grid row: (key in the metrics dictionaries, y-axis label, title prefix).
# Keys ending in `_smaller` / `_greater` feed the panels titled "<=50" / ">50".
metric_rows = [
    ('accuracy',          'Accuracy',  'Accuracy'),
    ('precision_smaller', 'Precision', 'Precision "<=50"'),
    ('precision_greater', 'Precision', 'Precision ">50"'),
    ('recall_smaller',    'Recall',    'Recall "<=50"'),
    ('recall_greater',    'Recall',    'Recall ">50"'),
    ('f1_smaller',        'F1',        'F1 "<=50"'),
    ('f1_greater',        'F1',        'F1 ">50"'),
]

# One entry per grid column: (word used in the title, LR metrics, SVM metrics).
imputation_columns = [
    ('standard', metrics_lr_standard_imputation, metrics_svm_standard_imputation),
    ('advanced', metrics_lr_advanced_imputation, metrics_svm_advanced_imputation),
]

AX = gridspec.GridSpec(len(metric_rows), len(imputation_columns))
AX.update(right=1, top=1, wspace=0.3, hspace=0.3)

x = ['50%', '40%', '30%', '20%', '10%', 'Original']


def _metric_series(imputed_metrics, original_metrics, key):
    """Return the values of `key` for the imputed datasets followed by the original one.

    `imputed_metrics` is indexed by position (0 .. len(x) - 2) and each item, as well as
    `original_metrics`, is indexed by the metric name `key`.
    """
    n_imputed = len(x) - 1  # every x position except the trailing 'Original'
    return [imputed_metrics[i][key] for i in range(n_imputed)] + [original_metrics[key]]


for row, (key, y_label, title_prefix) in enumerate(metric_rows):
    for col, (imputation, lr_metrics, svm_metrics) in enumerate(imputation_columns):
        ax = plt.subplot(AX[row, col])
        ax.set_title(f'{title_prefix} {imputation} imputation')
        ax.set_xlabel('Missing values imputed')
        ax.set_ylabel(y_label)
        # Labels are attached to the lines themselves so the legend cannot get out of
        # sync with the plotting order.
        ax.plot(x, _metric_series(lr_metrics, metrics_lr_original, key), label="Log regression")
        ax.plot(x, _metric_series(svm_metrics, metrics_svm_original, key), label="SVM")
        ax.legend(loc='lower right')

# Spacing is controlled by AX.update(...) above, so no tight_layout() call is needed.
plt.show()


