# Lab 4

# Libs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import matplotlib.ticker as mtick
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

# Functions
Some useful predefined functions that you can use and modify.

In [None]:
def print_classification(y_test, y_pred):
    """Print a classification report and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    # Text summary of the predictions
    report = classification_report(y_test, y_pred)
    print(report)

    # Heatmap of the confusion matrix: rows are true labels, columns predicted
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 3))
    heatmap_ax = sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues')
    heatmap_ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Confusion Matrix',
    )
    plt.show()

In [None]:
def feature_importance(features, importance):
    """Draw a bar chart of feature importances, largest value first.

    Parameters
    ----------
    features : sequence
        Feature names.
    importance : sequence of numbers
        Importance value of each feature, aligned with ``features``.

    Each bar is tagged by sign: values greater than zero are 'Positive',
    everything else (zero included) is 'Negative'.
    """
    ranking = pd.DataFrame({'Feature': features, 'Importance': importance})

    # Sign tag used as the hue of the bars
    sign_labels = {True: 'Positive', False: 'Negative'}
    ranking['Impact'] = ranking['Importance'].gt(0).map(sign_labels)

    ranking = ranking.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(6, 8))
    sns.barplot(
        data=ranking,
        x='Importance',
        y='Feature',
        hue='Impact',
        hue_order=['Negative', 'Positive'],
    )
    plt.title('Feature Importance')
    plt.show()

In [None]:
def get_profit(y_test, y_pred, revenue, cost):
    """Return the campaign profit implied by a set of contact decisions.

    Every customer predicted as 1 is contacted and costs ``cost``; every
    contacted customer whose true response is 1 brings in ``revenue``.

    Parameters
    ----------
    y_test : array-like of {0, 1}
        True responses.
    y_pred : array-like of {0, 1}
        Predicted responses (1 = contact), in the same order as ``y_test``.
    revenue : number
        Revenue per positive answer.
    cost : number
        Cost per contact.

    Returns
    -------
    number
        ``true_positives * revenue - contacted * cost``.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` do not have the same number of elements.
    """
    import numpy as np

    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_test and y_pred must have the same length, got {} and {}'.format(
                actual.shape[0], predicted.shape[0]
            )
        )

    # Count directly instead of indexing a confusion matrix: a matrix built
    # from data in which only one class occurs is 1x1, so cm[0][1] would raise
    # IndexError (e.g. everyone is contacted and everyone responds).
    contacted = predicted == 1
    n_contacted = np.count_nonzero(contacted)
    n_true_positive = np.count_nonzero(contacted & (actual == 1))

    total_cost = n_contacted * cost
    total_revenue = n_true_positive * revenue
    return total_revenue - total_cost

# 1. Import Data
Start by importing the `customer_data.xlsx` file into a pandas DataFrame

Print the first 5 rows

How many rows and columns does our dataset have?

Knowing the cost per contact (4€) and the revenue per positive answer (50€), what was the total profit of the last Marketing campaign?

## 1.1. Split Data

Since we are expert Data Scientists and we know the dangers of overfitting, let's split our data like so:
* 80% Training
* 20% Testing

_Since we are dealing with a binary classification problem, remember to keep the same proportion of positive responses in each subset!_

In [None]:
from sklearn.model_selection import train_test_split


Check the dimensionality of each subset

Check the response rate in each subset

# 2. EDA: Exploratory Data Analysis

### 2.1. Data Types
Start the EDA by checking the data types in our data

### 2.2. Data summary
For each data type `numerical`, `object`, and `datetime`, let's see some summary statistics

In [None]:
# Numerical

In [None]:
# Object

In [None]:
# Datetime

Look at the following boxplot. What can you conclude about the customers' preferences?

In [None]:
# Boxplot of the five product-category columns, side by side
category_cols = ['Clothes', 'Kitchen', 'SmallAppliances', 'HouseKeeping', 'Toys']

plt.figure(figsize=(10, 8))
train[category_cols].boxplot()
plt.show()

Using a seaborn countplot, check the distribution of categories for `Gender`, `Education`, and `Marital_Status`

https://seaborn.pydata.org/generated/seaborn.countplot.html

In [None]:
sns.countplot(x = ..., data = ...)

### 2.3. Missing Values
Missing data is always a No-No. How many missing values do we have per feature?

Check them visually!

In [None]:
# Fraction of missing values per column, highest first, keeping only the
# columns that actually have missing values
missing_share = train.isna().sum() / train.shape[0]
df_na = missing_share.to_frame(name='Missings (%)')
df_na = df_na.sort_values(by='Missings (%)', ascending=False)
df_na = df_na[df_na['Missings (%)'] > 0].copy()

plt.figure(figsize=(6, 4))
ax = sns.barplot(data=df_na, x=df_na.index, y='Missings (%)')
# The values are fractions, so xmax=1.0 makes the axis read as percentages
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
plt.title('Missing Values (%)')
plt.show()

### 2.4. Correlation
Display the correlation between all features

Can you represent the correlation visually?

https://seaborn.pydata.org/generated/seaborn.heatmap.html

In [None]:
# cmap = sns.diverging_palette(220, 20, as_cmap=True) optional colors


`Mnt` and `Frq` seem very highly correlated. Display this relationship with a scatterplot. What can you conclude?

# 3. Data Preprocessing
In this section we will transform our data to better fit an analytical model. Remember that all decisions have to be made based on the __Training__ set but replicated on the __Testing__ set.

### 3.1. Drop Correlated Features
Let's look at the pairs with over 90% correlation and drop one of the features. They are basically giving the same information.

In [None]:
# Frq, CatPurchase, Year_Birth


### 3.2. Fix Missing Values
Since there are some missing values, let's use a simple imputation strategy: fill missing values in numerical features with the median of that feature. For features of type Object, use the most frequent category (mode).

In [None]:
# Numerical


In [None]:
# Object


In [None]:
# Confirm you solved the problem: this counts the missing values left in each
# column of the training set, so every count should now be 0
train.isna().sum()

### 3.3. New Features / Encoding
Usually in a problem, we go beyond the features that already exist in our data. In fact, the features we create based on our business knowledge will probably be more important for predictions than the original ones. Having said this, create the following variables:
* __Days as Customer__ - Nr. of days as customer
* __Female__ - 1 if customer is Female, 0 if Male
* __High_Educ__ - 1 if customer has high education (Graduation, Master, or PhD)
* __Marital Status__ (One-Hot-Encoding) - One dummy binary feature for each category

In [None]:
# Days as Customer


In [None]:
# Sanity check: source column next to the new feature, first 3 rows
train[['Dt_Customer', 'Days_as_customer']].head(3)

In [None]:
# Female


In [None]:
# Sanity check: source column next to the new binary feature, first 3 rows
train[['Gender', 'Female']].head(3)

In [None]:
# High Educ


In [None]:
# Sanity check: source column next to the new binary feature, first 3 rows
train[['Education', 'High_Educ']].head(3)

In [None]:
# Marital Status -> One-Hot-Encoding (with prefix MS)



In [None]:
# Sanity check: original category next to its MS_* dummy columns
train[['Marital_Status', 'MS_Married', 'MS_Single', 'MS_Together', 'MS_Widow']].head()

Drop the initial variables: `Dt_Customer`, `Gender`, `Education`, and `Marital_Status`

### 3.4. Split data into X and Y
We are almost done with preprocessing our data. To make things easier, let's divide our data into independent (what we use to predict) and dependent (what we want to predict) features:
* __x_train__ - all training data except `Custid` and `Response`
* __y_train__ - training `Response`
* __x_test__ - all testing data except `Custid` and `Response`
* __y_test__ - testing `Response`

In [None]:
# If you missed any previous step, use this checkpoint to import the dataset we created until now.

#train = pd.read_excel('customer_data_checkpoint.xlsx', sheet_name = 'train', index_col = 0)
#test = pd.read_excel('customer_data_checkpoint.xlsx', sheet_name = 'test', index_col = 0)

### 3.5. Normalize
Since we have features on different scales, it is common to normalize the data to avoid future problems. Let's try the Min-Max scaler, which rescales the values of each feature to the range 0 to 1, on the __x_train__ and __x_test__ subsets.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# One scaler instance, shared by both subsets
scaler = MinMaxScaler()

# Train: fit the scaler here (decisions are based on the training set only).
# Later cells expect the result as a DataFrame named x_train_scaled
# (they read .columns and select features by column name).

# Test: replicate the training transformation instead of fitting again.
# Later cells expect the result as a DataFrame named x_test_scaled.


Check the first 5 rows of the __x_train__ subset

# 4. Model
Now the cool part!

### 4.1. The Old Way
Using the old method (sending the offer to every customer), estimate the profit you would get in the testing subset. 

In [None]:
y_pred = 

In [None]:
# Profit on the testing subset for the predictions in y_pred
profit = get_profit(y_test=y_test, y_pred=y_pred, revenue=revenue, cost=cost)
print(f'Total Profit: {profit} €')

### 4.2. Logistic Regression
Using a Logistic Regression, train a model in the training set and predict the response in the testing set. Then, calculate some main performance measures (Accuracy, Precision, Recall, F1-Score), print the feature importance and estimate the profit.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
profit = 
print('Total Profit: {} €'.format(profit))

Build a barplot that displays the profit on the testing set by using the Old method and a Logistic Regression

In [None]:
results = pd.DataFrame({'Model': [..., ...],
                        'Profit': [..., ...]})


# 5. EXTRA
What happens if we try a different model? A Decision Tree for example.

In [None]:
from sklearn.tree import DecisionTreeClassifier

### 5.1. Decision Tree

In [None]:
# Fit a decision tree on all scaled features, then predict the testing subset
model = DecisionTreeClassifier(random_state=0).fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)

In [None]:
# Classification report and confusion matrix for the decision tree predictions
print_classification(y_test, y_pred)

In [None]:
# Importance of each feature according to the fitted tree, largest first
features = x_train_scaled.columns
importance = model.feature_importances_

df_importance = (
    pd.DataFrame({'Feature': features, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(5, 7))
sns.barplot(data=df_importance, x='Importance', y='Feature', color='salmon', ax=ax)
ax.set_title('Feature Importance')
plt.show()

### 5.2. Decision Tree with feature selection

In [None]:
# Using the most important features only
cols = [
    'Mnt',
    'Days_as_customer',
    'Income',
    'Rcn',
    'NetPurchase',
    'Toys',
    'SmallAppliances',
    'HouseKeeping',
    'Kitchen',
    'High_Educ',
]

# Same tree settings as before, restricted to the selected columns
model = DecisionTreeClassifier(random_state=0).fit(x_train_scaled[cols], y_train)
y_pred = model.predict(x_test_scaled[cols])

In [None]:
# Classification report and confusion matrix for the tree trained on `cols`
print_classification(y_test, y_pred)

### 5.3. Compare Models

In [None]:
# Profit on the testing subset for each approach, keyed by a display name
profit_dict = {}

# Old way: every customer is contacted
y_pred = [1] * len(y_test)
profit_dict['Old Way'] = get_profit(y_test=y_test, y_pred=y_pred, revenue=revenue, cost=cost)

# Feature subset used by the last candidate
cols = ['Mnt', 'Days_as_customer', 'Income', 'Rcn', 'NetPurchase', 'Toys',
        'SmallAppliances', 'HouseKeeping', 'Kitchen', 'High_Educ']

# (display name, unfitted estimator, feature subset or None for all features)
candidates = [
    ('Logistic Regression', LogisticRegression(random_state=0), None),
    ('Decision Tree', DecisionTreeClassifier(random_state=0), None),
    ('Decision Tree (w selection)', DecisionTreeClassifier(random_state=0), cols),
]

for label, estimator, subset in candidates:
    train_x = x_train_scaled if subset is None else x_train_scaled[subset]
    test_x = x_test_scaled if subset is None else x_test_scaled[subset]
    y_pred = estimator.fit(train_x, y_train).predict(test_x)
    profit_dict[label] = get_profit(y_test=y_test, y_pred=y_pred, revenue=revenue, cost=cost)

In [None]:
# One row per entry of profit_dict: its key in 'Model', its value in 'Profit'
models = pd.DataFrame(list(profit_dict.items()), columns=['Model', 'Profit'])
print(models.columns)

Knowing the `models` DataFrame has 2 columns, _Model_ and _Profit_, can you draw a barplot that illustrates each Model and its Profit?

In [None]:
# One bar per model, bar height = profit
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=models, x='Model', y='Profit', ax=ax)
ax.set_title('Models Profit')
plt.show()