# Task 2: Predicting customer buying behaviour
## 5. Data balancing

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

## Load the data

In [2]:
x = pd.read_pickle("x_final.pickle")
y = pd.read_pickle("y_final.pickle")

In [3]:
x.head()

Unnamed: 0,booking_origin_te,route_te,purchase_lead_qt,length_of_stay_qt,flight_duration_qt,flight_hour_qt,flight_day_oe
0,0.05,0.149552,1.517545,0.157469,-1.037937,-0.28875,6.0
1,0.05,0.149552,0.645631,0.218773,-1.037937,-1.018778,6.0
2,0.101749,0.149552,1.397837,0.366873,-1.037937,1.475269,3.0
3,0.05,0.149552,0.514923,0.814401,-1.037937,-0.809168,6.0
4,0.101749,0.149552,0.218773,0.366873,-1.037937,1.111702,3.0


## Data balancing

The main goal is to evaluate if balancing the data increases the prediction of the dataset.

To do that different methods should be tested.

### No balancing

#### Split between train and test

In [4]:
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3)

#### Instantiate the model (a simple logistic regression model)

In [5]:
rl_sin_balanceo = LogisticRegression(n_jobs = -1)

#### Fit the model on the train set

In [6]:
rl_sin_balanceo.fit(train_x,train_y)

#### Predict probabilities

In [7]:
pred_rl_sin_balanceo = rl_sin_balanceo.predict_proba(test_x)[:,1]

#### Evaluate model

In [8]:
roc_rl_sin_balanceo = roc_auc_score(test_y, pred_rl_sin_balanceo)

roc_rl_sin_balanceo

0.7482344094411996

### Balance the dataset using undersampling

#### Instantiate the undersampler

In [9]:
rus = RandomUnderSampler(sampling_strategy= 1)

#### Fit and "transform" the undersampler

In [10]:
x_rus, y_rus = rus.fit_resample(x,y)

#### Split train and test

In [11]:
train_x_rus,test_x_rus,train_y_rus,test_y_rus = train_test_split(x_rus,y_rus,test_size=0.3)

#### Instantiate the model

In [12]:
rl_rus = LogisticRegression(n_jobs = -1)

#### Fit the model on the train set

In [13]:
rl_rus.fit(train_x_rus,train_y_rus)

#### Predict probabilities

In [14]:
pred_rl_rus = rl_rus.predict_proba(test_x)[:,1]

#### Evaluate model

In [15]:
roc_rl_rus = roc_auc_score(test_y, pred_rl_rus)

roc_rl_rus

0.7493256627619008

### Balance the dataset using oversampling

In [16]:
ros = RandomOverSampler(sampling_strategy= 1)

#### Instantiate and "transform" the oversampler

In [17]:
x_ros, y_ros = ros.fit_resample(x,y)

#### Split train and test

In [18]:
train_x_ros,test_x_ros,train_y_ros,test_y_ros = train_test_split(x_ros,y_ros,test_size=0.3)

#### Instantiate the model

In [19]:
rl_ros = LogisticRegression(n_jobs = -1)

#### Fit the model on the train set

In [20]:
rl_ros.fit(train_x_ros,train_y_ros)

#### Predict probabilities

In [21]:
pred_rl_ros = rl_ros.predict_proba(test_x)[:,1]

#### Evaluate model

In [22]:
roc_rl_ros = roc_auc_score(test_y, pred_rl_ros)

roc_rl_ros

0.7493543549710096

#### Since there is no significant difference using undersampling or oversampling, the data is not going to be balanced.

Save the final dataframe in .pickle format to proceed to the next step (modeling).

In [23]:
pd.to_pickle(x, "x_bal.pickle")
pd.to_pickle(y, "y_bal.pickle")