### Read the data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from time import time

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier




In [3]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,customer_id,event,time,offer-completed,offer-received,offer-viewed,transaction,offer_id,amount,gender,...,income,reward,difficulty,duration,offer_type,email,mobile,social,web,event_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer-received,0,0,1,0,0,0,,F,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,0
1,78afa995795e4d85b5d9ceeca43f5fef,offer-viewed,6,0,0,1,0,0,,F,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,1
2,78afa995795e4d85b5d9ceeca43f5fef,transaction,132,0,0,0,1,1,19.89,F,...,100000.0,,,,,,,,,2
3,78afa995795e4d85b5d9ceeca43f5fef,offer-completed,132,1,0,0,0,0,,F,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,3
4,78afa995795e4d85b5d9ceeca43f5fef,transaction,144,0,0,0,1,1,17.78,F,...,100000.0,,,,,,,,,2


### One-hot encoder

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       306534 non-null  object 
 1   event             306534 non-null  object 
 2   time              306534 non-null  int64  
 3   offer-completed   306534 non-null  int64  
 4   offer-received    306534 non-null  int64  
 5   offer-viewed      306534 non-null  int64  
 6   transaction       306534 non-null  int64  
 7   offer_id          306534 non-null  int64  
 8   amount            138953 non-null  float64
 9   gender            272762 non-null  object 
 10  age               306534 non-null  int64  
 11  became_member_on  306534 non-null  object 
 12  income            272762 non-null  float64
 13  reward            167581 non-null  float64
 14  difficulty        167581 non-null  float64
 15  duration          167581 non-null  float64
 16  offer_type        16

In [5]:
# Onehot encoder for gender col
gender = {'O': 0, 'M': 1, 'F': 2}
gender

{'O': 0, 'M': 1, 'F': 2}

In [6]:
df['gender'] = df['gender'].map(gender)

In [7]:
df.head()

Unnamed: 0,customer_id,event,time,offer-completed,offer-received,offer-viewed,transaction,offer_id,amount,gender,...,income,reward,difficulty,duration,offer_type,email,mobile,social,web,event_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer-received,0,0,1,0,0,0,,2.0,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,0
1,78afa995795e4d85b5d9ceeca43f5fef,offer-viewed,6,0,0,1,0,0,,2.0,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,1
2,78afa995795e4d85b5d9ceeca43f5fef,transaction,132,0,0,0,1,1,19.89,2.0,...,100000.0,,,,,,,,,2
3,78afa995795e4d85b5d9ceeca43f5fef,offer-completed,132,1,0,0,0,0,,2.0,...,100000.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,3
4,78afa995795e4d85b5d9ceeca43f5fef,transaction,144,0,0,0,1,1,17.78,2.0,...,100000.0,,,,,,,,,2


In [8]:
# One hot encoder for offer col
offers = {
    'bogo': 0,
    'discount': 1,
    'informational': 2
}
offers

{'bogo': 0, 'discount': 1, 'informational': 2}

In [9]:
df['offer_type'] = df['offer_type'].map(offers)
df.head()

Unnamed: 0,customer_id,event,time,offer-completed,offer-received,offer-viewed,transaction,offer_id,amount,gender,...,income,reward,difficulty,duration,offer_type,email,mobile,social,web,event_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer-received,0,0,1,0,0,0,,2.0,...,100000.0,5.0,5.0,7.0,0.0,1.0,1.0,0.0,1.0,0
1,78afa995795e4d85b5d9ceeca43f5fef,offer-viewed,6,0,0,1,0,0,,2.0,...,100000.0,5.0,5.0,7.0,0.0,1.0,1.0,0.0,1.0,1
2,78afa995795e4d85b5d9ceeca43f5fef,transaction,132,0,0,0,1,1,19.89,2.0,...,100000.0,,,,,,,,,2
3,78afa995795e4d85b5d9ceeca43f5fef,offer-completed,132,1,0,0,0,0,,2.0,...,100000.0,5.0,5.0,7.0,0.0,1.0,1.0,0.0,1.0,3
4,78afa995795e4d85b5d9ceeca43f5fef,transaction,144,0,0,0,1,1,17.78,2.0,...,100000.0,,,,,,,,,2


In [10]:
df.columns

Index(['customer_id', 'event', 'time', 'offer-completed', 'offer-received',
       'offer-viewed', 'transaction', 'offer_id', 'amount', 'gender', 'age',
       'became_member_on', 'income', 'reward', 'difficulty', 'duration',
       'offer_type', 'email', 'mobile', 'social', 'web', 'event_id'],
      dtype='object')

In [11]:
# drop all null values in income columns
df.dropna(subset=['income'], inplace = True)

In [12]:
# fill all N/A values with 0
df.fillna(0, inplace = True)

In [13]:
# Split column input and output
X = df[['time', 'amount','reward', 'difficulty', 'age', 'income', 'gender', 'offer_type', 'email', 'mobile', 'social', 'web']]
Y = df['offer-completed']

In [14]:
X.head()

Unnamed: 0,time,amount,reward,difficulty,age,income,gender,offer_type,email,mobile,social,web
0,0,0.0,5.0,5.0,75,100000.0,2.0,0.0,1.0,1.0,0.0,1.0
1,6,0.0,5.0,5.0,75,100000.0,2.0,0.0,1.0,1.0,0.0,1.0
2,132,19.89,0.0,0.0,75,100000.0,2.0,0.0,0.0,0.0,0.0,0.0
3,132,0.0,5.0,5.0,75,100000.0,2.0,0.0,1.0,1.0,0.0,1.0
4,144,17.78,0.0,0.0,75,100000.0,2.0,0.0,0.0,0.0,0.0,0.0


In [15]:
Y.head()

0    0
1    0
2    0
3    1
4    0
Name: offer-completed, dtype: int64

In [16]:
print('X shape: ', X.shape)
print('Y shape: ', Y.shape)

X shape:  (272762, 12)
Y shape:  (272762,)


### Scaling

In [17]:
scaler = MinMaxScaler()

In [18]:
normalize_columns = list(X.columns)
normalize_columns

['time',
 'amount',
 'reward',
 'difficulty',
 'age',
 'income',
 'gender',
 'offer_type',
 'email',
 'mobile',
 'social',
 'web']

In [19]:
X[normalize_columns] = scaler.fit_transform(X[normalize_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[normalize_columns] = scaler.fit_transform(X[normalize_columns])


In [20]:
X.head()

Unnamed: 0,time,amount,reward,difficulty,age,income,gender,offer_type,email,mobile,social,web
0,0.0,0.0,0.5,0.25,0.686747,0.777778,1.0,0.0,1.0,1.0,0.0,1.0
1,0.008403,0.0,0.5,0.25,0.686747,0.777778,1.0,0.0,1.0,1.0,0.0,1.0
2,0.184874,0.018724,0.0,0.0,0.686747,0.777778,1.0,0.0,0.0,0.0,0.0,0.0
3,0.184874,0.0,0.5,0.25,0.686747,0.777778,1.0,0.0,1.0,1.0,0.0,1.0
4,0.201681,0.016738,0.0,0.0,0.686747,0.777778,1.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# split train/test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=13)

In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((190933, 12), (81829, 12), (190933,), (81829,))

### Modeling

#### 1. Logistics Regression

In [23]:
start = time() 
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
end = time() 

lr_time = end-start

In [24]:
print("Total time using when training the model:", lr_time, '(s)')
accuracy_lr = accuracy_score(y_test, lr_pred)
print("Accuracy: %.3f%%" % (accuracy_lr * 100.0))
print('Classification Report: \n', classification_report(y_test, lr_pred))

Total time using when training the model: 0.42969799041748047 (s)
Accuracy: 88.260%
Classification Report: 
               precision    recall  f1-score   support

           0       0.88      1.00      0.94     72221
           1       0.51      0.00      0.01      9608

    accuracy                           0.88     81829
   macro avg       0.70      0.50      0.47     81829
weighted avg       0.84      0.88      0.83     81829



#### 2. Decision Tree Classifier

In [25]:
start = time() 
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
end = time() 

dt_time = end-start

In [26]:
print("Total time using when training the model:", dt_time,'(s)')
accuracy_dt = accuracy_score(y_test, dt_pred)
print("Accuracy: %.2f%%" % (accuracy_dt * 100.0))
print('Classification Report: \n', classification_report(y_test, dt_pred))

Total time using when training the model: 0.4302971363067627 (s)
Accuracy: 87.59%
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     72221
           1       0.47      0.47      0.47      9608

    accuracy                           0.88     81829
   macro avg       0.70      0.70      0.70     81829
weighted avg       0.88      0.88      0.88     81829



#### 3. Random Forest Classifier

In [27]:
start = time() 
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
end = time() 

rf_time = end-start

In [28]:
print("Total time using when training the model:", rf_time,'(s)')
accuracy_rf = accuracy_score(y_test, rf_pred)
print("Accuracy: %.2f%%" % (accuracy_rf * 100.0))
print('Classification Report: \n', classification_report(y_test, rf_pred))

Total time using when training the model: 14.252693891525269 (s)
Accuracy: 86.87%
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.95      0.93     72221
           1       0.41      0.27      0.33      9608

    accuracy                           0.87     81829
   macro avg       0.66      0.61      0.63     81829
weighted avg       0.85      0.87      0.86     81829



#### 4. K Neighboor Classifier

In [29]:
start = time() 
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
kn_pred = kn.predict(X_test)
end = time() 

kn_time = end-start

In [30]:
print("Total time using when training the model:", kn_time,'(s)')
accuracy_kn = accuracy_score(y_test, kn_pred)
print("Accuracy: %.2f%%" % (accuracy_kn * 100.0))
print('Classification Report: \n', classification_report(y_test, kn_pred))

Total time using when training the model: 7.37971305847168 (s)
Accuracy: 88.13%
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.97      0.94     72221
           1       0.49      0.23      0.31      9608

    accuracy                           0.88     81829
   macro avg       0.70      0.60      0.62     81829
weighted avg       0.86      0.88      0.86     81829



#### 5. AdaBoost

In [31]:
start = time() 
ad = AdaBoostClassifier()
ad.fit(X_train, y_train)
ad_pred = ad.predict(X_test)
end = time() 

ad_time = end-start

In [32]:
print("Total time using when training the model:", ad_time,'(s)')
accuracy_ad = accuracy_score(y_test, ad_pred)
print("Accuracy: %.2f%%" % (accuracy_ad * 100.0))
print('Classification Report: \n', classification_report(y_test, ad_pred))

Total time using when training the model: 4.791959762573242 (s)
Accuracy: 89.26%
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94     72221
           1       0.60      0.25      0.36      9608

    accuracy                           0.89     81829
   macro avg       0.75      0.62      0.65     81829
weighted avg       0.87      0.89      0.87     81829



#### 6. XGBClassifier

In [33]:
start = time() 
xg = XGBClassifier()
xg.fit(X_train, y_train)
xg_pred = xg.predict(X_test)
end = time() 

xg_time = end-start

In [34]:
print("Total time using when training the model:", xg_time,'(s)')
accuracy_xg = accuracy_score(y_test, ad_pred)
print("Accuracy: %.2f%%" % (accuracy_xg * 100.0))
print('Classification Report: \n', classification_report(y_test, xg_pred))

Total time using when training the model: 1.6368701457977295 (s)
Accuracy: 89.26%
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.95      0.95     72221
           1       0.61      0.55      0.58      9608

    accuracy                           0.91     81829
   macro avg       0.77      0.75      0.76     81829
weighted avg       0.90      0.91      0.90     81829



Based on the result, the XGBoost Classifier has the highest accuracy with 89.26% 
So I will use this model for tuning in later part.


### Tunning model

In [46]:
def perform_grid_search(X_train, y_train):
    """
    Performs a grid search to find the best hyperparameters for an XGBoost classifier.

    Args:
        X_train (array-like): The training input samples.
        y_train (array-like): The target values.

    Returns:
        dict: A dictionary containing the best parameters found by the grid search and the corresponding best score.
    """

    # Hyperparameters
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.01, 0.001],
        'n_estimators': [100, 500, 1000],
    }

    xg = XGBClassifier()

    grid_search = GridSearchCV(xg, param_grid, cv=5)

    grid_search.fit(X_train, y_train)

    # Get the best parameters and the best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return {'best_params': best_params, 'best_score': best_score}

In [47]:
result = perform_grid_search(X_train, y_train)

print("Best parameters: ", result['best_params'])
print("Best score: ", result['best_score'])

Best parameters:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000}
Best score:  0.9089104488742545


In [38]:
best_params = {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000}
start = time()
xg_tunned = XGBClassifier(learning_rate = 0.01, max_depth = 5, n_estimators = 1000)
xg_tunned.fit(X_train, y_train)
xg_tunned_pred = xg_tunned.predict(X_test)

end = time()

xg_tuned_total_time = end-start

In [39]:
print("Total time using when training the model:", xg_tuned_total_time,'(s)')
accuracy_tunned = accuracy_score(y_test, xg_tunned_pred)
print("Accuracy: %.2f%%" % (accuracy_tunned * 100.0))
print('Classification Report: \n', classification_report(y_test, xg_tunned_pred))

Total time using when training the model: 2.3156681060791016 (s)
Accuracy: 90.83%
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.96      0.95     72221
           1       0.62      0.55      0.59      9608

    accuracy                           0.91     81829
   macro avg       0.78      0.75      0.77     81829
weighted avg       0.90      0.91      0.91     81829



The accuracy has been improved from 89.26% to 90.83%

### Features Important

In [48]:
def get_top_features(xg_model, X_train, n=10):
    """
    Retrieves the top 'n' important features from an XGBoost model and returns them in a DataFrame.

    Args:
        xg_model (object): The trained XGBoost model.
        X_train (DataFrame): The training input samples.
        n (int, optional): The number of top features to retrieve. Defaults to 10.

    Returns:
        DataFrame: A DataFrame containing the top 'n' features and their importances, sorted in descending order.
    """

    # Retrieve feature importances from the XGBoost model
    importance = xg_model.feature_importances_

    # Create a DataFrame with feature names and importances
    feature_imp = pd.DataFrame({'Feature': X_train.columns, 'Importance': importance})

    # Sort the DataFrame by importance in descending order
    feature_imp = feature_imp.sort_values(by='Importance', ascending=False).reset_index(drop=True)

    # Retrieve the top 'n' features
    top_features = feature_imp.head(n)

    return top_features



In [50]:
top_10_features = get_top_features(xg_tunned, X_train, n=10)

print(top_10_features)

      Feature  Importance
0      reward    0.834242
1        time    0.121450
2      gender    0.010695
3      social    0.010199
4      income    0.007340
5  offer_type    0.005262
6  difficulty    0.002826
7         web    0.002789
8      mobile    0.002629
9         age    0.002568


Base on the result, we can see that feature `reward` and `time` are the most important features (83.42% and 12.14%)

### Comparing the models

In [51]:
def create_model_performance_df(model_names, accuracy_scores, all_model_time):
    """
    Creates a DataFrame to compare the performance of different models.

    Args:
        model_names (list): A list of model names.
        accuracy_scores (list): A list of accuracy scores corresponding to each model.
        all_model_time (list): A list of time taken by each model.

    Returns:
        DataFrame: A DataFrame containing the model names, accuracy scores, and time taken.
                   The DataFrame is sorted in descending order based on accuracy scores.
    """

    # Create a DataFrame with model names, accuracy scores, and time taken
    df = pd.DataFrame(list(zip(accuracy_scores, all_model_time)), index=model_names, columns=['Accuracy', 'Time (s)'])

    # Sort the DataFrame by accuracy scores in descending order
    df = df.sort_values(by='Accuracy', ascending=False)

    return df



In [52]:
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'K Neighbors', 'Ada Boost', 'XGB Classifier', 'Tunned XGB Classifier']
accuracy_scores = [accuracy_score(y_test, lr_pred), accuracy_score(y_test, dt_pred), accuracy_score(y_test, rf_pred), accuracy_score(y_test, kn_pred), accuracy_score(y_test, ad_pred), accuracy_score(y_test, xg_pred), accuracy_score(y_test, xg_tunned_pred)]
all_model_time = [lr_time, dt_time, rf_time, kn_time, ad_time, xg_time, xg_tuned_total_time]

performance_df = create_model_performance_df(model_names, accuracy_scores, all_model_time)

print(performance_df)

                       Accuracy   Time (s)
Tunned XGB Classifier  0.908345   2.315668
XGB Classifier         0.905339   1.636870
Ada Boost              0.892642   4.791960
Logistic Regression    0.882597   0.429698
K Neighbors            0.881326   7.379713
Decision Tree          0.875875   0.430297
Random Forest          0.868665  14.252694


### Conclusion: 
- Top 2 most important features is `reward` (83.42%) and `time` (12.14%)
- XGB Classifier is the best model here because of it accuracy and the worst is random forest
- We can still improve the accuracy of the model by tunning it with GridSearchCV and hyperparameters