In [1]:
%pip install scikit-learn
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
import pandas as pd

##### Train the model from the Bonus Assignment

In [3]:
import xgboost as xgb

def train_bonus_ass_model():
    """
        Re-run the training pipeline of the Bonus assignment.

        Steps:
        1. Load column positions 1..2001 of '../BonusAssignment/text_training.csv'.
        2. Use the last loaded column as the label and the loaded columns
           between the first and the last one as candidate features.
        3. Keep the top 10% of the candidate features with '''SelectPercentile'''.
        4. Split the rows 80% / 20% and fit an XGBoost classifier (max_depth=4)
           on the 80% part.

        Returns:
            The fitted XGBoost classifier and the names of the selected features.
    """
    raw = pd.read_csv("../BonusAssignment/text_training.csv", usecols=list(range(1,2002)))

    # NOTE: the slice starts at 1, so the first loaded column is not used as a feature
    candidate_cols = raw.iloc[:, 1:-1]
    labels = raw.iloc[:, -1]

    percentile_selector = SelectPercentile(percentile=10)
    reduced = percentile_selector.fit_transform(candidate_cols, labels)
    selected_features = percentile_selector.get_feature_names_out()

    # Only the training part of the split is used; the held-out 20% is not evaluated here
    x_train, _x_test, y_train, _y_test = train_test_split(
        reduced, labels, test_size=0.2, random_state=1
    )

    xgb_bonus_clf = xgb.XGBClassifier(max_depth=4, seed=2)
    xgb_bonus_clf.fit(x_train, y_train)

    return xgb_bonus_clf, selected_features

In [4]:
def bonus_ass_rating_prediction(table_path, trained_model, select_features):
    """
        Predict the rating of every row of a reviews table.

        Parameters:
            table_path: path to the CSV file to score (the notebook passes
                'reviews_rollout.csv' and 'reviews_training.csv').
            trained_model: fitted model exposing a `predict` method.
            select_features: names of the columns that are passed to the model.

        Returns:
            DataFrame with two columns: the "ID" column of the table and the
            predicted value under the name "Rating".

        Assumptions:
        1. The table contains an "ID" column and all the selected feature columns
        2. The table does not contain a rating column
        3. The model was trained on the same type of population
    """
    reviews = pd.read_csv(table_path)

    # Score only the columns the model was trained on
    predicted = trained_model.predict(reviews[select_features])
    rating = pd.Series(predicted, name='Rating')

    return pd.concat([reviews["ID"], rating], axis=1)

In [5]:
# Rebuild the Bonus Assignment model once, then score both review tables with it
trained_model, selected_features = train_bonus_ass_model()

y_pred_rev_training = bonus_ass_rating_prediction(
    "Documents/reviews_training.csv", trained_model, selected_features
)
y_pred_rev_rollout = bonus_ass_rating_prediction(
    "Documents/reviews_rollout.csv", trained_model, selected_features
)

# Show the predictions for the training reviews
y_pred_rev_training

  f = msb / msw


Unnamed: 0,ID,Rating
0,2,0
1,14,1
2,18,0
3,93,1
4,95,0
...,...,...
1856,29938,0
1857,29948,0
1858,29969,1
1859,29980,1


##### Merge the predicted values with the new tables:
1. ffp_rollout_X
2. ffp_train

In [6]:
# Attach the predicted review rating to each customer table.
# A left join keeps exactly the rows of the ffp table (in their original order);
# customers without a review get NaN in "Rating". An outer join would also add
# rows for review IDs that are missing from the ffp table, with NaN in every
# ffp column, which must not end up in the training or rollout data.
df_ffp_train = pd.read_csv("Documents/ffp_train.csv")
merged_ffp_train = pd.merge(left=df_ffp_train, right=y_pred_rev_training, how="left", on="ID")

df_ffp_rollout = pd.read_csv("Documents/ffp_rollout_X.csv")
merged_ffp_rollout = pd.merge(left=df_ffp_rollout, right=y_pred_rev_rollout, how="left", on="ID")

# Choosing a model for the merged data

In order to predict the ```Buyer flag``` of the ffp table, we will use the merged tables.
The problem: some rows contain a rating value and some don't. To solve this, we will build a model that is made out of two different models. One will work on the ffp table and one will work on the merged table. The predicted value will be chosen from one of the two results.

In [7]:
# Divide the merged training data according to the availability of a rating.
# model1 will use df_model1_train_no_rating (rows with no rating prediction, "Rating" column dropped)
# model2 will use df_model2_train_both (rows with a rating prediction)
has_rating = merged_ffp_train["Rating"].notna()

df_model1_train_no_rating = merged_ffp_train.loc[~has_rating].drop(columns="Rating")
df_model2_train_both = merged_ffp_train.loc[has_rating]


### Testing different models
We will use XGBoost and Random Forest.

##### We will start with the combined data (ffp and rating)

In [8]:
# Inspect the column dtypes and non-null counts of the rows that have a predicted rating
df_model2_train_both.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1861 entries, 1 to 29983
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1861 non-null   int64  
 1   CUSTOMER_GRADE    1861 non-null   float64
 2   STATUS_PANTINUM   1861 non-null   int64  
 3   STATUS_GOLD       1861 non-null   int64  
 4   STATUS_SILVER     1861 non-null   int64  
 5   NUM_DEAL          1861 non-null   int64  
 6   LAST_DEAL         1861 non-null   float64
 7   ADVANCE_PURCHASE  1861 non-null   int64  
 8   FARE_L_Y1         1861 non-null   float64
 9   FARE_L_Y2         1861 non-null   float64
 10  FARE_L_Y3         1861 non-null   float64
 11  FARE_L_Y4         1861 non-null   float64
 12  FARE_L_Y5         1861 non-null   float64
 13  POINTS_L_Y1       1861 non-null   float64
 14  POINTS_L_Y2       1861 non-null   float64
 15  POINTS_L_Y3       1861 non-null   float64
 16  POINTS_L_Y4       1861 non-null   float64

In [48]:
# Target: the buyer flag. Predictors: every other column except the customer ID.
label = df_model2_train_both["BUYER_FLAG"]
features = df_model2_train_both.drop(columns=["BUYER_FLAG", "ID"])


Unnamed: 0,CUSTOMER_GRADE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,FARE_L_Y3,...,POINTS_L_Y1,POINTS_L_Y2,POINTS_L_Y3,POINTS_L_Y4,POINTS_L_Y5,COUPON_FLAG,CANCEL_FLAG,CREDIT_FLAG,RELATED_FLAG,Rating
1,9.493459,0,0,0,5,26.7,27,120.7,112.0,118.0,...,126.3,112.5,104.6,110.9,137.3,0,0,1,0,0.0
13,12.200276,0,0,0,2,58.7,21,124.0,120.7,116.0,...,115.5,147.9,123.4,120.4,119.5,0,0,0,0,1.0
17,8.248704,0,0,1,6,24.0,28,143.3,145.3,139.3,...,125.5,167.3,163.3,169.4,152.3,0,0,0,0,0.0
92,13.308937,0,0,0,5,21.3,9,104.7,97.3,104.7,...,85.1,111.5,94.5,108.1,125.2,0,0,0,0,1.0
94,10.420929,0,0,0,6,16.0,15,90.0,90.7,94.0,...,72.1,90.3,117.7,109.7,97.2,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29937,8.813226,0,0,0,7,13.3,22,87.3,80.7,84.0,...,98.1,96.4,78.8,88.8,100.9,0,1,0,0,0.0
29947,12.060344,0,0,0,5,16.0,14,95.3,100.7,87.3,...,64.9,106.4,87.6,139.9,94.5,0,0,0,0,0.0
29968,9.308334,0,1,0,5,48.0,22,218.7,224.0,228.7,...,178.8,192.6,219.3,218.7,223.8,0,0,0,1,1.0
29979,12.411760,0,0,1,3,45.3,14,146.0,141.3,138.0,...,142.9,188.8,144.1,155.6,154.2,0,0,0,0,1.0


##### Implement feature selection by correlation matrix

In [58]:
def select_features_by_corr(data, treshold, target="BUYER_FLAG"):
    """
        Select the columns whose correlation with the target exceeds a threshold.

        Parameters:
            data: DataFrame holding the candidate features and the target column.
            treshold: a column is kept only if the absolute value of its
                correlation with the target is strictly greater than this value.
            target: name of the target column. Defaults to "BUYER_FLAG".

        Returns:
            List of the selected column names, in the order they appear in the
            correlation matrix. The target column itself is never returned, and
            columns with an undefined (NaN) correlation are not selected.
    """
    # Correlation of every column with the target, without the target's self-correlation
    corr_with_target = data.corr()[target].drop(target)
    strong = corr_with_target[corr_with_target.abs() > treshold]
    return list(strong.index)

from sklearn.feature_selection import SelectPercentile

def select_features_by_SelecPer(features, lable, p):
    """
        Select the top `p` percent of the features with '''SelectPercentile'''.

        Parameters:
            features: table of candidate features (the notebook passes a DataFrame).
            lable: target values matching the rows of `features`.
            p: percentile of features to keep.

        Returns:
            The names of the selected features, as returned by
            `get_feature_names_out()`.
    """
    selector = SelectPercentile(percentile=p)
    # Fit on the `lable` argument rather than on a global variable; the
    # transformed matrix is not needed, only the names of the kept columns.
    selector.fit(features, lable)
    return selector.get_feature_names_out()

In [43]:
# Work on every column except the leading ID column
data = df_model2_train_both.iloc[:, 1:]

# Absolute correlation of each feature with the target (the target's own row is removed)
mat = data.corr().drop("BUYER_FLAG", axis=0)
mat["BUYER_FLAG"] = mat["BUYER_FLAG"].abs()

# Summary statistics of the absolute correlations, computed once for both prints
flag_corr_stats = mat.describe()["BUYER_FLAG"]
print(f"Minimum corrlation value with 'BUYER_FLAG': {flag_corr_stats.loc['min']}")
print(f"Maximum corrlation value with 'BUYER_FLAG': {flag_corr_stats.loc['max']}")

Minimum corrlation value with 'BUYER_FLAG': 0.0031826178888089756
Maximum corrlation value with 'BUYER_FLAG': 0.560140516762669


Using XGBoost

In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm

best_score = 0                         # F1 score of the best model found so far
score = 0                              # F1 score of the current model
last_count = -1                        # Number of features used by the previous fit
st = ""                                # Result string

# The correlations do not depend on the threshold, so they are computed once
# instead of rebuilding the whole correlation matrix on every iteration.
abs_corr_with_flag = data.corr()["BUYER_FLAG"].drop("BUYER_FLAG").abs()

# Threshold is iterated from 0.003 to 0.5599, increasing by 0.0001
# (0.003 is below the minimal correlation, so the first step includes all 22 features)
for t in tqdm(range(30, 5600, 1)):
    treshold = t / 10000
    selected_features = list(abs_corr_with_flag[abs_corr_with_flag > treshold].index)

    # Skip thresholds that select as many features as the previous fit did
    if len(selected_features) == last_count:
        continue
    last_count = len(selected_features)

    x_train, x_test, y_train, y_test = train_test_split(features[selected_features], label, test_size=0.2, random_state=1)
    for d in range(1, 10, 1):
        # Train an XGBoost model with tree depth d and score it on the held-out split
        xgbclf = xgb.XGBClassifier(max_depth=d, seed=2)
        xgbclf.fit(x_train, y_train)
        y_pred = xgbclf.predict(x_test)
        score = f1_score(y_test, y_pred)

        # Hold the best score
        if score > best_score:
            best_score = score
            st = f"The best XGBoost model...\nHas a treshold of {treshold}\nUses {len(selected_features)} features\nHas a model depth of {d}\nand an F1 score of {best_score}"

print(40*"~", st, 40*"~", sep="\n")

100%|██████████| 5570/5570 [01:44<00:00, 53.44it/s] 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The best XGBoost model...
Has a treshold of 0.0438
Uses 3 features
Has a model depth of 1
and an F1 score of 0.6294416243654822
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~





Using Random Forest

In [None]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

best_score = 0                         # F1 score of the best model found so far
score = 0                              # F1 score of the current model
last_count = -1                        # Number of features used by the previous fit
st = ""                                # Result string

# The correlations do not depend on the threshold, so they are computed once
# instead of rebuilding the whole correlation matrix on every iteration.
abs_corr_with_flag = data.corr()["BUYER_FLAG"].drop("BUYER_FLAG").abs()

# Threshold is iterated from 0.003 to 0.5599, increasing by 0.0001
# (0.003 is below the minimal correlation, so the first step includes all 22 features)
for t in tqdm(range(30, 5600, 1)):
    treshold = t / 10000
    selected_features = list(abs_corr_with_flag[abs_corr_with_flag > treshold].index)

    # Skip thresholds that select as many features as the previous fit did
    if len(selected_features) == last_count:
        continue
    last_count = len(selected_features)

    x_train, x_test, y_train, y_test = train_test_split(features[selected_features], label, test_size=0.2, random_state=1)
    for n_e in range(1, 200, 1):
        # Train a Random Forest with n_e trees; random_state is fixed so that
        # re-running the cell reproduces the same scores and the same winner.
        rf = RandomForestClassifier(n_estimators=n_e, random_state=2)
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)
        score = f1_score(y_test, y_pred)

        # Hold the best score
        if score > best_score:
            best_score = score
            st = f"The best Random Forest model...\nHas a treshold of {treshold}\nUses {len(selected_features)} features\nHas {n_e} estimators\nand an F1 score of {best_score}"

print(40*"~", st, 40*"~", sep="\n")

100%|██████████| 5570/5570 [00:47<00:00, 117.72it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The best Random Forest model...
Has a treshold of 0.0438
Uses 3 features
Has a model depth of 1
and an F1 score of 0.6294416243654822
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~





We will now repeat the algorithms above, this time selecting the features with the `SelectPercentile` function.

In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm

best_score = 0                         # Best F1 score seen so far
score = 0                              # F1 score of the model being evaluated
last_count = -1                        # Feature count of the previous fit
st = ""                                # Summary of the best model

# Try every percentile from 1 to 100
for p in tqdm(range(1, 101, 1)):

    selected_features = select_features_by_SelecPer(features, label, p)

    # Percentiles that select as many features as the previous fit are skipped
    n_selected = len(selected_features)
    if n_selected == last_count:
        continue
    last_count = n_selected

    x_train, x_test, y_train, y_test = train_test_split(
        features[selected_features], label, test_size=0.2, random_state=1
    )
    for d in range(1, 10, 1):
        # Fit an XGBoost model with tree depth d and score it on the held-out split
        xgbclf = xgb.XGBClassifier(max_depth=d, seed=2)
        xgbclf.fit(x_train, y_train)
        y_pred = xgbclf.predict(x_test)
        score = f1_score(y_test, y_pred)

        # Remember the description of the best model so far
        if score > best_score:
            best_score = score
            st = f"The best XGBoost model...\nHas a percentile of {p}\nUses {len(selected_features)} features\nHas a model depth of {d}\nand an F1 score of {best_score}"

print(40*"~", st, 40*"~", sep="\n")

100%|██████████| 100/100 [01:03<00:00,  1.57it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The best XGBoost model...
Has a percentile of 1
Uses 1 features
Has a model depth of 1
and an F1 score of 0.6294416243654822
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~





In [63]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

best_score = 0                         # F1 score of the best model found so far
score = 0                              # F1 score of the current model
last_count = -1                        # Number of features used by the previous fit
st = ""                                # Result string

# Try every percentile from 1 to 100
for p in tqdm(range(1, 101, 1)):

    selected_features = select_features_by_SelecPer(features, label, p)

    # Skip percentiles that select as many features as the previous fit did
    if len(selected_features) == last_count:
        continue
    last_count = len(selected_features)

    x_train, x_test, y_train, y_test = train_test_split(features[selected_features], label, test_size=0.2, random_state=1)
    for n_e in range(1, 200, 1):
        # Train a Random Forest with n_e trees; random_state is fixed so that
        # re-running the cell reproduces the same scores and the same winner.
        rf = RandomForestClassifier(n_estimators=n_e, random_state=2)
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)
        score = f1_score(y_test, y_pred)

        # Hold the best score
        if score > best_score:
            best_score = score
            st = f"The best Random Forest model...\nHas a percentile of {p}\nUses {len(selected_features)} features\nHas {n_e} Estimators\nand an F1 score of {best_score}"

print(40*"~", st, 40*"~", sep="\n")

 86%|████████▌ | 86/100 [48:53<09:23, 40.25s/it] 

Next, we will move on to the data that has no rating prediction.

In [None]:
# Predictors: every column except the customer ID and the target.
# The columns are dropped by name (as is done for the rated data), so the
# selection does not depend on BUYER_FLAG being the last column of the table.
features = df_model1_train_no_rating.drop(["BUYER_FLAG", "ID"], axis=1)
label = df_model1_train_no_rating["BUYER_FLAG"]

# Hold out 20% of the rows for evaluation
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=1)

Unnamed: 0,CUSTOMER_GRADE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,FARE_L_Y3,...,FARE_L_Y5,POINTS_L_Y1,POINTS_L_Y2,POINTS_L_Y3,POINTS_L_Y4,POINTS_L_Y5,COUPON_FLAG,CANCEL_FLAG,CREDIT_FLAG,RELATED_FLAG
0,11.545711,0,0,0,3,29.3,24,82.7,92.7,97.3,...,76.7,71.3,60.7,98.1,81.6,78.6,0,0,0,0
2,3.882457,0,0,0,4,26.7,9,97.3,92.0,95.3,...,88.7,107.7,120.7,62.5,63.9,92.9,0,0,0,0
3,9.511699,0,0,0,2,61.3,24,123.3,130.7,123.3,...,120.7,129.5,132.1,135.1,104.1,118.2,0,0,0,0
4,14.798792,0,0,0,4,37.3,20,136.0,135.3,144.0,...,134.7,145.7,124.5,122.7,140.1,130.9,0,0,0,0
5,11.762368,0,0,0,4,26.7,25,102.7,79.3,81.3,...,104.7,78.5,105.0,98.5,82.4,106.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8.993560,0,0,0,6,16.0,25,98.7,92.7,92.0,...,88.7,57.2,89.1,83.5,56.1,79.8,0,0,0,0
29996,9.809759,0,1,0,2,117.3,20,228.0,233.3,237.3,...,232.7,178.9,215.5,214.1,216.7,232.6,0,0,0,0
29997,11.226843,0,0,0,4,24.0,21,84.7,87.3,92.7,...,88.7,53.0,83.4,81.3,73.9,90.6,0,0,0,0
29998,12.926248,0,0,0,3,42.7,20,121.3,126.7,120.7,...,123.3,98.9,80.0,89.0,121.5,158.5,0,0,0,0
