In [2]:
%pip install scikit-learn
%pip install xgboost






Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

##### Train the model from the Bonus Assignment

In [4]:
import xgboost as xgb

def train_bonus_ass_model(
    csv_path="../BonusAssignment/text_training.csv",
    percentile=10,
    test_size=0.2,
):
    """
        Repeat the training process of the model from the Bonus assignment.

        Feature selection is done by ``SelectPercentile`` and the model used is
        XGBoost. ``test_size`` of the rows (20% by default) are split off and are
        not used for fitting; this function does not evaluate them.

        Args:
            csv_path: path to the text training csv of the Bonus assignment.
            percentile: percent of the features kept by ``SelectPercentile``.
            test_size: fraction of the rows excluded from the fit.

        Returns:
            A tuple ``(model, selected_features)``: the fitted ``XGBClassifier``
            and the names of the selected feature columns.
    """
    # Load csv columns 1..2001 only (csv column 0 is skipped).
    df = pd.read_csv(csv_path, usecols=list(range(1, 2002)))

    # separate the features and target variable
    # NOTE(review): the slice also skips the first loaded column - presumably an
    # id-like column; confirm against the csv layout.
    features = df.iloc[:, 1:-1] # every loaded column except the first and the last
    labels = df.iloc[:, -1] # last column (rating)

    # The selector is fitted on all rows, before the split.
    selector = SelectPercentile(percentile=percentile)
    x = selector.fit_transform(features, labels)
    selected_features = selector.get_feature_names_out()

    # Only the training part of the split is needed here.
    x_train, _, y_train, _ = train_test_split(x, labels, test_size=test_size, random_state=1)

    xgb_bonus_clf = xgb.XGBClassifier(max_depth=4, seed=2)
    xgb_bonus_clf.fit(x_train, y_train)

    return xgb_bonus_clf, selected_features

In [5]:
def bonus_ass_rating_prediction(table_path, trained_model, select_features):
    """
        Predict a rating for every row of a reviews table.

        The function receives the path to one of the following tables:
        'reviews_rollout.csv' or 'reviews_training.csv', and returns a dataframe
        holding the predicted values next to the respective ids.

        Assumptions:
        1. Both tables contain an 'ID' column
        2. The rating column does not appear in any of the tables
        3. The model was trained on the same type of population

        Args:
            table_path: path of the csv table to predict on.
            trained_model: fitted model exposing ``predict``.
            select_features: names of the columns passed to the model.

        Returns:
            Dataframe with the columns 'ID' and 'Rating'.
    """
    reviews = pd.read_csv(table_path)

    # Feed the model only the columns it was trained on.
    predicted = trained_model.predict(reviews[select_features])
    rating_col = pd.Series(predicted, name='Rating')
    id_col = reviews["ID"]

    return pd.concat([id_col, rating_col], axis=1)

In [6]:
# Train the bonus-assignment model once, then reuse it (with the same selected
# features) to predict a rating for both review tables.
trained_model, selected_features = train_bonus_ass_model()
y_pred_rev_training = bonus_ass_rating_prediction(
    "Documents/reviews_training.csv", trained_model, selected_features
)
y_pred_rev_rollout = bonus_ass_rating_prediction(
    "Documents/reviews_rollout.csv", trained_model, selected_features
)
y_pred_rev_training

  f = msb / msw


Unnamed: 0,ID,Rating
0,2,0
1,14,1
2,18,0
3,93,1
4,95,0
...,...,...
1856,29938,0
1857,29948,0
1858,29969,1
1859,29980,1


##### Merge the predicted values with the new tables:
1. ffp_rollout_X
2. ffp_train

In [39]:
# Attach the predicted ratings to each ffp table by ID. With an outer join,
# rows that have no predicted rating get NaN in the Rating column.
df_ffp_train = pd.read_csv("Documents/ffp_train.csv")
merged_ffp_train = df_ffp_train.merge(y_pred_rev_training, how="outer", on="ID")

df_ffp_rollout = pd.read_csv("Documents/ffp_rollout_X.csv")
merged_ffp_rollout = df_ffp_rollout.merge(y_pred_rev_rollout, how="outer", on="ID")

# Choosing a model for the merged data

In order to predict the ```Buyer flag``` of the ffp table, we will use the merged tables.
The problem: some rows contain a rating value and some don't. To solve this, we will build a model that is made up of two different models: one will work on the ffp table alone and the other will work on the merged table. The predicted value for each row will be taken from whichever of the two models applies to it.

In [40]:
# Divide the data according to the rating values.
# model1 will use df_model1_train_no_rating (rows with no rating prediction)
# model2 will use df_model2_train_both (rows with a rating prediction)

df_model1_train_no_rating = (
    merged_ffp_train
    .loc[lambda frame: frame["Rating"].isna()]
    .drop(columns="Rating")  # the column is all-NaN for these rows
)
df_model2_train_both = merged_ffp_train.loc[lambda frame: frame["Rating"].notna()]


### Testing different models
We will use XGBoost and Random Forest.

##### We will start with the combined data (ffp and rating)

In [None]:
# Inspect the rows that have a predicted rating (the training data for model2).
df_model2_train_both

In [44]:
# Features: skip the first column (ID) and remove the label column
features = df_model2_train_both.iloc[:, 1:].drop(columns="BUYER_FLAG")
# Label: the column we want to predict
label = df_model2_train_both.loc[:, "BUYER_FLAG"]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=1,
)

1        0
13       1
17       0
92       0
94       0
        ..
29937    0
29947    0
29968    1
29979    1
29983    0
Name: BUYER_FLAG, Length: 1861, dtype: int64

##### Implement features selection by correlation matrix

In [None]:
def select_features_by_corr(data, treshold, target="BUYER_FLAG"):
    """
        Select the features whose correlation with the target exceeds a threshold.

        Args:
            data: dataframe holding the feature columns and the target column.
            treshold: minimal (signed) correlation a feature must exceed.
            target: name of the target column.

        Returns:
            List of the selected column names, in the column order of ``data``.
            The target column itself is never returned.
    """
    # Correlation of every column with the target. The mask must come from the
    # correlation values (indexed by column name), not from the raw target column
    # (indexed by row), otherwise the boolean indexer cannot be aligned.
    target_corr = data.corr()[target]
    passing = target_corr[target_corr > treshold]

    # The target correlates perfectly with itself, so remove it from the result.
    # Undefined (NaN) correlations fail the comparison and are left out.
    return list(passing.index.drop(target, errors="ignore"))

In [None]:
# Grid search over the correlation threshold and the tree depth for the model
# that uses the merged (ffp + predicted rating) data.
data = df_model2_train_both.iloc[:, 1:] # Get rid of ID col (positional slicing needs .iloc)
n_candidate_features = len(data.columns) - 1 # every column except BUYER_FLAG

# Rank the features on the training rows only, so the held-out rows do not
# leak into the feature selection.
train_data = data.loc[x_train.index]

best_f1, best_threshold, best_depth = -1.0, None, None

for t in range(1, 40, 1):
    treshold = t / 100
    # The selection depends only on the threshold, so compute it once per threshold.
    selected_features = select_features_by_corr(train_data, treshold)
    print(f"{len(selected_features)}/{n_candidate_features} Selected. Threshold={treshold}.")

    if not selected_features:
        # Nothing passes this threshold, so no higher threshold will either.
        break

    for d in range(1, 10, 1):
        xgbclf = xgb.XGBClassifier(max_depth=d, seed=2)
        # Train and predict on the selected columns only.
        xgbclf.fit(x_train[selected_features], y_train)
        y_pred = xgbclf.predict(x_test[selected_features])

        # calculate f1 and compare with best
        score = f1_score(y_test, y_pred)
        if score > best_f1:
            best_f1, best_threshold, best_depth = score, treshold, d

print(f"Best F1={best_f1:.4f} (Threshold={best_threshold}, max_depth={best_depth})")

Next, we will move on to the data that has no rating prediction.

In [49]:
# Features: skip the first column (ID) and remove the label column. The label is
# dropped by name (as for the rows with a rating) instead of assuming it is the
# last column of the table.
features = df_model1_train_no_rating.iloc[:, 1:].drop("BUYER_FLAG", axis=1)
# Extract the label column
label = df_model1_train_no_rating["BUYER_FLAG"]

x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=1)

Unnamed: 0,CUSTOMER_GRADE,STATUS_PANTINUM,STATUS_GOLD,STATUS_SILVER,NUM_DEAL,LAST_DEAL,ADVANCE_PURCHASE,FARE_L_Y1,FARE_L_Y2,FARE_L_Y3,...,FARE_L_Y5,POINTS_L_Y1,POINTS_L_Y2,POINTS_L_Y3,POINTS_L_Y4,POINTS_L_Y5,COUPON_FLAG,CANCEL_FLAG,CREDIT_FLAG,RELATED_FLAG
0,11.545711,0,0,0,3,29.3,24,82.7,92.7,97.3,...,76.7,71.3,60.7,98.1,81.6,78.6,0,0,0,0
2,3.882457,0,0,0,4,26.7,9,97.3,92.0,95.3,...,88.7,107.7,120.7,62.5,63.9,92.9,0,0,0,0
3,9.511699,0,0,0,2,61.3,24,123.3,130.7,123.3,...,120.7,129.5,132.1,135.1,104.1,118.2,0,0,0,0
4,14.798792,0,0,0,4,37.3,20,136.0,135.3,144.0,...,134.7,145.7,124.5,122.7,140.1,130.9,0,0,0,0
5,11.762368,0,0,0,4,26.7,25,102.7,79.3,81.3,...,104.7,78.5,105.0,98.5,82.4,106.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8.993560,0,0,0,6,16.0,25,98.7,92.7,92.0,...,88.7,57.2,89.1,83.5,56.1,79.8,0,0,0,0
29996,9.809759,0,1,0,2,117.3,20,228.0,233.3,237.3,...,232.7,178.9,215.5,214.1,216.7,232.6,0,0,0,0
29997,11.226843,0,0,0,4,24.0,21,84.7,87.3,92.7,...,88.7,53.0,83.4,81.3,73.9,90.6,0,0,0,0
29998,12.926248,0,0,0,3,42.7,20,121.3,126.7,120.7,...,123.3,98.9,80.0,89.0,121.5,158.5,0,0,0,0
