In [197]:
'''
PayPal Assignment
Submitted by Nomi Hadar
September 2019
'''
from IPython.display import display
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix


TRAIN_PATH = "../input/input/interview_dataset_train"
TEST_PATH = "../input/paypaldata/interview_dataset_test_no_tags"

#load train and test datasets
train = pd.read_csv(TRAIN_PATH, sep='\t')
test = pd.read_csv(TEST_PATH, sep='\t')

In [198]:
#print head of dataset
display(train.head())

#saperate into features and labels
train_x = train.drop(columns=['tag'])
train_y = train['tag']

'''
Describe train
'''
print('Num features: ', len(train_x.columns)) #11 features

#describe of numeric columns
display(train_x.describe()) #include='all'

#describe of object columns
display(train_x.select_dtypes(include='object').describe())

'''
Describe test
'''
#describe of numeric columns
display(test.describe()) #include='all'

#describe of object columns
display(test.select_dtypes(include='object').describe())

Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_price_color,target_product_description_length,timestamp,target_product_category,age,shopper_segment,delivery_time,tag
0,16.0,7.0,8,36,red,91,17:43:06,Video games - fifa 18,25.0,new,1-3 days,1
1,10.0,6.0,6,34,black,157,23:19:06,video games - nba 2k,,new,4-8 days,1
2,9.0,7.0,6,36,red,86,02:46:41,Video games - fifa 19,36.0,new,15+ days,0
3,,6.0,6,32,red,121,08:35:20,video games - tekken,32.0,heavy shopper,4-8 days,1
4,8.0,,8,56,red,85,07:00:45,Clothing - red hat,,heavy shopper,9-14 days,0


Num features:  11


Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_description_length,age
count,64089.0,63859.0,80000.0,80000.0,80000.0,63863.0
mean,10.006195,5.006186,6.498737,114.885887,103.867325,33.7709
std,2.233081,1.584876,1.794774,119.794007,55.64417,6.721509
min,0.0,0.0,0.0,21.0,8.0,19.0
25%,8.0,4.0,5.0,41.0,56.0,29.0
50%,10.0,5.0,7.0,51.0,104.0,34.0
75%,12.0,6.0,8.0,106.0,152.0,38.0
max,18.0,10.0,13.0,495.0,200.0,60.0


Unnamed: 0,target_product_price_color,timestamp,target_product_category,shopper_segment,delivery_time
count,80000,80000,80000,80000,80000
unique,6,52249,42,3,4
top,red,13:34:46,gardening - seed dispenser,heavy shopper,4-8 days
freq,43567,9,2722,26727,29669


Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_description_length,age
count,16004.0,15985.0,20000.0,20000.0,20000.0,16048.0
mean,9.996938,4.987301,6.51335,115.5145,104.41955,33.793931
std,2.229433,1.574804,1.810852,120.586093,56.004206,6.740614
min,2.0,0.0,0.0,21.0,8.0,19.0
25%,8.0,4.0,5.0,41.0,55.0,29.0
50%,10.0,5.0,7.0,51.0,105.0,34.0
75%,12.0,6.0,8.0,106.0,153.0,38.0
max,18.0,10.0,13.0,467.0,200.0,61.0


Unnamed: 0,target_product_price_color,timestamp,target_product_category,shopper_segment,delivery_time
count,20000,20000,20000,20000,20000
unique,6,17854,42,3,4
top,red,04:37:01,gardening - fertilizer,new,4-8 days
freq,10889,4,671,6769,7344


In [199]:
'''
Handling missing data
'''

#caculate percent of missing data per feature (for both train and test datasets)
dataset = pd.concat(objs=[train_x, test], axis=0, sort=False)
total = dataset.isnull().sum().sort_values(ascending=False)
percent = (dataset.isnull().sum()/dataset.isnull().count()).sort_values(ascending=False)
miss_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
display(miss_data)

#fill missing values of feature "#viewed_ads" with the median 
train_x["#viewed_ads"].fillna(train["#viewed_ads"].mean(),inplace=True)
test["#viewed_ads"].fillna(test["#viewed_ads"].mean(),inplace=True)

#fill missing values of feature "#times_visited_website" with the median 
train_x["#times_visited_website"].fillna(train["#times_visited_website"].mean(),inplace=True)
test["#times_visited_website"].fillna(test["#times_visited_website"].mean(),inplace=True)

#add a new feature for the "age" feature, indicating if value is missing or not
#and fill missing values with 0 in the original column 
train_x['is_null_age'] = train_x["age"].isnull().astype(int)
test['is_null_age'] = test["age"].isnull().astype(int)

train_x["age"].fillna(0, inplace=True)
test["age"].fillna(0, inplace=True)


Unnamed: 0,Total,Percent
#times_visited_website,20156,0.20156
age,20089,0.20089
#viewed_ads,19907,0.19907
delivery_time,0,0.0
shopper_segment,0,0.0
target_product_category,0,0.0
timestamp,0,0.0
target_product_description_length,0,0.0
target_product_price_color,0,0.0
target_product_price,0,0.0


In [200]:
'''
Handle Features
'''

#feature "timestamp"
#extract hour and encode it using sinus and cosinus, to treat the cyclical property of time
train_x["hour"] = train_x["timestamp"].str.slice(0, 2).astype(int)
train_x["hour_sin"] = np.sin(2 * np.pi * train_x["hour"]/23.0)
train_x["hour_cos"] = np.cos(2 * np.pi * train_x["hour"]/23.0)

test["hour"] = test["timestamp"].str.slice(0, 2).astype(int)
test["hour_sin"] = np.sin(2 * np.pi * test["hour"]/23.0)
test["hour_cos"] = np.cos(2 * np.pi * test["hour"]/23.0)

#feature "target_product_category"
#split into category and subcategory
train_x["category"], train_x["subcategory"] = train_x["target_product_category"].str.split(' - ').str
test["category"], test["subcategory"] = test["target_product_category"].str.split(' - ').str


display(train_x.head())


Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_price_color,target_product_description_length,timestamp,target_product_category,age,shopper_segment,delivery_time,is_null_age,hour,hour_sin,hour_cos,category,subcategory
0,16.0,7.0,8,36,red,91,17:43:06,Video games - fifa 18,25.0,new,1-3 days,0,17,-0.9976688,-0.068242,Video games,fifa 18
1,10.0,6.0,6,34,black,157,23:19:06,video games - nba 2k,0.0,new,4-8 days,1,23,-2.449294e-16,1.0,video games,nba 2k
2,9.0,7.0,6,36,red,86,02:46:41,Video games - fifa 19,36.0,new,15+ days,0,2,0.519584,0.854419,Video games,fifa 19
3,10.006195,6.0,6,32,red,121,08:35:20,video games - tekken,32.0,heavy shopper,4-8 days,0,8,0.8169699,-0.57668,video games,tekken
4,8.0,5.006186,8,56,red,85,07:00:45,Clothing - red hat,0.0,heavy shopper,9-14 days,1,7,0.9422609,-0.33488,Clothing,red hat


In [201]:
#encode each categorial features 
features_to_encode = ["target_product_price_color", "shopper_segment", "delivery_time", "category", "subcategory"]
for f in features_to_encode:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_x[f].values) + list(test[f].values))
    train_x[f] = lbl.transform(list(train_x[f].values))
    test[f] = lbl.transform(list(test[f].values))


#drop irrelevant features 
train_x.drop(columns=["timestamp", "hour", "target_product_category"], inplace=True)
test.drop(columns=["timestamp", "hour", "target_product_category"], inplace=True)    
    
display(train_x.head())
display(test.head())

print('Num features: ', len(train_x.columns)) #14 features

Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_price_color,target_product_description_length,age,shopper_segment,delivery_time,is_null_age,hour_sin,hour_cos,category,subcategory
0,16.0,7.0,8,36,5,91,25.0,2,0,0,-0.9976688,-0.068242,4,3
1,10.0,6.0,6,34,0,157,0.0,2,2,1,-2.449294e-16,1.0,9,10
2,9.0,7.0,6,36,5,86,36.0,2,1,0,0.519584,0.854419,4,4
3,10.006195,6.0,6,32,5,121,32.0,1,2,0,0.8169699,-0.57668,9,18
4,8.0,5.006186,8,56,5,85,0.0,1,3,1,0.9422609,-0.33488,0,12


Unnamed: 0,#viewed_ads,#times_visited_website,#products_in_cart,target_product_price,target_product_price_color,target_product_description_length,age,shopper_segment,delivery_time,is_null_age,hour_sin,hour_cos,category,subcategory
0,7.0,5.0,9,112,3,93,21.0,2,1,0,-0.997669,-0.068242,1,15
1,11.0,4.0,8,45,1,22,26.0,2,0,0,0.997669,-0.068242,7,11
2,10.0,4.0,4,48,5,168,35.0,1,0,0,0.398401,-0.917211,2,19
3,9.996938,4.0,7,105,5,146,0.0,1,3,1,0.887885,0.460065,6,15
4,9.996938,4.0,4,36,5,16,31.0,2,0,0,0.997669,-0.068242,2,14


Num features:  14


In [202]:
'''
Train with Gradient Boosting model
'''

# split into training and validation subsets
train_x_sub, valid_x_sub, train_y_sub, valid_y_sub = train_test_split(train_x, train_y, random_state=0)

#for each learning rate compute the accuracy scores on train and validation sets
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_learning_rate = 0.05
best_score = 0
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, 
                                    max_features='auto', max_depth=3, random_state=0)
    gb.fit(train_x_sub, train_y_sub)
    
    train_score = gb.score(train_x_sub, train_y_sub)
    valid_score = gb.score(valid_x_sub, valid_y_sub)
    if valid_score > best_score:
        best_score = valid_score
        best_learning_rate = learning_rate
    
    print("Learning rate: ", learning_rate)
    print("Accuracy score - training: {0:.3f}".format(train_score))
    print("Accuracy score - validation: {0:.3f}".format(valid_score))
    print()


Learning rate:  0.05
Accuracy score - training: 0.805
Accuracy score - validation: 0.808

Learning rate:  0.1
Accuracy score - training: 0.832
Accuracy score - validation: 0.832

Learning rate:  0.25
Accuracy score - training: 0.843
Accuracy score - validation: 0.843

Learning rate:  0.5
Accuracy score - training: 0.848
Accuracy score - validation: 0.844

Learning rate:  0.75
Accuracy score - training: 0.848
Accuracy score - validation: 0.842

Learning rate:  1
Accuracy score - training: 0.849
Accuracy score - validation: 0.836



In [203]:
'''
Print confusion matrix and classification report of GB model on the validation set
'''

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=best_learning_rate, 
                                max_features='auto', max_depth=3, random_state=0)
gb.fit(train_x_sub, train_y_sub)
predict_valid = gb.predict(valid_x_sub)

print("Confusion Matrix:")
print(confusion_matrix(valid_y_sub, predict_valid))
print()
print("Classification Report")
print(classification_report(valid_y_sub, predict_valid))

Confusion Matrix:
[[14060   769]
 [ 2344  2827]]

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     14829
           1       0.79      0.55      0.64      5171

    accuracy                           0.84     20000
   macro avg       0.82      0.75      0.77     20000
weighted avg       0.84      0.84      0.83     20000



In [None]:
'''
Predict for test set 
'''

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=best_learning_rate, 
                                max_features='auto', max_depth=3, random_state=0)
gb.fit(train_x, train_y)
predictions = gb.predict(test)

#print(len(test))
#print(predictions.sum())

#save to output
output = pd.DataFrame(predictions)
#print(output)
output.to_csv("predictions.txt", index=False)