### Libraries Importation

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

### Data importation 

In [3]:
# Data for training
features=pd.read_csv('data/Train_transactions.csv')
target=pd.read_csv('data/Train_customers_repurchase.csv')

  features=pd.read_csv('data/Train_transactions.csv')


#### Copy 

In [4]:
features_copy = features.copy()
target_copy = target.copy()

## Feature Engenering 

In [8]:
print("There are ", len(features_copy) , " transactions")

There are  3298736  transactions


In [11]:
print(features_copy.columns)

Index(['id_client', 'transaction_date', 'stores_nb', 'item_count',
       'gross_amount', 'discount_amount', 'basket_value', 'payment_gift',
       'payment_cheque', 'payment_cash', 'payment_card', 'email_domain',
       'civility', 'zip_code', 'card_subscription', 'multicard',
       'price_segmentation'],
      dtype='object')


In [9]:
features_copy.head()

Unnamed: 0,id_client,transaction_date,stores_nb,item_count,gross_amount,discount_amount,basket_value,payment_gift,payment_cheque,payment_cash,payment_card,email_domain,civility,zip_code,card_subscription,multicard,price_segmentation
0,37520523,2019-12-26T16:44:00.000Z,12015,57,138.69,0.49,138.2,0,0,1,0,,MME,12420,,0.0,
1,84917629,2019-08-01T12:16:00.000Z,77131,6,9.87,0.0,9.87,0,0,1,0,,MME,77450,,0.0,
2,20962900,2019-09-25T18:10:00.000Z,92786,6,26.25,2.21,24.04,0,0,0,1,GMAIL.COM,MME,92110,,0.0,Quali
3,85917543,2020-01-24T17:34:00.000Z,21554,5,15.08,0.0,15.08,0,0,0,1,SFR.FR,M,21130,,0.0,Mixte
4,6861392,2019-08-22T19:28:00.000Z,83820,25,81.16,30.6,50.56,0,0,0,1,ORANGE.FR,MME,83600,,0.0,Mixte


#### Count if duplicated

In [22]:
features_copy.duplicated().sum()

np.int64(0)

-   We don't have any duplicated row ! 

#### Count number NaN by cols 

In [17]:
features_copy.isna().mean()

id_client             0.000000
transaction_date      0.000000
stores_nb             0.000000
item_count            0.000000
gross_amount          0.000000
discount_amount       0.000000
basket_value          0.000000
payment_gift          0.000000
payment_cheque        0.000000
payment_cash          0.000000
payment_card          0.000000
email_domain          0.388485
civility              0.001976
zip_code              0.092311
card_subscription     0.934478
multicard             0.001976
price_segmentation    0.473834
dtype: float64

-   `email_domain` = 39% of **NaN**
-   `civility` < 1% of **NaN**
-   `zip_code` = 10% of **NaN**
-   `card_subscription` = 9% of **NaN**    
-   `multicard` < 1% of **NaN** 
-   `price_segmentation    ` = 47% of **NaN** 

Among all the features: we will not interested by: card_subscription  (The date of Subscription card but it concerned to less customers)

#### First: Clean some variables

In [None]:
def clean_zip(x):
    if pd.isna(x):
        return 0
    x = str(x) #be sure we have string
    if x.isdigit() and len(x) >=2:
        return int(x[:2])   # return 77, 75, etc.
    
    # case where it's not digit (ex BE for belgium)
    return 0

### create binary feature if email is provided
def clean_mail(x):
    if pd.isna(x):
        return 0
    else : 
        return 1
    

In [None]:
# get binary variable: 1 female and 0 for male
features_copy["sex"] = features_copy["civility"].map ({'M':0, 'MME':1, 'NR':np.nan})
#clean transaction (use format  YYYY-MM-DD)
features_copy["transaction_date_clean"] = pd.to_datetime(features_copy["transaction_date"].str[:10])
#clean zip code (keep only first two digits)
features_copy["zip_code_clean"]=features_copy["zip_code"].apply(clean_zip)
# create binary feature if email is provided
features_copy["is_mail"] = features_copy["email_domain"].apply(clean_mail)
# create binary features for multicard and price_segmentation
features_copy["multicard_clean"]= features_copy["multicard"].apply(lambda x: 1 if x else 0)
features_copy["is_access"] = features_copy["price_segmentation"].apply(lambda x: 1 if x == "Access" else 0)
features_copy["is_mixte"] = features_copy["price_segmentation"].apply(lambda x: 1 if x == "Mixte" else 0)
features_copy["is_quali"] = features_copy["price_segmentation"].apply(lambda x: 1 if x == "Quali" else 0)

In [142]:
# create new features for target cities customers (it will not be used for prediction)
# Zip of top 4 cities in France
Paris_zip = [75, 77, 78, 91, 92, 93, 94, 95]  
Marseille_zip = [13]
Lyon_zip = [69]
Toulouse_zip = [31]

# Create binary features for each city
features_copy["is_paris"] = features_copy["zip_code_clean"].apply(lambda x: 1 if x in Paris_zip else 0)
features_copy["is_marseille"] = features_copy["zip_code_clean"].apply(lambda x: 1 if x in Marseille_zip else 0)
features_copy["is_lyon"] = features_copy["zip_code_clean"].apply(lambda x: 1 if x in Lyon_zip else 0)
features_copy["is_toulouse"] = features_copy["zip_code_clean"].apply(lambda x: 1 if x in Toulouse_zip else 0)


is_lyon
0    3151732
1     147004
Name: count, dtype: int64

In [None]:

features_merge = features_copy.merge(target_copy, on="id_client", how="left")
pd.crosstab(features_merge["is_quali"], features_merge["repurchase"], normalize='index')

repurchase,0,1
is_quali,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.75832,0.24168
1,0.765403,0.234597


In [165]:
# explore multicard, price_segmentation, payment_gift
features_copy[features_copy["multicard"]].value_counts()

KeyError: "None of [Index([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,\n       ...\n       0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],\n      dtype='float64', length=3298736)] are in the [columns]"

In [136]:
features_copy= features_copy.drop(columns=["civility", "zip_code", "transaction_date"])
features_copy.tail(10)

Unnamed: 0,id_client,stores_nb,item_count,gross_amount,discount_amount,basket_value,payment_gift,payment_cheque,payment_cash,payment_card,email_domain,card_subscription,multicard,price_segmentation,zip_code_clean,transaction_date_clean,is_paris
3298726,87193554,22300,25,76.34,2.25,74.09,0,0,0,1,GMAIL.COM,,0.0,,22,2020-05-20,0
3298727,14750709,43504,4,9.81,0.0,9.81,0,0,0,0,ORANGE.FR,,1.0,Mixte,43,2020-04-23,0
3298728,21399387,83863,2,19.24,0.0,19.24,0,0,0,1,LIVE.FR,,1.0,,83,2020-01-16,0
3298729,16601586,42311,10,32.77,0.0,32.77,0,0,0,0,HOTMAIL.FR,,0.0,,42,2020-05-11,0
3298730,80148436,73338,31,103.12,0.0,103.12,0,0,0,1,,,0.0,,73,2020-06-02,0
3298731,86847374,92669,13,33.86,0.0,33.86,0,0,1,0,,,0.0,,92,2020-03-21,1
3298732,32408995,56141,11,20.61,0.0,20.61,0,0,0,1,GMAIL.COM,,0.0,Mixte,56,2020-03-13,0
3298733,4858203,44324,30,71.48,6.39,65.09,0,0,0,1,FREE.FR,,0.0,,44,2020-02-09,0
3298734,33558343,33880,19,47.75,4.14,43.61,0,0,0,1,HOTMAIL.FR,,0.0,,0,2020-02-08,0
3298735,29208681,37208,13,35.37,0.0,35.37,0,0,0,1,AOL.COM,,0.0,,37,2020-02-18,0


In [59]:
features_copy[features_copy["id_client"]== 3693324]

Unnamed: 0,id_client,transaction_date,stores_nb,item_count,gross_amount,discount_amount,basket_value,payment_gift,payment_cheque,payment_cash,payment_card,email_domain,civility,zip_code,card_subscription,multicard,price_segmentation,repurchase
79469,3693324,2020-06-02T10:48:00.000Z,13800,69,209.05,22.61,186.44,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
104963,3693324,2020-02-01T18:34:00.000Z,13800,3,364.0,39.9,324.1,0,0,0,0,FREE.FR,M,13013,,0.0,Mixte,1
117119,3693324,2019-10-28T18:23:00.000Z,13800,34,108.53,10.97,97.56,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
263090,3693324,2020-02-08T14:19:00.000Z,13800,13,40.82,4.08,36.74,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
307990,3693324,2020-01-03T11:56:00.000Z,13800,29,80.6,10.81,69.79,1,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
355290,3693324,2019-08-12T11:38:00.000Z,13764,7,25.83,0.0,25.83,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
588526,3693324,2020-02-06T14:06:00.000Z,13800,50,125.24,26.28,98.96,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
615435,3693324,2019-11-19T11:17:00.000Z,13764,28,68.68,6.87,61.81,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
685457,3693324,2020-01-07T12:25:00.000Z,13800,19,66.44,15.82,50.62,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
861365,3693324,2020-05-28T11:10:00.000Z,13800,1,48.9,0.0,48.9,0,0,0,1,FREE.FR,M,13013,,0.0,Mixte,1
