In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import  Pool

In [2]:
def most_frequent(s):
    """Return the most common non-missing value of ``s``.

    ``value_counts`` drops missing values and sorts by descending count, so
    the first index entry is the mode. Ties are resolved by whatever order
    ``value_counts`` produces.

    Returns ``np.nan`` when ``s`` is empty or holds only missing values
    (previously this raised ``IndexError``).
    """
    counts = s.value_counts()
    if counts.empty:
        return np.nan
    return counts.index[0]

def least_frequent(s):
    """Return the rarest non-missing value of ``s``.

    ``value_counts`` drops missing values and sorts by descending count, so
    the last index entry is the least common value. Ties are resolved by
    whatever order ``value_counts`` produces.

    Returns ``np.nan`` when ``s`` is empty or holds only missing values
    (previously this raised ``IndexError``).
    """
    counts = s.value_counts()
    if counts.empty:
        return np.nan
    return counts.index[-1]

def most_frequent_count(s):
    """Return how many times the most common non-missing value occurs in ``s``.

    Returns ``0`` when ``s`` is empty or holds only missing values
    (previously this raised ``IndexError``).
    """
    counts = s.value_counts()
    if counts.empty:
        return 0
    return counts.iloc[0]

def least_frequent_count(s):
    """Return how many times the rarest non-missing value occurs in ``s``.

    Returns ``0`` when ``s`` is empty or holds only missing values
    (previously this raised ``IndexError``).
    """
    counts = s.value_counts()
    if counts.empty:
        return 0
    return counts.iloc[-1]

def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is split into at most ``n_cores`` contiguous
        row chunks of near-equal size.
    func : callable
        Receives one chunk and returns a pandas object. It is sent to the
        workers by ``Pool.map``, so it must be picklable (e.g. a module-level
        function).
    n_cores : int, default 4
        Maximum number of chunks / worker processes.

    Returns
    -------
    The chunk results concatenated in their original order with ``pd.concat``.
    """
    # Split on row positions rather than calling np.array_split on the frame
    # itself, which depends on the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(df)), n_cores)
    chunks = [df.iloc[pos] for pos in positions if len(pos)]
    if not chunks:
        # Empty input: still run func once so the result keeps its shape.
        chunks = [df]

    # The context manager releases the workers even when func raises.
    with Pool(len(chunks)) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

# Data Cleaning

### Item Information

In [3]:
# Load the item catalogue and encode brand_type as a binary flag
# (Established -> 1, Local -> 0); any other value is left unchanged by replace().
item_data = pd.read_csv('data/train/item_data.csv')
item_data['brand_type'] = item_data['brand_type'].replace({'Established': 1, 'Local': 0})
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,1,Grocery
1,2,1,1,Miscellaneous
2,3,56,0,Bakery
3,4,56,0,Grocery
4,5,56,0,Grocery


In [4]:
# Check column dtypes and non-null counts of the item table.
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74066 entries, 0 to 74065
Data columns (total 4 columns):
item_id       74066 non-null int64
brand         74066 non-null int64
brand_type    74066 non-null int64
category      74066 non-null object
dtypes: int64(3), object(1)
memory usage: 2.3+ MB


In [5]:
# Catalogue-wide distinct counts; later cells divide by these to turn
# per-coupon / per-customer distinct counts into coverage ratios.
total_items, total_brands, total_brand_types, total_categories = (
    item_data[name].nunique()
    for name in ('item_id', 'brand', 'brand_type', 'category')
)

print("total_items: {}".format(total_items))
print("total_brands: {}".format(total_brands))
print("total_brand_types: {}".format(total_brand_types))
print("total_categories: {}".format(total_categories))

total_items: 74066
total_brands: 5528
total_brand_types: 2
total_categories: 19


### Campaign Information

In [6]:
# Load campaigns with start/end parsed as datetimes; dayfirst=True makes
# pandas read ambiguous dates as day/month rather than month/day.
campaign_data = pd.read_csv('data/train/campaign_data.csv', parse_dates=['start_date', 'end_date'], dayfirst=True)
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,2013-10-21,2013-12-20
1,25,Y,2013-10-21,2013-11-22
2,20,Y,2013-09-07,2013-11-16
3,23,Y,2013-10-08,2013-11-15
4,21,Y,2013-09-16,2013-10-18


In [7]:
# Encode campaign_type (X -> 0, Y -> 1) and add the campaign length in whole days.
campaign_data['campaign_type'] = campaign_data['campaign_type'].replace({'X': 0, 'Y': 1})
campaign_data['duration'] = (campaign_data['end_date'] - campaign_data['start_date']).dt.days
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(3)
memory usage: 1.2 KB


In [8]:
# Campaign table keyed by campaign_id, used for .loc lookups in extract_range_features.
campaign_data_index = campaign_data.set_index('campaign_id')
campaign_data_index.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 24 to 26
Data columns (total 4 columns):
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(2)
memory usage: 1.1 KB


### Customer Information

In [9]:
# Load the customer demographics table.
customer_data = pd.read_csv('data/train/customer_demographics.csv')
customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [10]:
# Check dtypes and non-null counts before imputing the demographic columns.
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null object
marital_status    431 non-null object
rented            760 non-null int64
family_size       760 non-null object
no_of_children    222 non-null object
income_bracket    760 non-null int64
dtypes: int64(3), object(4)
memory usage: 41.6+ KB


Filling in the missing values in the customer information

In [11]:
def get_marital_status(row):
    """Return the row's marital status, inferring it when it is missing.

    A recorded ``marital_status`` is passed through untouched. Otherwise the
    household is labelled 'Married' when ``family_size - no_of_children``
    exceeds one, and 'Single' if not.
    """
    if pd.isna(row['marital_status']):
        non_children = row['family_size'] - row['no_of_children']
        if non_children > 1:
            return 'Married'
        return 'Single'
    return row['marital_status']

# Convert the bucketed demographic columns to integers.
# - regex=False strips a literal '+'; a bare '+' is not a valid regular
#   expression and the default for `regex` has changed between pandas versions.
# - astype(str) keeps this cell re-runnable: on a second run the columns are
#   already integers and would otherwise have no .str accessor.
# A value with a trailing '+' is reduced to the number in front of it.
customer_data['family_size'] = (
    customer_data['family_size']
    .astype(str)
    .str.replace('+', '', regex=False)
    .astype('int')
)
# A missing no_of_children is treated as zero children.
customer_data['no_of_children'] = (
    customer_data['no_of_children']
    .fillna('0')
    .astype(str)
    .str.replace('+', '', regex=False)
    .astype('int')
)
# Fill missing marital_status from the household composition, then encode
# Single -> 0, Married -> 1.
customer_data['marital_status'] = customer_data.apply(get_marital_status, axis=1)
customer_data['marital_status'] = customer_data['marital_status'].replace({'Single': 0, 'Married': 1})

customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,1,0,2,0,4
1,6,46-55,1,0,2,0,5
2,7,26-35,1,0,3,1,3
3,8,26-35,1,0,4,2,6
4,10,46-55,0,0,1,0,5


In [12]:
# Re-check dtypes and non-null counts after the imputation above.
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null object
marital_status    760 non-null int64
rented            760 non-null int64
family_size       760 non-null int64
no_of_children    760 non-null int64
income_bracket    760 non-null int64
dtypes: int64(6), object(1)
memory usage: 41.6+ KB


### Customer Transactions

In [13]:
# Load the transaction log, parsing the 'date' column as datetimes.
transaction_data = pd.read_csv('data/train/customer_transaction_data.csv', parse_dates=['date'])
transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [14]:
# Check dtypes, non-null counts and memory footprint of the transaction log.
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


### Coupon Item Mapping

In [15]:
# Load the coupon -> item mapping (one row per coupon/item pair).
coupon_item = pd.read_csv('data/train/coupon_item_mapping.csv')
coupon_item.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [16]:
# Check dtypes and non-null counts of the coupon/item mapping.
coupon_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92663 entries, 0 to 92662
Data columns (total 2 columns):
coupon_id    92663 non-null int64
item_id      92663 non-null int64
dtypes: int64(2)
memory usage: 1.4 MB


In [17]:
# Attach brand, brand_type and category to every coupon/item pair.
# Note: this rebinds coupon_item, so re-running the cell merges the item
# columns a second time.
coupon_item = coupon_item.merge(item_data, how='left', on='item_id')
coupon_item.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,0,Grocery
1,107,75,56,0,Grocery
2,494,76,209,1,Grocery
3,522,77,278,1,Grocery
4,518,77,278,1,Grocery


In [18]:
# Coupon/item mapping keyed by coupon_id (non-unique index), used for
# .loc lookups in extract_range_features.
coupon_item_index = coupon_item.set_index('coupon_id')
coupon_item_index.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92663 entries, 105 to 31
Data columns (total 4 columns):
item_id       92663 non-null int64
brand         92663 non-null int64
brand_type    92663 non-null int64
category      92663 non-null object
dtypes: int64(3), object(1)
memory usage: 3.5+ MB


### Train data

In [19]:
# Load the labelled (campaign, coupon, customer) rows; redemption_status is the target.
train_data = pd.read_csv('data/train/train.csv')
train_data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [20]:
# Check dtypes and non-null counts of the training rows.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78369 entries, 0 to 78368
Data columns (total 5 columns):
id                   78369 non-null int64
campaign_id          78369 non-null int64
coupon_id            78369 non-null int64
customer_id          78369 non-null int64
redemption_status    78369 non-null int64
dtypes: int64(5)
memory usage: 3.0 MB


### Test data

In [21]:
# Load the unlabelled (campaign, coupon, customer) rows to be scored.
test_data = pd.read_csv('data/test/test.csv')
test_data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id
0,3,22,869,967
1,4,20,389,1566
2,5,22,981,510
3,8,25,1069,361
4,10,17,498,811


In [22]:
# Check dtypes and non-null counts of the test rows.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50226 entries, 0 to 50225
Data columns (total 4 columns):
id             50226 non-null int64
campaign_id    50226 non-null int64
coupon_id      50226 non-null int64
customer_id    50226 non-null int64
dtypes: int64(4)
memory usage: 1.5 MB


# Data Preprocessing

#### Combine train and test data

In [23]:
# Stack train (without the target) on top of test so both receive identical
# feature engineering. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in 2.0; as with append, the original
# row labels of each frame are kept and sort=True orders the columns by name.
columns = train_data.columns[train_data.columns != 'redemption_status']
total_data = pd.concat([train_data[columns], test_data], sort=True)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 50225
Data columns (total 4 columns):
campaign_id    128595 non-null int64
coupon_id      128595 non-null int64
customer_id    128595 non-null int64
id             128595 non-null int64
dtypes: int64(4)
memory usage: 4.9 MB


### Data Transform

Transforming the coupon-item mapping into coupon-specific variables

In [24]:
# One row per coupon: how many items it covers, plus distinct-count /
# most-common / rarest statistics for brand, brand type and category.
categorical_stats = ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count]
coupon_aggregations = {'item_id': ['nunique']}
for source_column in ('brand', 'brand_type', 'category'):
    coupon_aggregations[source_column] = list(categorical_stats)
coupon_data = coupon_item.groupby('coupon_id').agg(coupon_aggregations)

# Flatten the (column, statistic) MultiIndex into c_* names, in the same
# order as the aggregation spec above.
coupon_columns = ['c_unique_items']
for suffix in ('brand', 'brandt', 'category'):
    coupon_columns.extend([
        'c_unique_' + suffix,
        'c_freq_' + suffix,
        'c_rare_' + suffix,
        'c_items_freq_' + suffix,
        'c_items_rare_' + suffix,
    ])
coupon_data.columns = coupon_columns

# Share of the whole catalogue that each coupon touches.
for coverage_name, unique_name, catalogue_total in [
    ('c_coverage_item', 'c_unique_items', total_items),
    ('c_coverage_brand', 'c_unique_brand', total_brands),
    ('c_coverage_brandt', 'c_unique_brandt', total_brand_types),
    ('c_coverage_category', 'c_unique_category', total_categories),
]:
    coupon_data[coverage_name] = coupon_data[unique_name] / catalogue_total

coupon_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1116 entries, 1 to 1116
Data columns (total 20 columns):
c_unique_items           1116 non-null int64
c_unique_brand           1116 non-null int64
c_freq_brand             1116 non-null int64
c_rare_brand             1116 non-null int64
c_items_freq_brand       1116 non-null int64
c_items_rare_brand       1116 non-null int64
c_unique_brandt          1116 non-null int64
c_freq_brandt            1116 non-null int64
c_rare_brandt            1116 non-null int64
c_items_freq_brandt      1116 non-null int64
c_items_rare_brandt      1116 non-null int64
c_unique_category        1116 non-null int64
c_freq_category          1116 non-null object
c_rare_category          1116 non-null object
c_items_freq_category    1116 non-null int64
c_items_rare_category    1116 non-null int64
c_coverage_item          1116 non-null float64
c_coverage_brand         1116 non-null float64
c_coverage_brandt        1116 non-null float64
c_coverage_category      1116 

Transforming customer transactions into customer buying habits

In [25]:
# Row-level totals: combined discount, and selling price plus other_discount.
transaction_data['total_discount'] = transaction_data['coupon_discount'] + transaction_data['other_discount']
transaction_data['buying_price'] = transaction_data['selling_price'] + transaction_data['other_discount']

# Per-unit ("_pq") versions of the raw monetary columns.
for money_column in ('selling_price', 'other_discount', 'coupon_discount'):
    transaction_data[money_column + '_pq'] = transaction_data[money_column] / transaction_data['quantity']
transaction_data['total_discount_pq'] = transaction_data['coupon_discount_pq'] + transaction_data['other_discount_pq']
transaction_data['buying_price_pq'] = transaction_data['selling_price_pq'] + transaction_data['other_discount_pq']

transaction_data['date'] = pd.to_datetime(transaction_data['date'])

# Attach item attributes, then index by (customer_id, date) and sort so that
# later cells can select one customer and slice by date.
transaction_data = (
    transaction_data
    .merge(item_data, on='item_id', how='left')
    .set_index(['customer_id', 'date'])
    .sort_index()
)
transaction_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_id,quantity,selling_price,other_discount,coupon_discount,total_discount,buying_price,selling_price_pq,other_discount_pq,coupon_discount_pq,total_discount_pq,buying_price_pq,brand,brand_type,category
customer_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2012-02-21,4953,1,142.12,0.0,0.0,0.0,142.12,142.12,0.0,0.0,0.0,142.12,1061,1,Grocery
1,2012-02-21,5626,1,106.5,0.0,0.0,0.0,106.5,106.5,0.0,0.0,0.0,106.5,278,1,Grocery
1,2012-02-21,7808,1,99.38,-42.74,0.0,-42.74,56.64,99.38,-42.74,0.0,-42.74,56.64,56,0,Grocery
1,2012-02-21,8107,1,256.11,-46.31,0.0,-46.31,209.8,256.11,-46.31,0.0,-46.31,209.8,914,1,Grocery
1,2012-02-21,8307,1,89.05,-17.45,0.0,-17.45,71.6,89.05,-17.45,0.0,-17.45,71.6,141,1,Grocery


In [26]:
# Check dtypes and non-null counts after adding the derived and item columns.
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1324566 entries, (1, 2012-02-21 00:00:00) to (1582, 2013-06-25 00:00:00)
Data columns (total 15 columns):
item_id               1324566 non-null int64
quantity              1324566 non-null int64
selling_price         1324566 non-null float64
other_discount        1324566 non-null float64
coupon_discount       1324566 non-null float64
total_discount        1324566 non-null float64
buying_price          1324566 non-null float64
selling_price_pq      1324566 non-null float64
other_discount_pq     1324566 non-null float64
coupon_discount_pq    1324566 non-null float64
total_discount_pq     1324566 non-null float64
buying_price_pq       1324566 non-null float64
brand                 1324566 non-null int64
brand_type            1324566 non-null int64
category              1324566 non-null object
dtypes: float64(10), int64(4), object(1)
memory usage: 156.7+ MB


In [27]:
# Summary statistics of the numeric columns, to eyeball ranges and extreme values.
transaction_data.describe()

Unnamed: 0,item_id,quantity,selling_price,other_discount,coupon_discount,total_discount,buying_price,selling_price_pq,other_discount_pq,coupon_discount_pq,total_discount_pq,buying_price_pq,brand,brand_type
count,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0
mean,29519.03,130.6633,114.6036,-17.76871,-0.5948983,-18.3636,96.83493,87.70231,-13.55429,-0.4759791,-14.03027,74.14802,884.0647,0.7308024
std,17908.06,1311.545,152.9053,37.88867,7.069367,39.29699,148.1945,92.87411,25.75621,5.405928,26.55212,91.11477,1125.452,0.443543
min,1.0,1.0,0.36,-3120.31,-1992.23,-3258.16,-1424.44,0.05911488,-2671.5,-1068.24,-2671.5,-1424.44,1.0,0.0
25%,14684.0,1.0,49.16,-23.15,0.0,-24.58,35.26,35.62,-17.81,0.0,-17.81,27.43,56.0,0.0
50%,26597.0,1.0,78.01,-1.78,0.0,-3.21,64.47,70.88,-0.003560459,0.0,-0.7125,53.08,533.0,1.0
75%,42405.75,1.0,124.31,0.0,0.0,0.0,106.86,106.5,0.0,0.0,0.0,95.82,1124.0,1.0
max,74066.0,89638.0,17809.64,0.0,0.0,0.0,16028.64,17809.64,0.0,0.0,0.0,16028.64,5528.0,1.0


In [28]:
# One row per customer, summarising their entire transaction log.
averaged_columns = [
    'selling_price', 'buying_price', 'other_discount', 'coupon_discount', 'total_discount',
    'selling_price_pq', 'buying_price_pq', 'other_discount_pq', 'coupon_discount_pq', 'total_discount_pq',
]
history_stats = ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count]

history_aggregations = {'item_id': ['nunique', 'count'], 'quantity': 'sum'}
for value_column in averaged_columns:
    history_aggregations[value_column] = 'mean'
for label_column in ('brand', 'brand_type', 'category'):
    history_aggregations[label_column] = list(history_stats)

customer_history = transaction_data.groupby('customer_id').agg(history_aggregations)

# Flat overall_* names, listed in the same order as the aggregation spec.
customer_history.columns = [
    'overall_unique_items', 'overall_items', 'overall_quantity',
    'overall_sprice', 'overall_bprice', 'overall_odiscount', 'overall_cdiscount', 'overall_tdiscount',
    'overall_sprice_pq', 'overall_bprice_pq', 'overall_odiscount_pq', 'overall_cdiscount_pq', 'overall_tdiscount_pq',
    'overall_unique_brand', 'overall_freq_brand', 'overall_rare_brand',
    'overall_items_freq_brand', 'overall_items_rare_brand',
    'overall_unique_brandt', 'overall_freq_brandt', 'overall_rare_brandt',
    'overall_items_freq_brandt', 'overall_items_rare_brandt',
    'overall_unique_category', 'overall_freq_category', 'overall_rare_category',
    'overall_items_freq_category', 'overall_items_rare_category',
]

# Share of the catalogue each customer has bought from.
for coverage_name, unique_name, catalogue_total in [
    ('overall_coverage_item', 'overall_unique_items', total_items),
    ('overall_coverage_brand', 'overall_unique_brand', total_brands),
    ('overall_coverage_brandt', 'overall_unique_brandt', total_brand_types),
    ('overall_coverage_category', 'overall_unique_category', total_categories),
]:
    customer_history[coverage_name] = customer_history[unique_name] / catalogue_total

# Mean discounts relative to the mean buying price, for totals and per unit.
for unit_suffix in ('', '_pq'):
    for discount_kind in ('o', 'c', 't'):
        customer_history['overall_p' + discount_kind + 'discount' + unit_suffix] = (
            customer_history['overall_' + discount_kind + 'discount' + unit_suffix]
            / customer_history['overall_bprice' + unit_suffix]
        )

customer_history = customer_history.reset_index()
customer_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1582 entries, 0 to 1581
Data columns (total 39 columns):
customer_id                    1582 non-null int64
overall_unique_items           1582 non-null int64
overall_items                  1582 non-null int64
overall_quantity               1582 non-null int64
overall_sprice                 1582 non-null float64
overall_bprice                 1582 non-null float64
overall_odiscount              1582 non-null float64
overall_cdiscount              1582 non-null float64
overall_tdiscount              1582 non-null float64
overall_sprice_pq              1582 non-null float64
overall_bprice_pq              1582 non-null float64
overall_odiscount_pq           1582 non-null float64
overall_cdiscount_pq           1582 non-null float64
overall_tdiscount_pq           1582 non-null float64
overall_unique_brand           1582 non-null int64
overall_freq_brand             1582 non-null int64
overall_rare_brand             1582 non-null int64
overal

### Merge Dataset

In [29]:
# Left-join every lookup table onto the combined train/test rows; rows
# without a match in a lookup keep NaN in that lookup's columns.
for lookup_table, join_key in [
    (campaign_data, 'campaign_id'),
    (customer_data, 'customer_id'),
    (coupon_data, 'coupon_id'),
    (customer_history, 'customer_id'),
]:
    total_data = total_data.merge(lookup_table, on=join_key, how='left')
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 72 columns):
campaign_id                    128595 non-null int64
coupon_id                      128595 non-null int64
customer_id                    128595 non-null int64
id                             128595 non-null int64
campaign_type                  128595 non-null int64
start_date                     128595 non-null datetime64[ns]
end_date                       128595 non-null datetime64[ns]
duration                       128595 non-null int64
age_range                      74600 non-null object
marital_status                 74600 non-null float64
rented                         74600 non-null float64
family_size                    74600 non-null float64
no_of_children                 74600 non-null float64
income_bracket                 74600 non-null float64
c_unique_items                 128595 non-null int64
c_unique_brand                 128595 non-null int64
c_freq_brand      

In [30]:
# Look-back window used by extract_range_features, measured back from the
# campaign start date: transactions dated between (start - 223 days) and
# (start - 110 days) count as the "range" history.
# NOTE(review): the 223/110-day values are unexplained magic numbers — confirm
# where they come from.
range_start_delta = pd.Timedelta('223 days')
range_end_delta = pd.Timedelta('110 days')

def extract_range_features(row):
    """Add transaction-history features for one (campaign, coupon, customer) row.

    Three slices of the customer's transactions are summarised, each under its
    own column prefix:

    * ``overall_coupon_`` - all of the customer's transactions on items the
      coupon covers;
    * ``range_`` - the customer's transactions inside the look-back window
      ``[start_date - range_start_delta, start_date - range_end_delta]``;
    * ``range_coupon_`` - the window transactions on items the coupon covers.

    Reads the notebook-level ``campaign_data_index``, ``coupon_item_index``,
    ``transaction_data``, the ``range_*_delta`` constants and the ``total_*``
    catalogue counts.

    Parameters
    ----------
    row : pandas.Series
        Must expose ``campaign_id``, ``coupon_id`` and ``customer_id``.

    Returns
    -------
    pandas.Series
        The same ``row`` with the feature entries added.

    Raises
    ------
    KeyError
        If any of the three ids is absent from its lookup table.

    NOTE(review): ``overall_coupon_`` has no date cut-off, so it can include
    purchases made during or after the campaign - confirm this does not leak
    the target into the features.
    """
    start_time = pd.Timestamp(campaign_data_index.loc[row.campaign_id].start_date)
    range_start, range_end = start_time - range_start_delta, start_time - range_end_delta
    # Wrapping in pd.Series keeps `items` list-like for isin(); presumably this
    # guards against a coupon with a single item, where .loc yields a scalar.
    items = pd.Series(coupon_item_index.loc[row.coupon_id].item_id)

    overall_history = transaction_data.loc[row.customer_id]
    # Label-based slice on the date index.
    history = overall_history[range_start:range_end]
    overall_coupon_history = overall_history[overall_history.item_id.isin(items)]
    coupon_history = history[history.item_id.isin(items)]

    mean_columns = [('sprice', 'selling_price'), ('bprice', 'buying_price'), ('odiscount', 'other_discount'), ('cdiscount', 'coupon_discount'), ('tdiscount', 'total_discount'), ('sprice_pq', 'selling_price_pq'), ('bprice_pq', 'buying_price_pq'), ('odiscount_pq', 'other_discount_pq'), ('cdiscount_pq', 'coupon_discount_pq'), ('tdiscount_pq', 'total_discount_pq')]

    for df_name, df in [('overall_coupon_', overall_coupon_history), ('range_', history), ('range_coupon_', coupon_history)]:
        # Computed once per slice and reused as the denominator further down.
        unique_items = df['item_id'].nunique()
        row[df_name+'unique_items'] = unique_items
        row[df_name+'items'] = df['item_id'].count()
        row[df_name+'quantity'] = df['quantity'].sum()

        # Means are NaN when the slice is empty.
        for new_name, old_name in mean_columns:
            row[df_name+new_name] = df[old_name].mean()

        # Discounts relative to buying price; NaN when the mean buying price
        # is missing or not positive. (np.nan, not np.NaN: the capitalised
        # alias was removed in NumPy 2.0.)
        for name in ['odiscount', 'cdiscount', 'tdiscount']:
            bprice = row[df_name+'bprice']
            row[df_name+'p'+name] = row[df_name+name] / bprice if bprice > 0 else np.nan

        for name in ['odiscount_pq', 'cdiscount_pq', 'tdiscount_pq']:
            bprice_pq = row[df_name+'bprice_pq']
            row[df_name+'p'+name] = row[df_name+name] / bprice_pq if bprice_pq > 0 else np.nan

        for new_name, old_name in [('brand', 'brand'), ('brandt', 'brand_type'), ('category','category')]:
            unique_count = df[old_name].nunique()
            row[df_name+'unique_'+new_name] = unique_count

            if unique_count == 0:
                row[df_name+'items_freq_'+new_name] = 0
                row[df_name+'items_rare_'+new_name] = 0
                row[df_name+'freq_'+new_name] = np.nan
                row[df_name+'rare_'+new_name] = np.nan
                # Set explicitly so every row carries the same keys whichever
                # branch runs; these were previously left unset here.
                row[df_name+'items_p_freq_'+new_name] = np.nan
                row[df_name+'items_p_rare_'+new_name] = np.nan
            else:
                freq = most_frequent(df[old_name])
                row[df_name+'freq_'+new_name] = freq
                row[df_name+'items_freq_'+new_name] = df[df[old_name] == freq]['item_id'].nunique()
                row[df_name+'items_p_freq_'+new_name] = row[df_name+'items_freq_'+new_name] / unique_items

                rare = least_frequent(df[old_name])
                row[df_name+'rare_'+new_name] = rare
                row[df_name+'items_rare_'+new_name] = df[df[old_name] == rare]['item_id'].nunique()
                row[df_name+'items_p_rare_'+new_name] = row[df_name+'items_rare_'+new_name] / unique_items

        row[df_name+'coverage_item'] = row[df_name+'unique_items'] / total_items
        row[df_name+'coverage_brand'] = row[df_name+'unique_brand'] / total_brands
        row[df_name+'coverage_brandt'] = row[df_name+'unique_brandt'] / total_brand_types
        row[df_name+'coverage_category'] = row[df_name+'unique_category'] / total_categories

    # Share of window transactions on coupon items that carried a coupon
    # discount (coupon_discount < 0).
    if coupon_history.shape[0] > 0:
        row['redemption_ratio'] = coupon_history[coupon_history['coupon_discount'] < 0].shape[0] / coupon_history.shape[0]
    else:
        row['redemption_ratio'] = np.nan

    # Despite the name, this is computed over the look-back window (all items),
    # not over the customer's full history.
    if history.shape[0] > 0:
        row['overall_redemption_ratio'] = history[history['coupon_discount'] < 0].shape[0] / history.shape[0]
    else:
        row['overall_redemption_ratio'] = np.nan

    return row

def apply_range_features(df):
    """Run ``extract_range_features`` on every row of ``df`` and return the result.

    Kept as a named module-level function so it can be passed to
    ``parallelize_dataframe``.
    """
    enriched = df.apply(extract_range_features, axis=1)
    return enriched

In [31]:
# Compute the row-wise history features in four worker processes.
total_data = parallelize_dataframe(total_data, apply_range_features, n_cores=4)
# Serial run on a handful of rows, kept for quick debugging:
# total_data = apply_range_features(total_data.head(5))
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Columns: 206 entries, age_range to start_date
dtypes: datetime64[ns](2), float64(117), int64(76), object(11)
memory usage: 203.1+ MB


#### Derive new variables

In [32]:
# Derive comparison features from the aggregates computed above.
# Each spec is (tag, minuend prefix, subtrahend prefix): the absolute
# difference is minuend - subtrahend, and the relative difference ("diff_p_")
# divides it by the minuend.
diff_specs = [
    ('overall_', 'overall_', 'overall_coupon_'),
    ('range_', 'range_', 'range_coupon_'),
    ('coupon_', 'overall_coupon_', 'range_coupon_'),
]
for column in ['unique_items', 'items', 'quantity', 'sprice', 'bprice', 'odiscount', 'cdiscount', 'tdiscount', 'podiscount', 'pcdiscount', 'ptdiscount', 'sprice_pq', 'bprice_pq', 'odiscount_pq', 'cdiscount_pq', 'tdiscount_pq', 'podiscount_pq', 'pcdiscount_pq', 'ptdiscount_pq', 'unique_brand', 'unique_brandt', 'unique_category', 'coverage_brand', 'coverage_category']:
    for tag, minuend, subtrahend in diff_specs:
        total_data['diff_'+tag+column] = total_data[minuend+column] - total_data[subtrahend+column]

    for tag, minuend, _ in diff_specs:
        total_data['diff_p_'+tag+column] = total_data['diff_'+tag+column] / total_data[minuend+column]

# Gap between what the coupon covers and what the customer bought of it
# inside the look-back window.
for column in ['unique_items', 'unique_brand', 'unique_brandt', 'unique_category', 'coverage_item', 'coverage_brand', 'coverage_brandt', 'coverage_category']:
    coupon_side = total_data['c_'+column]
    customer_side = total_data['range_coupon_'+column]
    total_data['c_diff_'+column] = coupon_side - customer_side

# 1 when the coupon's most-common / rarest value equals the customer's, else 0.
for column in ['freq_brand', 'rare_brand', 'freq_brandt', 'rare_brandt', 'freq_category', 'rare_category']:
    is_match = total_data['c_'+column].eq(total_data['range_coupon_'+column])
    total_data['match_'+column] = is_match.astype('int')

total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Columns: 364 entries, age_range to match_rare_category
dtypes: datetime64[ns](2), float64(247), int64(104), object(11)
memory usage: 358.1+ MB


 #### Splitting total data into train and test

In [33]:
# Pull the engineered features for the test ids back out of the combined frame.
# Note: this rebinds test_data from the raw rows to the feature frame.
test_data = test_data[['id']].merge(total_data, on='id', how='left')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50226 entries, 0 to 50225
Columns: 364 entries, id to match_rare_category
dtypes: datetime64[ns](2), float64(247), int64(104), object(11)
memory usage: 139.9+ MB


In [34]:
# Save the test feature matrix, without the row index.
test_data.to_csv('data/test/test_feature.csv', index=False)

In [35]:
# Pull the engineered features for the train ids back out of the combined
# frame, keeping the redemption_status target from the raw training rows.
# Note: this rebinds train_data from the raw rows to the feature frame.
train_data = train_data[['id','redemption_status']].merge(total_data, on='id', how='left')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 0 to 78368
Columns: 365 entries, id to match_rare_category
dtypes: datetime64[ns](2), float64(247), int64(105), object(11)
memory usage: 218.8+ MB


In [36]:
# Save the train feature matrix (including the target), without the row index.
train_data.to_csv('data/train/train_feature.csv', index=False)

# Summary

1. Customer Information<br>
    a. There were many missing values in "no_of_children" and "marital_status"<br>
    b. Missing "no_of_children" values are assumed to mean zero children<br>
    c. Missing "marital_status" values were inferred from family_size and no_of_children<br>
2. Train and test data contain many customers whose demographic information is not available
3. Derived new variables from coupons and customer transactions
4. Merged all other data with the train and test datasets
5. Derived new variables capturing changes in customer behavior and matches with coupon data
6. There are many outliers in the dataset; the model must learn to handle them in order to predict redemption