In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Item catalogue: brand_type is recoded to a 0/1 flag (Established -> 1, Local -> 0).
item_data = pd.read_csv(filepath_or_buffer='data/train/item_data.csv')
item_data = item_data.assign(
    brand_type=item_data['brand_type'].replace({'Established': 1, 'Local': 0})
)
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,1,Grocery
1,2,1,1,Miscellaneous
2,3,56,0,Bakery
3,4,56,0,Grocery
4,5,56,0,Grocery


In [3]:
# Campaign table: both date columns are parsed day-first while reading.
campaign_data = pd.read_csv(
    'data/train/campaign_data.csv',
    parse_dates=['start_date', 'end_date'],
    dayfirst=True,
)
# Recode campaign_type to integers (X -> 0, Y -> 1).
campaign_data['campaign_type'] = campaign_data['campaign_type'].replace({'X': 0, 'Y': 1})
# Campaign length in whole days, end_date minus start_date.
campaign_data['duration'] = campaign_data['end_date'].sub(campaign_data['start_date']).dt.days
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(3)
memory usage: 1.2 KB


In [4]:
# Copy of the campaign table keyed by campaign_id, so a campaign can be
# looked up directly with .loc[campaign_id].
campaign_data_2 = campaign_data.set_index(keys='campaign_id')
campaign_data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 24 to 26
Data columns (total 4 columns):
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(2)
memory usage: 1.1 KB


In [5]:
def get_marital_status(row):
    """Return the marital status for one customer row, inferring it if missing.

    A non-null ``marital_status`` is returned unchanged. Otherwise the status
    is derived from the household: when ``family_size - no_of_children`` is
    greater than 1 the result is ``'Married'``, else ``'Single'``.
    """
    status = row['marital_status']
    if pd.notna(status):
        return status

    # Household members that are not children.
    adults = row['family_size'] - row['no_of_children']
    if adults > 1:
        return 'Married'
    return 'Single'

# Load customer demographics and convert the string-encoded count columns to integers.
customer_data = pd.read_csv('data/train/customer_demographics.csv')
# regex=False: '+' is a regex quantifier, so remove it as a literal character
# instead of relying on the pandas-version-dependent default of `regex`.
customer_data['family_size'] = (
    customer_data['family_size'].str.replace('+', '', regex=False).astype('int')
)
# A missing no_of_children is filled with '0' before the conversion.
customer_data['no_of_children'] = (
    customer_data['no_of_children'].fillna('0').str.replace('+', '', regex=False).astype('int')
)
# Fill missing marital_status row by row from family_size / no_of_children.
customer_data['marital_status'] = customer_data.apply(get_marital_status, axis=1)

customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,0,4
1,6,46-55,Married,0,2,0,5
2,7,26-35,Married,0,3,1,3
3,8,26-35,Married,0,4,2,6
4,10,46-55,Single,0,1,0,5


In [6]:
# Purchase log; `date` is parsed to datetime while reading.
transaction_data = pd.read_csv('data/train/customer_transaction_data.csv', parse_dates=['date'])

# Combined discount per transaction row.
transaction_data['total_discount'] = transaction_data['coupon_discount'].add(transaction_data['other_discount'])

# "_pq" columns: price and discounts divided by the purchased quantity.
transaction_data['selling_price_pq'] = transaction_data['selling_price'].div(transaction_data['quantity'])
transaction_data['other_discount_pq'] = transaction_data['other_discount'].div(transaction_data['quantity'])
transaction_data['coupon_discount_pq'] = transaction_data['coupon_discount'].div(transaction_data['quantity'])
transaction_data['total_discount_pq'] = transaction_data['coupon_discount_pq'].add(transaction_data['other_discount_pq'])

transaction_data['date'] = pd.to_datetime(transaction_data['date'])

# Attach item attributes, then key the log by (customer_id, date).
transaction_data = (
    transaction_data
    .merge(item_data, on='item_id', how='left')
    .set_index(['customer_id', 'date'])
)
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1324566 entries, (1501, 2012-01-02 00:00:00) to (1129, 2013-06-30 00:00:00)
Data columns (total 13 columns):
item_id               1324566 non-null int64
quantity              1324566 non-null int64
selling_price         1324566 non-null float64
other_discount        1324566 non-null float64
coupon_discount       1324566 non-null float64
total_discount        1324566 non-null float64
selling_price_pq      1324566 non-null float64
other_discount_pq     1324566 non-null float64
coupon_discount_pq    1324566 non-null float64
total_discount_pq     1324566 non-null float64
brand                 1324566 non-null int64
brand_type            1324566 non-null int64
category              1324566 non-null object
dtypes: float64(8), int64(4), object(1)
memory usage: 136.4+ MB


In [7]:
# Coupon -> item mapping, enriched with each item's brand, brand_type and category.
coupon_item = pd.merge(
    pd.read_csv('data/train/coupon_item_mapping.csv'),
    item_data,
    how='left',
    on='item_id',
)
coupon_item.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,0,Grocery
1,107,75,56,0,Grocery
2,494,76,209,1,Grocery
3,522,77,278,1,Grocery
4,518,77,278,1,Grocery


In [8]:
def most_frequent(s):
    """Return the most common non-null value of Series ``s``.

    Returns NaN when ``s`` has no non-null values (empty or all-NaN) instead
    of raising ``IndexError``.
    """
    counts = s.value_counts()
    if counts.empty:
        return np.nan
    # value_counts sorts by descending count, so the first label is the mode.
    return counts.index[0]

def least_frequent(s):
    """Return the least common non-null value of Series ``s``.

    Returns NaN when ``s`` has no non-null values (empty or all-NaN) instead
    of raising ``IndexError``.
    """
    counts = s.value_counts()
    if counts.empty:
        return np.nan
    # value_counts sorts by descending count, so the last label is the rarest.
    return counts.index[-1]

def most_frequent_count(s):
    """Return how many times the most common non-null value occurs in ``s``.

    Returns 0 when ``s`` has no non-null values (empty or all-NaN) instead
    of raising ``IndexError``.
    """
    counts = s.value_counts()
    if counts.empty:
        return 0
    # value_counts sorts by descending count, so the first entry is the largest.
    return counts.values[0]

def least_frequent_count(s):
    """Return how many times the least common non-null value occurs in ``s``.

    Returns 0 when ``s`` has no non-null values (empty or all-NaN) instead
    of raising ``IndexError``.
    """
    counts = s.value_counts()
    if counts.empty:
        return 0
    # value_counts sorts by descending count, so the last entry is the smallest.
    return counts.values[-1]

# Per-coupon summary of the items each coupon covers: number of distinct
# items, and for brand / brand_type / category the number of distinct values,
# the most and least frequent value, and the counts of those two values.
coupon_data = coupon_item.groupby('coupon_id').agg({
    'item_id': ['nunique'],
    'brand': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'brand_type': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'category': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count]
})
# Flatten the (column, aggregation) header into single names. The list is
# positional: it must stay in the same order as the aggregations above.
coupon_data.columns = ['c_item_id_count', 'c_brand_count', 'c_brand_freq', 'c_brand_rare', 
                       'c_brand_freq_count', 'c_brand_rare_count', 'c_brand_type_count', 'c_brand_type_freq', 
                       'c_brand_type_rare', 'c_brand_type_freq_count', 'c_brand_type_rare_count', 
                       'c_category_count', 'c_category_freq', 'c_category_rare', 'c_category_freq_count', 
                       'c_category_rare_count']
coupon_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1116 entries, 1 to 1116
Data columns (total 16 columns):
c_item_id_count            1116 non-null int64
c_brand_count              1116 non-null int64
c_brand_freq               1116 non-null int64
c_brand_rare               1116 non-null int64
c_brand_freq_count         1116 non-null int64
c_brand_rare_count         1116 non-null int64
c_brand_type_count         1116 non-null int64
c_brand_type_freq          1116 non-null int64
c_brand_type_rare          1116 non-null int64
c_brand_type_freq_count    1116 non-null int64
c_brand_type_rare_count    1116 non-null int64
c_category_count           1116 non-null int64
c_category_freq            1116 non-null object
c_category_rare            1116 non-null object
c_category_freq_count      1116 non-null int64
c_category_rare_count      1116 non-null int64
dtypes: int64(14), object(2)
memory usage: 148.2+ KB


In [9]:
# Labelled rows; includes the redemption_status target column.
train_data = pd.read_csv(filepath_or_buffer='data/train/train.csv')
train_data.shape

(78369, 5)

In [10]:
# Rows to be scored.
test_data = pd.read_csv(filepath_or_buffer='data/test/test.csv')
test_data.shape

(50226, 4)

In [11]:
# Every train column except the target.
columns = train_data.columns[train_data.columns != 'redemption_status']
# Stack train and test so features are engineered once for both.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; the original row labels are kept and columns are sorted,
# exactly as append(..., sort=True) did.
total_data = pd.concat([train_data[columns], test_data], sort=True)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 50225
Data columns (total 4 columns):
campaign_id    128595 non-null int64
coupon_id      128595 non-null int64
customer_id    128595 non-null int64
id             128595 non-null int64
dtypes: int64(4)
memory usage: 4.9 MB


In [12]:
# Left-join campaign, customer and coupon attributes onto every train/test row.
total_data = (
    total_data
    .merge(campaign_data, on='campaign_id', how='left')
    .merge(customer_data, on='customer_id', how='left')
    .merge(coupon_data, on='coupon_id', how='left')
)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 30 columns):
campaign_id                128595 non-null int64
coupon_id                  128595 non-null int64
customer_id                128595 non-null int64
id                         128595 non-null int64
campaign_type              128595 non-null int64
start_date                 128595 non-null datetime64[ns]
end_date                   128595 non-null datetime64[ns]
duration                   128595 non-null int64
age_range                  74600 non-null object
marital_status             74600 non-null object
rented                     74600 non-null float64
family_size                74600 non-null float64
no_of_children             74600 non-null float64
income_bracket             74600 non-null float64
c_item_id_count            128595 non-null int64
c_brand_count              128595 non-null int64
c_brand_freq               128595 non-null int64
c_brand_rare               128595 

In [14]:
def get_transaction_details(row):
    """Add purchase-history features for the row's customer to ``row``.

    Looks up the start date of ``row.campaign_id`` in ``campaign_data_2`` and
    takes every transaction of ``row.customer_id`` from ``transaction_data``
    dated on or before that start date. From this history it writes onto
    ``row``, in this order:

    * ``t_item_id_count``: number of distinct items;
    * for ``brand``, ``brand_type`` and ``category``: ``t_<col>_count``
      (distinct values), ``t_<col>_freq`` / ``t_<col>_rare`` (most / least
      frequent value) and ``t_<col>_freq_count`` / ``t_<col>_rare_count``
      (their occurrence counts);
    * ``min_<col>``, ``max_<col>``, ``mean_<col>`` for the price and discount
      columns and their per-quantity (``_pq``) variants;
    * ``transc_coupon``, ``transc_other``, ``transc_any``: share of history
      rows whose coupon / other / total discount is below zero.

    When the history is empty, the frequent/rare values are NaN, their counts
    are 0, min/max/mean are NaN and the three shares are 0.

    Returns the same ``row`` object with the new entries added.
    """
    start_date = campaign_data_2.loc[row.campaign_id].start_date
    customer_history = transaction_data.loc[row.customer_id]
    # Boolean mask instead of the label slice [:str(start_date)]: slicing an
    # unsorted DatetimeIndex by a label that may be absent is version-dependent
    # in pandas, while the comparison is always defined.
    # NOTE(review): equivalent to the old slice as long as transaction dates
    # carry no sub-second component — confirm.
    history = customer_history[customer_history.index <= start_date]

    row['t_item_id_count'] = history['item_id'].nunique()

    # Frequency features; value_counts is computed once per column and sorted
    # by descending count, so first = most frequent and last = least frequent.
    for column in ('brand', 'brand_type', 'category'):
        counts = history[column].value_counts()
        prefix = 't_' + column
        row[prefix + '_count'] = history[column].nunique()
        if counts.empty:
            # No usable history: indexing counts would raise IndexError.
            row[prefix + '_freq'] = np.nan
            row[prefix + '_rare'] = np.nan
            row[prefix + '_freq_count'] = 0
            row[prefix + '_rare_count'] = 0
        else:
            row[prefix + '_freq'] = counts.index[0]
            row[prefix + '_rare'] = counts.index[-1]
            row[prefix + '_freq_count'] = counts.values[0]
            row[prefix + '_rare_count'] = counts.values[-1]

    # Min / max / mean of every price and discount column.
    for column in ('selling_price', 'other_discount', 'coupon_discount', 'total_discount',
                   'selling_price_pq', 'other_discount_pq', 'coupon_discount_pq',
                   'total_discount_pq'):
        values = history[column]
        row['min_' + column] = values.min()
        row['max_' + column] = values.max()
        row['mean_' + column] = values.mean()

    # Share of history rows with a discount below zero
    # (presumably discounts are stored as negative amounts — confirm).
    n_transactions = history.shape[0]
    if n_transactions != 0:
        row['transc_coupon'] = (history['coupon_discount'] < 0).sum() / n_transactions
        row['transc_other'] = (history['other_discount'] < 0).sum() / n_transactions
        row['transc_any'] = (history['total_discount'] < 0).sum() / n_transactions
    else:
        row['transc_coupon'] = 0
        row['transc_other'] = 0
        row['transc_any'] = 0

    return row
    
# Compute the purchase-history features one row at a time.
total_data = total_data.apply(get_transaction_details, axis='columns')
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 73 columns):
campaign_id                15 non-null int64
coupon_id                  15 non-null int64
customer_id                15 non-null int64
id                         15 non-null int64
campaign_type              15 non-null int64
start_date                 15 non-null datetime64[ns]
end_date                   15 non-null datetime64[ns]
duration                   15 non-null int64
age_range                  12 non-null object
marital_status             12 non-null object
rented                     12 non-null float64
family_size                12 non-null float64
no_of_children             12 non-null float64
income_bracket             12 non-null float64
c_item_id_count            15 non-null int64
c_brand_count              15 non-null int64
c_brand_freq               15 non-null int64
c_brand_rare               15 non-null int64
c_brand_freq_count         15 non-null int64
c_brand_rare_c

In [None]:
# Pull the engineered features back for the test ids.
test_data = pd.merge(test_data[['id']], total_data, on='id', how='left')
test_data.info()

In [None]:
# Persist the test feature table without the row index.
test_data.to_csv(path_or_buf='data/test/test_feature.csv', index=False)

In [None]:
# Pull the engineered features back for the train ids, keeping the target column.
train_data = pd.merge(train_data[['id', 'redemption_status']], total_data, on='id', how='left')
train_data.info()

In [None]:
# Persist the train feature table without the row index.
train_data.to_csv(path_or_buf='data/train/train_feature.csv', index=False)