In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the item catalogue and recode brand_type as an integer flag
# (Established -> 1, Local -> 0) in one chained expression.
item_data = (
    pd.read_csv('data/train/item_data.csv')
    .assign(brand_type=lambda items: items['brand_type'].replace({'Established': 1, 'Local': 0}))
)
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,1,Grocery
1,2,1,1,Miscellaneous
2,3,56,0,Bakery
3,4,56,0,Grocery
4,5,56,0,Grocery


In [3]:
# Load campaigns with day-first date parsing, recode campaign_type
# (X -> 0, Y -> 1) and derive the campaign length in whole days.
campaign_data = (
    pd.read_csv(
        'data/train/campaign_data.csv',
        parse_dates=['start_date', 'end_date'],
        dayfirst=True,
    )
    .assign(campaign_type=lambda camp: camp['campaign_type'].replace({'X': 0, 'Y': 1}))
    .assign(duration=lambda camp: (camp['end_date'] - camp['start_date']).dt.days)
)
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(3)
memory usage: 1.2 KB


In [4]:
def get_marital_status(row):
    """Return the row's marital status, inferring it when it is missing.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'marital_status', 'family_size' and 'no_of_children'.

    Returns
    -------
    The existing 'marital_status' value when it is not null. Otherwise
    'Married' if family_size - no_of_children is greater than 1, else 'Single'.
    """
    if row.isna()['marital_status']:
        # Heuristic: more than one non-child household member => 'Married'.
        non_child_members = row['family_size'] - row['no_of_children']
        if non_child_members > 1:
            return 'Married'
        return 'Single'
    return row['marital_status']

# Load customer demographics and clean the household-size columns.
customer_data = pd.read_csv('data/train/customer_demographics.csv')

# family_size / no_of_children are handled as strings that may contain a literal '+'.
# regex=False makes '+' plain text on every pandas version; as a regex it is a bare
# quantifier, and older releases only accepted it via a single-character special case
# (with a FutureWarning in 1.2-1.5).
customer_data['family_size'] = (
    customer_data['family_size'].str.replace('+', '', regex=False).astype('int')
)
# A missing no_of_children is treated as zero children before the same cleanup.
customer_data['no_of_children'] = (
    customer_data['no_of_children'].fillna('0').str.replace('+', '', regex=False).astype('int')
)

# Fill missing marital_status row by row from the cleaned household columns.
customer_data['marital_status'] = customer_data.apply(get_marital_status, axis=1)

customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,0,4
1,6,46-55,Married,0,2,0,5
2,7,26-35,Married,0,3,1,3
3,8,26-35,Married,0,4,2,6
4,10,46-55,Single,0,1,0,5


In [5]:
# Load customer transactions (parsing 'date') and derive discount features.
transaction_data = pd.read_csv(
    'data/train/customer_transaction_data.csv',
    parse_dates=['date'],
)

# Combined discount for the whole transaction line.
transaction_data['total_discount'] = (
    transaction_data['coupon_discount'].add(transaction_data['other_discount'])
)

# Per-unit ("_pq") versions: each amount divided by the purchased quantity.
transaction_data['selling_price_pq'] = (
    transaction_data['selling_price'].div(transaction_data['quantity'])
)
transaction_data['other_discount_pq'] = (
    transaction_data['other_discount'].div(transaction_data['quantity'])
)
transaction_data['coupon_discount_pq'] = (
    transaction_data['coupon_discount'].div(transaction_data['quantity'])
)
transaction_data['total_discount_pq'] = (
    transaction_data['coupon_discount_pq'].add(transaction_data['other_discount_pq'])
)
transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,total_discount,selling_price_pq,other_discount_pq,coupon_discount_pq,total_discount_pq
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,-10.69,35.26,-10.69,0.0,-10.69
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,-13.89,53.43,-13.89,0.0,-13.89
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,-14.25,106.5,-14.25,0.0,-14.25
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,0.0,67.32,0.0,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,-28.14,71.24,-28.14,0.0,-28.14


In [6]:
# Coupon -> item mapping, enriched with each item's attributes from item_data.
# The left join keeps every mapping row, including items absent from item_data.
coupon_item = (
    pd.read_csv('data/train/coupon_item_mapping.csv')
    .merge(item_data, how='left', on='item_id')
)
coupon_item.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,0,Grocery
1,107,75,56,0,Grocery
2,494,76,209,1,Grocery
3,522,77,278,1,Grocery
4,518,77,278,1,Grocery


In [7]:
def most_frequent(s):
    """Return the value that occurs most often in the Series ``s``.

    Relies on ``value_counts`` ordering (descending by count, nulls excluded)
    and takes the first label. Raises IndexError when ``s`` has no non-null values.
    """
    counts = s.value_counts()
    return counts.index[0]

def least_frequent(s):
    """Return the value that occurs least often in the Series ``s``.

    Relies on ``value_counts`` ordering (descending by count, nulls excluded)
    and takes the last label. Raises IndexError when ``s`` has no non-null values.
    """
    counts = s.value_counts()
    return counts.index[-1]

def most_frequent_count(s):
    """Return how many times the most common value of the Series ``s`` occurs.

    Nulls are not counted. Raises IndexError when ``s`` has no non-null values.
    """
    counts = s.value_counts()
    return counts.iloc[0]

def least_frequent_count(s):
    """Return how many times the rarest value of the Series ``s`` occurs.

    Nulls are not counted. Raises IndexError when ``s`` has no non-null values.
    """
    counts = s.value_counts()
    return counts.iloc[-1]

# Per-coupon summary of the items each coupon covers: number of distinct items,
# and for brand / brand_type / category the number of distinct values, the most and
# least frequent value, and how often each of those occurs.
coupon_data = coupon_item.groupby('coupon_id').agg({
    'item_id': ['nunique'],
    **{
        attribute: ['nunique', most_frequent, least_frequent,
                    most_frequent_count, least_frequent_count]
        for attribute in ('brand', 'brand_type', 'category')
    },
})

# Flatten the (column, aggregation) MultiIndex into single-level names; the suffix
# order mirrors the aggregation order used above.
coupon_data.columns = ['item_id_count'] + [
    attribute + suffix
    for attribute in ('brand', 'brand_type', 'category')
    for suffix in ('_count', '_freq', '_rare', '_freq_count', '_rare_count')
]
coupon_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1116 entries, 1 to 1116
Data columns (total 16 columns):
item_id_count            1116 non-null int64
brand_count              1116 non-null int64
brand_freq               1116 non-null int64
brand_rare               1116 non-null int64
brand_freq_count         1116 non-null int64
brand_rare_count         1116 non-null int64
brand_type_count         1116 non-null int64
brand_type_freq          1116 non-null int64
brand_type_rare          1116 non-null int64
brand_type_freq_count    1116 non-null int64
brand_type_rare_count    1116 non-null int64
category_count           1116 non-null int64
category_freq            1116 non-null object
category_rare            1116 non-null object
category_freq_count      1116 non-null int64
category_rare_count      1116 non-null int64
dtypes: int64(14), object(2)
memory usage: 148.2+ KB


In [8]:
# Read the training table (it carries the `redemption_status` column that later
# cells split off) and display its (rows, columns) shape as a sanity check.
train_data = pd.read_csv('data/train/train.csv')
train_data.shape

(78369, 5)

In [9]:
# Read the test table and display its (rows, columns) shape as a sanity check.
test_data = pd.read_csv('data/test/test.csv')
test_data.shape

(50226, 4)

In [10]:
# Stack train (without the target) and test so features are built once for both.
# Every train column except 'redemption_status' is kept.
columns = train_data.columns[train_data.columns != 'redemption_status']

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in 2.0. As with append, the original row labels are kept (no ignore_index) and
# sort=True is passed through unchanged.
total_data = pd.concat([train_data[columns], test_data], sort=True)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 50225
Data columns (total 4 columns):
campaign_id    128595 non-null int64
coupon_id      128595 non-null int64
customer_id    128595 non-null int64
id             128595 non-null int64
dtypes: int64(4)
memory usage: 4.9 MB


In [11]:
# Attach campaign, customer and coupon features to every train/test row.
# Left joins keep every row of total_data even when a lookup table has no match.
# NOTE: this rebinds total_data, so re-running the cell merges into the merged frame.
total_data = (
    total_data
    .merge(campaign_data, on='campaign_id', how='left')
    .merge(customer_data, on='customer_id', how='left')
    .merge(coupon_data, on='coupon_id', how='left')
)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 30 columns):
campaign_id              128595 non-null int64
coupon_id                128595 non-null int64
customer_id              128595 non-null int64
id                       128595 non-null int64
campaign_type            128595 non-null int64
start_date               128595 non-null datetime64[ns]
end_date                 128595 non-null datetime64[ns]
duration                 128595 non-null int64
age_range                74600 non-null object
marital_status           74600 non-null object
rented                   74600 non-null float64
family_size              74600 non-null float64
no_of_children           74600 non-null float64
income_bracket           74600 non-null float64
item_id_count            128595 non-null int64
brand_count              128595 non-null int64
brand_freq               128595 non-null int64
brand_rare               128595 non-null int64
brand_freq_count     

In [12]:
# Pull the engineered features for the test ids out of total_data
# (left join on 'id', starting from test_data's own id column).
test_data = pd.merge(test_data[['id']], total_data, on='id', how='left')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50226 entries, 0 to 50225
Data columns (total 30 columns):
id                       50226 non-null int64
campaign_id              50226 non-null int64
coupon_id                50226 non-null int64
customer_id              50226 non-null int64
campaign_type            50226 non-null int64
start_date               50226 non-null datetime64[ns]
end_date                 50226 non-null datetime64[ns]
duration                 50226 non-null int64
age_range                30939 non-null object
marital_status           30939 non-null object
rented                   30939 non-null float64
family_size              30939 non-null float64
no_of_children           30939 non-null float64
income_bracket           30939 non-null float64
item_id_count            50226 non-null int64
brand_count              50226 non-null int64
brand_freq               50226 non-null int64
brand_rare               50226 non-null int64
brand_freq_count         50226 non-

In [13]:
# Write the feature-enriched test rows to disk; index=False omits the row index.
test_data.to_csv('data/test/test_feature.csv', index=False)

In [14]:
# Pull the engineered features for the training ids out of total_data,
# keeping the target column next to the id (left join on 'id').
train_data = pd.merge(
    train_data[['id', 'redemption_status']],
    total_data,
    on='id',
    how='left',
)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 0 to 78368
Data columns (total 31 columns):
id                       78369 non-null int64
redemption_status        78369 non-null int64
campaign_id              78369 non-null int64
coupon_id                78369 non-null int64
customer_id              78369 non-null int64
campaign_type            78369 non-null int64
start_date               78369 non-null datetime64[ns]
end_date                 78369 non-null datetime64[ns]
duration                 78369 non-null int64
age_range                43661 non-null object
marital_status           43661 non-null object
rented                   43661 non-null float64
family_size              43661 non-null float64
no_of_children           43661 non-null float64
income_bracket           43661 non-null float64
item_id_count            78369 non-null int64
brand_count              78369 non-null int64
brand_freq               78369 non-null int64
brand_rare               78369 non-

In [15]:
# Write the feature-enriched training rows (including redemption_status) to disk;
# index=False omits the row index.
train_data.to_csv('data/train/train_feature.csv', index=False)