In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import  Pool

In [2]:
def most_frequent(s):
    return s.value_counts().index[0]

def least_frequent(s):
    return s.value_counts().index[-1]

def most_frequent_count(s):
    return s.value_counts().values[0]

def least_frequent_count(s):
    return s.value_counts().values[-1]

def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [3]:
item_data = pd.read_csv('data/train/item_data.csv')
item_data['brand_type'] = item_data['brand_type'].replace({'Established': 1, 'Local': 0})
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,1,Grocery
1,2,1,1,Miscellaneous
2,3,56,0,Bakery
3,4,56,0,Grocery
4,5,56,0,Grocery


In [4]:
total_items = item_data['item_id'].nunique()
total_brands= item_data['brand'].nunique()
total_brand_types = item_data['brand_type'].nunique()
total_categories = item_data['category'].nunique()

In [5]:
campaign_data = pd.read_csv('data/train/campaign_data.csv', 
                            parse_dates=['start_date', 'end_date'], dayfirst=True)
campaign_data['campaign_type'] = campaign_data['campaign_type'].replace({'X': 0, 'Y': 1})
campaign_data['duration'] = (campaign_data['end_date'] - campaign_data['start_date']).dt.days
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(3)
memory usage: 1.2 KB


In [6]:
campaign_data_2 = campaign_data.set_index('campaign_id')
campaign_data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 24 to 26
Data columns (total 4 columns):
campaign_type    28 non-null int64
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
duration         28 non-null int64
dtypes: datetime64[ns](2), int64(2)
memory usage: 1.1 KB


In [7]:
def get_marital_status(row):
    na_row = row.isna()
    if not na_row['marital_status']:
        return row['marital_status']
    return 'Married' if row['family_size'] - row['no_of_children'] > 1 else 'Single'

customer_data = pd.read_csv('data/train/customer_demographics.csv')
customer_data['family_size'] = customer_data['family_size'].str.replace('+','').astype('int')
customer_data['no_of_children'] = customer_data['no_of_children'].fillna('0').str.replace('+','').astype('int')
customer_data['marital_status'] = customer_data.apply(get_marital_status, axis=1)
customer_data['marital_status'] = customer_data['marital_status'].replace({'Single': 0, 'Married': 1})

customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null object
marital_status    760 non-null int64
rented            760 non-null int64
family_size       760 non-null int64
no_of_children    760 non-null int64
income_bracket    760 non-null int64
dtypes: int64(6), object(1)
memory usage: 41.6+ KB


In [8]:
transaction_data = pd.read_csv('data/train/customer_transaction_data.csv', parse_dates=['date'])
transaction_data['total_discount'] = transaction_data['coupon_discount'] + transaction_data['other_discount']
transaction_data['buying_price'] = transaction_data['selling_price'] + transaction_data['other_discount']
transaction_data['selling_price_pq'] = transaction_data['selling_price'] / transaction_data['quantity']
transaction_data['other_discount_pq'] = transaction_data['other_discount'] / transaction_data['quantity']
transaction_data['coupon_discount_pq'] = transaction_data['coupon_discount'] / transaction_data['quantity']
transaction_data['total_discount_pq'] = transaction_data['coupon_discount_pq'] + transaction_data['other_discount_pq']
transaction_data['buying_price_pq'] = transaction_data['selling_price_pq'] + transaction_data['other_discount_pq']
transaction_data['date'] = pd.to_datetime(transaction_data['date'])
transaction_data = transaction_data.merge(item_data, on='item_id', how='left')
transaction_data = transaction_data.set_index(['customer_id','date']).sort_index()
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1324566 entries, (1, 2012-02-21 00:00:00) to (1582, 2013-06-25 00:00:00)
Data columns (total 15 columns):
item_id               1324566 non-null int64
quantity              1324566 non-null int64
selling_price         1324566 non-null float64
other_discount        1324566 non-null float64
coupon_discount       1324566 non-null float64
total_discount        1324566 non-null float64
buying_price          1324566 non-null float64
selling_price_pq      1324566 non-null float64
other_discount_pq     1324566 non-null float64
coupon_discount_pq    1324566 non-null float64
total_discount_pq     1324566 non-null float64
buying_price_pq       1324566 non-null float64
brand                 1324566 non-null int64
brand_type            1324566 non-null int64
category              1324566 non-null object
dtypes: float64(10), int64(4), object(1)
memory usage: 156.7+ MB


In [9]:
customer_history = transaction_data.reset_index().groupby('customer_id').agg({
    'item_id': ['nunique', 'count'],
    'quantity': 'sum',
    'selling_price': 'mean',
    'buying_price': 'mean',
    'other_discount': 'mean',
    'coupon_discount': 'mean',
    'total_discount': 'mean',
    'selling_price_pq': 'mean',
    'buying_price_pq': 'mean',
    'other_discount_pq': 'mean',
    'coupon_discount_pq': 'mean',
    'total_discount_pq': 'mean',
    'brand': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'brand_type': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'category': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count]
})
customer_history.columns = ['overall_unique_items', 'overall_items', 'overall_quantity', 'overall_sprice', 'overall_bprice', 'overall_odiscount', 'overall_cdiscount', 'overall_tdiscount', 'overall_sprice_pq', 'overall_bprice_pq', 'overall_odiscount_pq', 'overall_cdiscount_pq', 'overall_tdiscount_pq', 'overall_unique_brand', 'overall_freq_brand', 'overall_rare_brand', 'overall_items_freq_brand', 'overall_items_rare_brand', 'overall_unique_brandt', 'overall_freq_brandt', 'overall_rare_brandt', 'overall_items_freq_brandt', 'overall_items_rare_brandt', 'overall_unique_category', 'overall_freq_category', 'overall_rare_category', 'overall_items_freq_category', 'overall_items_rare_category']
customer_history['overall_coverage_item'] = customer_history['overall_unique_items'] / total_items
customer_history['overall_coverage_brand'] = customer_history['overall_unique_brand'] / total_brands
customer_history['overall_coverage_brandt'] = customer_history['overall_unique_brandt'] / total_brand_types
customer_history['overall_coverage_category'] = customer_history['overall_unique_category'] / total_categories
customer_history = customer_history.reset_index()
customer_history.head()

Unnamed: 0,customer_id,overall_unique_items,overall_items,overall_quantity,overall_sprice,overall_bprice,overall_odiscount,overall_cdiscount,overall_tdiscount,overall_sprice_pq,...,overall_items_rare_brandt,overall_unique_category,overall_freq_category,overall_rare_category,overall_items_freq_category,overall_items_rare_category,overall_coverage_item,overall_coverage_brand,overall_coverage_brandt,overall_coverage_category
0,1,463,1048,1227,94.001842,77.75146,-16.250382,-2.019876,-18.270258,84.632943,...,90,9,Grocery,Meat,774,2,0.006251,0.029486,1.0,0.473684
1,2,352,419,474,102.864033,86.033604,-16.83043,-0.595084,-17.425513,94.373588,...,92,9,Grocery,Seafood,295,1,0.004753,0.027677,1.0,0.473684
2,3,406,705,8163,103.617404,80.903177,-22.714227,-3.091546,-25.805773,71.007598,...,112,8,Grocery,Fuel,577,1,0.005482,0.020622,1.0,0.421053
3,4,125,220,280,154.423727,141.118318,-13.305409,-0.404773,-13.710182,129.373114,...,28,8,Grocery,"Dairy, Juices & Snacks",168,1,0.001688,0.013025,1.0,0.421053
4,5,490,792,93353,130.827146,117.16923,-13.657917,-0.114684,-13.772601,104.222145,...,178,14,Grocery,Miscellaneous,405,1,0.006616,0.030391,1.0,0.736842


In [10]:
coupon_item = pd.read_csv('data/train/coupon_item_mapping.csv')
coupon_item = coupon_item.merge(item_data, how='left', on='item_id')
coupon_item.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,0,Grocery
1,107,75,56,0,Grocery
2,494,76,209,1,Grocery
3,522,77,278,1,Grocery
4,518,77,278,1,Grocery


In [11]:
coupon_item_2 = coupon_item.set_index('coupon_id')
coupon_item_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92663 entries, 105 to 31
Data columns (total 4 columns):
item_id       92663 non-null int64
brand         92663 non-null int64
brand_type    92663 non-null int64
category      92663 non-null object
dtypes: int64(3), object(1)
memory usage: 3.5+ MB


In [12]:


coupon_data = coupon_item.groupby('coupon_id').agg({
    'item_id': ['nunique'],
    'brand': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'brand_type': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count],
    'category': ['nunique', most_frequent, least_frequent, most_frequent_count, least_frequent_count]
})
coupon_data.columns = ['c_unique_items', 'c_unique_brand', 'c_freq_brand', 'c_rare_brand', 
                       'c_items_freq_brand', 'c_items_rare_brand', 'c_unique_brandt', 'c_freq_brandt',
                       'c_rare_brandt', 'c_items_freq_brandt', 'c_items_rare_brandt', 
                       'c_unique_category', 'c_freq_category', 'c_rare_category', 'c_items_freq_category', 
                       'c_items_rare_category']
coupon_data['c_coverage_item'] = coupon_data['c_unique_items'] / total_items
coupon_data['c_coverage_brand'] = coupon_data['c_unique_brand'] / total_brands
coupon_data['c_coverage_brandt'] = coupon_data['c_unique_brandt'] / total_brand_types
coupon_data['c_coverage_category'] = coupon_data['c_unique_category'] / total_categories

coupon_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1116 entries, 1 to 1116
Data columns (total 20 columns):
c_unique_items           1116 non-null int64
c_unique_brand           1116 non-null int64
c_freq_brand             1116 non-null int64
c_rare_brand             1116 non-null int64
c_items_freq_brand       1116 non-null int64
c_items_rare_brand       1116 non-null int64
c_unique_brandt          1116 non-null int64
c_freq_brandt            1116 non-null int64
c_rare_brandt            1116 non-null int64
c_items_freq_brandt      1116 non-null int64
c_items_rare_brandt      1116 non-null int64
c_unique_category        1116 non-null int64
c_freq_category          1116 non-null object
c_rare_category          1116 non-null object
c_items_freq_category    1116 non-null int64
c_items_rare_category    1116 non-null int64
c_coverage_item          1116 non-null float64
c_coverage_brand         1116 non-null float64
c_coverage_brandt        1116 non-null float64
c_coverage_category      1116 

In [13]:
train_data = pd.read_csv('data/train/train.csv')
train_data.shape

(78369, 5)

In [14]:
test_data = pd.read_csv('data/test/test.csv')
test_data.shape

(50226, 4)

In [15]:
columns = train_data.columns[train_data.columns != 'redemption_status']
total_data = train_data[columns].append(test_data, sort=True)
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 50225
Data columns (total 4 columns):
campaign_id    128595 non-null int64
coupon_id      128595 non-null int64
customer_id    128595 non-null int64
id             128595 non-null int64
dtypes: int64(4)
memory usage: 4.9 MB


In [16]:
total_data = total_data.merge(campaign_data, on='campaign_id', how='left')
total_data = total_data.merge(customer_data, on='customer_id', how='left')
total_data = total_data.merge(coupon_data, on='coupon_id', how='left')
total_data = total_data.merge(customer_history, on='customer_id', how='left')
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 66 columns):
campaign_id                    128595 non-null int64
coupon_id                      128595 non-null int64
customer_id                    128595 non-null int64
id                             128595 non-null int64
campaign_type                  128595 non-null int64
start_date                     128595 non-null datetime64[ns]
end_date                       128595 non-null datetime64[ns]
duration                       128595 non-null int64
age_range                      74600 non-null object
marital_status                 74600 non-null float64
rented                         74600 non-null float64
family_size                    74600 non-null float64
no_of_children                 74600 non-null float64
income_bracket                 74600 non-null float64
c_unique_items                 128595 non-null int64
c_unique_brand                 128595 non-null int64
c_freq_brand      

In [17]:
range_start_delta = pd.Timedelta('223 days')
range_end_delta = pd.Timedelta('110 days')

def extract_range_features(row):
    start_time = pd.Timestamp(campaign_data_2.loc[row.campaign_id].start_date)
    range_start, range_end = start_time - range_start_delta, start_time - range_end_delta
    items = pd.Series(coupon_item_2.loc[row.coupon_id].item_id)

    overall_history = transaction_data.loc[row.customer_id]
    history = overall_history[range_start:range_end]
    overall_coupon_history = overall_history[overall_history.item_id.isin(items)]
    coupon_history = history[history.item_id.isin(items)]

    for df_name, df in [('overall_coupon_', overall_coupon_history), ('range_', history), ('range_coupon_', coupon_history)]:
        row[df_name+'unique_items'] = df['item_id'].nunique()
        row[df_name+'items'] = df['item_id'].count()
        row[df_name+'quantity'] = df['quantity'].sum()
        
        for new_name, old_name in [('sprice', 'selling_price'), ('bprice', 'buying_price'), ('odiscount', 'other_discount'), ('cdiscount', 'coupon_discount'), ('tdiscount', 'total_discount'), ('sprice_pq', 'selling_price_pq'), ('bprice_pq', 'buying_price_pq'), ('odiscount_pq', 'other_discount_pq'), ('cdiscount_pq', 'coupon_discount_pq'), ('tdiscount_pq', 'total_discount_pq')]:
            row[df_name+''+new_name] = df[old_name].mean()
        
        for new_name, old_name in [('brand', 'brand'), ('brandt', 'brand_type'), ('category','category')]:
            unique_count = df[old_name].nunique()
            row[df_name+'unique_'+new_name] = unique_count
            
            if unique_count == 0:
                row[df_name+'items_freq_'+new_name] = 0
                row[df_name+'items_rare_'+new_name] = 0
                row[df_name+'freq_'+new_name] = np.NaN
                row[df_name+'rare_'+new_name] = np.NaN
            else:
                freq = most_frequent(df[old_name])
                row[df_name+'freq_'+new_name] = freq
                row[df_name+'items_freq_'+new_name] = df[df[old_name] == freq]['item_id'].nunique()
                row[df_name+'items_p_freq_'+new_name] = row[df_name+'items_freq_'+new_name] / df['item_id'].nunique()

                rare = least_frequent(df[old_name])
                row[df_name+'rare_'+new_name] = rare
                row[df_name+'items_rare_'+new_name] = df[df[old_name] == rare]['item_id'].nunique()
                row[df_name+'items_p_rare_'+new_name] = row[df_name+'items_rare_'+new_name] / df['item_id'].nunique()
            
        row[df_name+'coverage_item'] = row[df_name+'unique_items'] / total_items
        row[df_name+'coverage_brand'] = row[df_name+'unique_brand'] / total_brands
        row[df_name+'coverage_brandt'] = row[df_name+'unique_brandt'] / total_brand_types
        row[df_name+'coverage_category'] = row[df_name+'unique_category'] / total_categories
    
    if coupon_history.shape[0] > 0:
        row['redemption_ratio'] = coupon_history[coupon_history['coupon_discount'] < 0].shape[0] / coupon_history.shape[0]
    else:
        row['redemption_ratio'] = np.NaN
    
    if history.shape[0] > 0:
        row['overall_redemption_ratio'] = history[history['coupon_discount'] < 0].shape[0] / history.shape[0]
    else:
        row['overall_redemption_ratio'] = np.NaN
        
    return row

def apply_range_features(df):
    return df.apply(extract_range_features, axis=1)

In [18]:
total_data = parallelize_dataframe(total_data, apply_range_features, n_cores=4)
# total_data = apply_range_features(total_data.head(5))
total_data.info()

Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/series.py", line 1046, in _set_with_engine
    self.index._engine.set_value(values, key, value)
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/series.py", line 1046, in _set_with_engine
    self.index._engine.set_value(values, key, value)
  File "pandas/_libs/index.pyx", line 95, in pandas._libs.index.IndexEngine.set_value
  File "pandas/_libs/index.pyx", line 103, in pandas._libs.index.IndexEngine.set_value
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 95, in pandas._libs.index.IndexEngine.set_value
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "

  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'overall_coupon_item

  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/apply.py", line 257, in apply_standard
    self.apply_series_generator()
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/apply.py", line 286, in apply_series_generator
    results[i] = self.f(v)
  File "<ipython-input-17-7f2db0712eae>", line 40, in extract_range_features
    row[df_name+'items_p_rare_'+new_name] = row[df_name+'items_rare_'+new_name] / df['item_id'].nunique()
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/series.py", line 1039, in __setitem__
    setitem(key, value)
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/series.py", line 1015, in setitem
    self.loc[key] = value
  File "/home/monilgudhka/anaconda3/envs/ds/lib/python3.7/site-packages/pandas/core/indexing.py", line 190, in __setitem__
    self._setitem_with_indexer(indexer, value)
  File "/home/monilgudhka/anaconda3/envs/d

KeyboardInterrupt: 

In [None]:
# Deriving more variables from existing
for column in ['unique_items', 'items', 'quantity', 'sprice', 'bprice', 'odiscount', 'cdiscount', 'tdiscount', 'sprice_pq', 'bprice_pq', 'odiscount_pq', 'cdiscount_pq', 'tdiscount_pq', 'unique_brand', 'unique_brandt', 'unique_category', 'coverage_brand', 'coverage_category']:
    total_data['diff_overall_'+column] = total_data['overall_'+column] - total_data['overall_coupon_'+column]
    total_data['diff_range_'+column] = total_data['range_'+column] - total_data['range_coupon_'+column]
    total_data['diff_coupon_'+column] = total_data['overall_coupon_'+column] - total_data['range_coupon_'+column]
    
    total_data['diff_p_overall_'+column] = total_data['diff_overall_'+column] / total_data['overall_'+column]
    total_data['diff_p_range_'+column] = total_data['diff_range_'+column] / total_data['range_'+column]
    total_data['diff_p_coupon_'+column] = total_data['diff_coupon_'+column] / total_data['overall_coupon_'+column]

for column in ['unique_items', 'unique_brand', 'unique_brandt', 'unique_category', 'coverage_item', 'coverage_brand', 'coverage_brandt', 'coverage_category']:
    total_data['c_diff_'+column] = total_data['c_'+column] - total_data['range_coupon_'+column]
    
for column in ['freq_brand', 'rare_brand', 'freq_brandt', 'rare_brandt', 'freq_category', 'rare_category']:
    total_data['match_'+column] = (total_data['c_'+column] == total_data['range_coupon_'+column]).astype('int')
    
total_data.info()

In [None]:
test_data = test_data[['id']].merge(total_data, on='id', how='left')
test_data.info()

In [None]:
# test_data.to_csv('data/test/test_feature.csv', index=False)

In [None]:
train_data = train_data[['id','redemption_status']].merge(total_data, on='id', how='left')
train_data.info()

In [None]:
# train_data.to_csv('data/train/train_feature.csv', index=False)