In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [123]:
# date-parser
date_parser_func = lambda x: pd.datetime.strptime(x, '%d-%b-%y')

In [124]:
data = pd.read_csv('./data/Train_seers_accuracy.csv', parse_dates=['Transaction_Date', 'DOB'], date_parser=date_parser_func)
sub = pd.read_csv('./data/Sample_K7zT2mf.csv')

In [125]:
data.head()

Unnamed: 0,Transaction_ID,Transaction_Date,Store_ID,Number_of_EMI,Purchased_in_Sale,Var1,Var2,Var3,Client_ID,Gender,DOB,Referred_Friend,Sales_Executive_ID,Sales_Executive_Category,Lead_Source_Category,Payment_Mode,Product_Category,Transaction_Amount
0,TRA98825550,2003-01-01,STO1281,2,N,1,1,1,345821599,F,1971-08-19,NO,SD23011859,B,Advertisment,Credit/Debit Card,Cat A,17455
1,TRA98825710,2003-01-01,STO1247,2,N,1,2,1,345821734,M,1976-07-03,NO,SD23000293,B,Advertisment,Credit/Debit Card,Cat A,16503
2,TRA98823874,2003-01-01,STO1244,2,N,1,1,1,345820365,F,2059-01-04,NO,SD23011768,B,Advertisment,Credit/Debit Card,Cat A,15012
3,TRA98823889,2003-01-01,STO1256,2,N,1,1,1,345820377,M,2060-05-11,NO,SD23011691,B,Advertisment,Credit/Debit Card,Cat A,16051
4,TRA98824521,2003-01-01,STO1445,2,N,1,2,1,345820841,F,2062-03-24,NO,SD23003031,B,Reference,Cheque,Cat A,15108


## Exploratory Data Analysis

In [126]:
data.loc[:, 'transaction_year'] = data.Transaction_Date.dt.year

In [201]:
processed_df = data.groupby(['Client_ID', 'transaction_year']).size().unstack(1).fillna(0)

In [204]:
processed_df.ix[345612161][2003]

1.0

In [205]:
processed_df.head()

transaction_year,2003,2004,2005,2006
Client_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
345612161,1,0,0,0
345612467,1,0,0,0
345612552,0,0,1,0
345612557,0,1,0,0
345612625,1,0,1,0


## Model

In [219]:
def did_purchase(client_id, year):
    return int(processed_df.ix[client_id][year] > 0)

def first_purchase(client_id, year):
    year_range = [2003, 2004, 2005, 2006]
    all_other_years = [y for y in year_range if y != year]
    
    if processed_df.ix[client_id][all_other_years].any():
        return 0
    elif processed_df.ix[client_id][year] > 0:
        return 1
    else:
        return 0

In [220]:
def prob_(client_id):
    
    running_sum = 0
    for year_index in range(1, 5):
        calculated_year = 2007 - year_index
        
        did_purchase_val = (0.875 * did_purchase(client_id, calculated_year))
        was_it_first_purchase = (0.125 * first_purchase(client_id, calculated_year))
        factor = ( 5 - year_index )  ** 2
        
        running_sum = running_sum + (factor * (did_purchase_val + was_it_first_purchase))
        
    return (1 / 30.) * ( running_sum )

In [221]:
client_ids = data.Client_ID.unique()
preds = np.array([prob_(client_id) for client_id in client_ids])

In [226]:
submission = pd.DataFrame({'Client_ID': client_ids, 'Cross_Sell': preds})
submission.reset_index().to_csv('./submissions/weighted_parzen.csv', index=False)