In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from IPython.display import Audio
sound_file = './beep-01a.wav'

In [2]:
data = pd.read_csv('train.csv')
data.head(n=3)

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10000000,1,0,0,08:35,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,2,633
1,10000000,2,0,0,08:38,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630
2,10000000,3,0,0,08:38,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630


In [26]:
np.unique(data.columns)

array(['A', 'B', 'C', 'C_previous', 'D', 'E', 'F', 'G', 'age_oldest',
       'age_youngest', 'car_age', 'car_value', 'cost', 'customer_ID',
       'day', 'duration_previous', 'group_size', 'homeowner', 'location',
       'married_couple', 'record_type', 'risk_factor', 'shopping_pt',
       'state', 'time'], dtype=object)

#1. Baseline prediction

In [12]:
purchase_rows = data.groupby('customer_ID').shopping_pt.idxmax()

In [22]:
quotes_purchase = data.iloc[purchase_rows].reset_index()
quotes_before_purchase = data.iloc[purchase_rows - 1].reset_index()

In [16]:
assert len(quotes_purchase) == len(quotes_before_purchase)

In [20]:
def char_range(c1, c2):
    """Generates the characters from `c1` to `c2`, inclusive."""
    for c in xrange(ord(c1), ord(c2)+1):
        yield chr(c)
def get_plan_representation(row):
    return ''.join([`int(row[char])` for char in char_range('A', 'G')])

In [23]:
correct_count = 0
total_count = len(quotes_purchase)
for index, purchase_row in quotes_purchase.iterrows():
    before_purchase_row = quotes_before_purchase.iloc[index]
    correct_count += 1 if get_plan_representation(purchase_row) == get_plan_representation(before_purchase_row) else 0
correct_rate = float(correct_count) / total_count
Audio(url=sound_file, autoplay=True)

#2. To get a sense of the shape of the data

In [50]:
# Check the integrity of the training data
def any_null(series):
    return any(series.isnull())
pd.DataFrame(data.apply(any_null))

Unnamed: 0,0
customer_ID,False
shopping_pt,False
record_type,False
day,False
time,False
state,False
location,False
group_size,False
homeowner,False
car_age,False


In [55]:
# Also check the integrity of the test data
test_file = 'test_v2.csv'
test_data = pd.read_csv(test_file)
pd.DataFrame(test_data.apply(any_null))

Unnamed: 0,0
customer_ID,False
shopping_pt,False
record_type,False
day,False
time,False
state,False
location,True
group_size,False
homeowner,False
car_age,False


In [60]:
test_data.location.mode()

0    11179.0
dtype: float64

In [73]:
mode = lambda x: int(scipy.stats.mode(x)[0][0])
state_location_mode = test_data.groupby('state')['location'].apply(mode)
#Audio(url=sound_file, autoplay=True)

In [41]:
# Assign 'none' for all blank car values, because it can be assumed that those customers with that column blank have no cars. 
# They belong to a separate class
data.car_value[data.car_value.isnull()] = 'n'
test_data.car_value[test_data.car_value.isnull()] = 'n'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [45]:
# Assign 0 for all blank C_previous, because it can be assumed that those customers with that column blank did not enroll plan C
data.C_previous[data.C_previous.isnull()] = 0
test_data.C_previous[test_data.C_previous.isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [46]:
# Assign 0 for all blank duration_previous, because it can be assumed that these customers are new
data.duration_previous[data.duration_previous.isnull()] = 0
test_data.duration_previous[test_data.duration_previous.isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [47]:
len(quotes_before_purchase)

97009

In [51]:
# Initialize again quotes before purchase
quotes_before_purchase = data.iloc[purchase_rows - 1].reset_index()
quotes_before_purchase['average_age'] = (quotes_before_purchase.age_youngest + quotes_before_purchase.age_oldest) / 2

In [76]:
risk_factor_predictors = quotes_before_purchase[[#'state', 'location', 
                                                 'homeowner', 'car_age',
                                                 'car_value', 'average_age', 
                                                 'married_couple', 'duration_previous']]
risk_factor = quotes_before_purchase['risk_factor']

In [77]:
risk_factor_predictors.head()

Unnamed: 0,homeowner,car_age,car_value,average_age,married_couple,duration_previous
0,0,2,g,44.0,1,2.0
1,0,10,e,28.0,0,13.0
2,0,11,c,43.0,0,4.0
3,1,3,d,61.0,1,3.0
4,0,5,d,30.0,1,2.0


In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import OneHotEncoder

In [82]:
one_hot = OneHotEncoder().fit(risk_factor_predictors)
risk_factor_predictors_onehot = one_hot.transform(risk_factor_predictors)

ValueError: invalid literal for long() with base 10: 'e'

In [79]:
lr = LogisticRegression()
score = cross_val_score(lr, risk_factor_predictors, risk_factor, cv=5, verbose=1)

ValueError: could not convert string to float: e