In [3]:
%matplotlib notebook
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from matplotlib import cm
import calendar

In [4]:
# read data
data0 = pd.read_csv("assignmentdata.csv")
data0.columns  # Note 'Unnamed: 0' was created when converting .csv to .rds then to .csv again

Index(['Unnamed: 0', 'merchant', 'time', 'amount_usd_in_cents'], dtype='object')

In [5]:
# remove the extra column
data0.drop('Unnamed: 0', axis = 1, inplace = True)
data0['time'] = [dt.datetime.strptime(i, "%Y-%m-%d %H:%M:%S") for i in data0.time]
data0.reset_index(inplace = True)
data0['X'] = data0.index+1
data0.drop('index', axis = 1, inplace = True)

In [6]:
data0.head()

Unnamed: 0,merchant,time,amount_usd_in_cents,X
0,faa029c6b0,2034-06-17 23:34:14,6349,1
1,ed7a7d91aa,2034-12-27 00:40:38,3854,2
2,5608f200cf,2034-04-30 01:29:42,789,3
3,15b1a0d61e,2034-09-16 01:06:23,4452,4
4,4770051790,2034-07-22 16:21:42,20203,5


In [7]:
# convert time column from string to timestamp
data1 = data0.copy()
print(data1.shape)
print(data1.dropna().shape)  #No NAs from the dataset

# create time-related columns
data1['TransDate'] = [dt.date(i.year, i.month,i.day) for i in data1.time]
data1['TransMonth'] = [dt.date(i.year, i.month, calendar.monthrange(i.year, i.month)[1]) for i in data1.time]

(1513719, 4)
(1513719, 4)


In [8]:
data = data1.copy()
data.head()

Unnamed: 0,merchant,time,amount_usd_in_cents,X,TransDate,TransMonth
0,faa029c6b0,2034-06-17 23:34:14,6349,1,2034-06-17,2034-06-30
1,ed7a7d91aa,2034-12-27 00:40:38,3854,2,2034-12-27,2034-12-31
2,5608f200cf,2034-04-30 01:29:42,789,3,2034-04-30,2034-04-30
3,15b1a0d61e,2034-09-16 01:06:23,4452,4,2034-09-16,2034-09-30
4,4770051790,2034-07-22 16:21:42,20203,5,2034-07-22,2034-07-31


In [9]:
#************* find oneday and onetime users
dfN = pd.DataFrame(data.groupby('merchant').count().X)  # for each merchant, calculate total No. of transactions
dfN.rename(columns = {'X':'N'},  inplace = True) # call it N
data = pd.merge(dfN, data, how = 'outer', left_index = True, right_on = 'merchant') # merge to transaction dataset

Noftheday = pd.DataFrame(data.groupby(['merchant', 'TransDate']).count().X) # for each merchant, each trasaction date, count total No. of transactions on that date
Noftheday.rename(columns = {'X':'Noftheday'}, inplace = True) # call it Noftheday
data = pd.merge(Noftheday, data, how = 'outer', left_index = True, right_on = ['merchant', 'TransDate']) # merge to transaction dateset

data['Foftheday'] = data['Noftheday'] / data['N'] # calculate proportion of total transactions made on each transaction date

#********* One day user if all transactions were made on a single date
data['Oneday'] = data['Foftheday'].apply(lambda x: 1 if x == 1 else 0) 
#********* One time user if only one transaction was made
data['Onetime'] = data['N'].apply(lambda x: 1 if x == 1 else 0) 

#******** Create a df to store this information
dfOne = data[['merchant', 'Oneday', 'Onetime','N']].drop_duplicates()
dfOne.set_index('merchant', inplace = True)

#*************** identify first payment date, last payment date and user horizon of each merchant
dfhorizon = data.groupby('merchant')['TransDate'].agg({'FirstTransDate':np.min, 'LastTransDate':np.max})
dfhorizon['Horizon'] = (dfhorizon.LastTransDate - dfhorizon.FirstTransDate).dt.total_seconds()/(24 * 60 * 60)

#************** create dfmerchant to store information by merchant
dfmerchant = pd.merge(dfOne, dfhorizon, how = 'outer', left_index = True, right_index = True)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)



In [10]:
#********* for each merchant, find # of transactions made on each recorded payment dates
dftransbymerbyday = pd.DataFrame(data.groupby(['merchant', 'TransDate']).count().X) # for each merchant, each trasaction date, count total No. of transactions on that date
dftransbymerbyday.rename(columns = {'X':'N_of_Date'}, inplace = True) # call it Noftheday
dftransbymerbyday.reset_index(inplace = True)

#********* for each merchant, find # of days between any two consecutive transaction dates
dftransbymerbyday['delta_t'] = dftransbymerbyday.groupby('merchant')['TransDate'].diff()
df = dftransbymerbyday[['merchant','TransDate','delta_t']].copy()
df.dropna(inplace = True)
df['delta_t'] = [i.total_seconds()/(24 * 60 * 60) for i in df['delta_t']]
dftransbymerbyday.drop('delta_t', axis = 1, inplace = True)
dftransbymerbyday = pd.merge(dftransbymerbyday, df, how = 'outer', left_on = ['merchant', 'TransDate'], right_on = ['merchant', 'TransDate'])
dftransbymerbyday.delta_t[pd.isnull(dftransbymerbyday.delta_t) == True] = 0

#******** for each merchant, find the first and last payment date
dfdayrange = dftransbymerbyday.groupby('merchant')['TransDate'].agg({'initialdate':np.min, 'finaldate':np.max})
dftransbymerbyday = pd.merge(dftransbymerbyday, dfdayrange,how = 'outer', right_index =  True, left_on = 'merchant')

#******** for each merchant, find the min, max and average delta_t
dfdelta_t = dftransbymerbyday.groupby('merchant')['delta_t'].agg({'min_delta_t':np.min, 'avg_delta_t':np.mean, 'max_delta_t':np.max})
dftransbymerbyday = pd.merge(dftransbymerbyday, dfdelta_t, how = 'outer', right_index =  True, left_on = 'merchant')

#******** calculate days past since last payment date as of 12/31/2034
dftransbymerbyday['DistanceToLastD'] = (dt.date(2034,12,31) - dftransbymerbyday['finaldate']).dt.total_seconds()/(24*60*60)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  app.launch_new_instance()
is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)



In [11]:
df1 = data.groupby(['merchant','TransDate']).agg({'amount_usd_in_cents':np.sum})
df1['lg_amount'] = np.log(df1.amount_usd_in_cents)
df1.reset_index(inplace = True)
dftransbymerbyday = pd.merge(dftransbymerbyday,df1,how = 'outer', left_on = ['merchant', 'TransDate'], right_on = ['merchant', 'TransDate'])
dftransbymerbyday.head()

Unnamed: 0,merchant,TransDate,N_of_Date,delta_t,initialdate,finaldate,min_delta_t,avg_delta_t,max_delta_t,DistanceToLastD,amount_usd_in_cents,lg_amount
0,0002b63b92,2033-05-16,1,0.0,2033-05-16,2033-05-16,0.0,0.0,0.0,594.0,3379,8.125335
1,0002d07bba,2034-10-11,1,0.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,5549,8.621373
2,0002d07bba,2034-10-17,1,6.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,37830,10.540858
3,0002d07bba,2034-11-13,1,27.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,43842,10.688348
4,0002d07bba,2034-12-15,1,32.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,2057,7.629004


In [12]:
df_all = dftransbymerbyday
df_all.head()

Unnamed: 0,merchant,TransDate,N_of_Date,delta_t,initialdate,finaldate,min_delta_t,avg_delta_t,max_delta_t,DistanceToLastD,amount_usd_in_cents,lg_amount
0,0002b63b92,2033-05-16,1,0.0,2033-05-16,2033-05-16,0.0,0.0,0.0,594.0,3379,8.125335
1,0002d07bba,2034-10-11,1,0.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,5549,8.621373
2,0002d07bba,2034-10-17,1,6.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,37830,10.540858
3,0002d07bba,2034-11-13,1,27.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,43842,10.688348
4,0002d07bba,2034-12-15,1,32.0,2034-10-11,2034-12-15,0.0,16.25,32.0,16.0,2057,7.629004


In [13]:
#*************** Create snapshot date
snapshotdate1 = list()
for i in [2033,2034]:
    for j in range(1,13):
        snapshotdate1.append(dt.date(i, j, calendar.monthrange(i, j)[1]))

snapshotdate2 = list()
for i in [2033,2034]:
    for j in range(1,13):
        snapshotdate2.append(dt.date(i, j, 15))
    
snapshotdate = snapshotdate1
snapshotdate.sort()

In [14]:
# df_temp1 = df_all.groupby('merchant')['lg_amount'].agg([np.min, np.max, np.mean])
# df_temp1.rename(columns = {'amin':'lg_min','amax':'lg_max','mean':'lg_mean'}, inplace = True)
# df_temp1.head()

In [15]:
#snapshotdate = [dt.date(2033, 6, 30),dt.date(2033, 7, 31)]
cols = ['merchant', 'thissnapshot', 'thislastdate', 'thisdistance', 'this_max_delta_t', 
        'TtoSnapshot','avg_N_per_Pday','total_N_to_date', 'avg_N_per_Cday','lg_min','lg_max','lg_mean']
cols2 = ['merchant', 'thissnapshot', 'thislastdate', 'thisdistance', 'this_max_delta_t', 
         'TtoSnapshot','avg_N_per_Pday','total_N_to_date', 'avg_N_per_Cday','lg_min','lg_max','lg_mean','Churn']
dataset = pd.DataFrame(columns = cols2)

minthisdistance=list()
churnrate=list()

#************* stack the dataset
for i in range(len(snapshotdate)-1):
    
    # find all merchants that have previous transactions as of this snapshot dates
    df = df_all[df_all.TransDate <= snapshotdate[i]] 
    
    df['N_of_Date2'] = df.N_of_Date
    df_temp = df.groupby('merchant').agg({'TransDate':np.max, 'delta_t':np.max, 'N_of_Date':np.max, 'N_of_Date2':np.sum}) 
    df_temp.rename(columns = {'TransDate':'thislastdate','delta_t':'this_max_delta_t',
                              'N_of_Date':'avg_N_per_Pday','N_of_Date2':'total_N_to_date'}, inplace = True)
    df_temp['thissnapshot'] = snapshotdate[i]
    df_temp['thisdistance'] = (df_temp.thissnapshot - df_temp.thislastdate).dt.total_seconds()/(24*60*60)
    df = pd.merge(df, df_temp, how = 'outer', left_on = 'merchant', right_index = True)
    df['TtoSnapshot'] = (df.thissnapshot - df.initialdate).dt.total_seconds()/(24*60*60)
    df['avg_N_per_Cday'] = df.total_N_to_date/df.TtoSnapshot
    
    df_temp1 = df.groupby('merchant')['lg_amount'].agg([np.min, np.max, np.mean])
    df_temp1.rename(columns = {'amin':'lg_min','amax':'lg_max','mean':'lg_mean'}, inplace = True)
    df = pd.merge(df, df_temp1, how = 'outer', left_on = 'merchant', right_index = True)
    
    # ******* DEFINE ACTIVE merchants as of the snapshot date as
    df_active = df[cols][df.thisdistance <= 28].drop_duplicates()
    #print(df_active.shape[0])
    # ****** Similarly, find the 'non-active' merchants (within the df_active) as of the next snapshot date and mark it as 1 (churned)
    df2 = df_all[df_all.TransDate <= snapshotdate[i+1]] 
    df_temp2 = df2.groupby('merchant').agg({'TransDate':np.max, 'delta_t':np.max})
    df_temp2.rename(columns = {'TransDate':'thislastdate','delta_t':'forecast_window_max_delta_t'}, inplace = True)
    df_temp2['forecast_window_D'] = (snapshotdate[i+1]-df_temp2.thislastdate).dt.total_seconds()/(24*60*60)
    df_temp2['Churn'] = df_temp2.forecast_window_D > 28
    
    dfchurn = pd.DataFrame(df_temp2['Churn'])
    #print(dfchurn.shape[0])
    df_active = pd.merge(df_active, dfchurn, how = 'left', left_on = 'merchant', right_index = True)
    churnrate.append(df_active[df_active.Churn == 1].shape[0]/df_active.shape[0])
    minthisdistance.append(np.min(df_active.thisdistance))
    
    dataset = pd.concat([dataset,df_active])
    
dataset['Churn'] = dataset['Churn'].apply(lambda x: 1 if x == True else 0)
dataset['max_to_horizon'] = dataset.this_max_delta_t/dataset.TtoSnapshot
print(dataset[dataset.Churn == 1].shape[0]/dataset.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.26949286277885376


In [16]:
churnrate

[0.34464751958224543,
 0.4094955489614243,
 0.346448087431694,
 0.35092348284960423,
 0.34268833087149186,
 0.32877563704164076,
 0.3086763070077864,
 0.3111111111111111,
 0.29365079365079366,
 0.3043127771060056,
 0.27808471454880296,
 0.30411328388401887,
 0.2380038387715931,
 0.275521714608009,
 0.2596306068601583,
 0.2620897018291155,
 0.2611764705882353,
 0.257800398318212,
 0.23912106850495476,
 0.24612598108271283,
 0.24050632911392406,
 0.2418478260869565,
 0.250042756969386]

In [17]:
dftomerge = pd.DataFrame(dfmerchant['Oneday'])
dataset = pd.merge(dataset, dftomerge, how = 'left', left_on = 'merchant', right_index = True)
print(dataset.shape[0])
dataset.head()

69705


Unnamed: 0,merchant,thissnapshot,thislastdate,thisdistance,this_max_delta_t,TtoSnapshot,avg_N_per_Pday,total_N_to_date,avg_N_per_Cday,lg_min,lg_max,lg_mean,Churn,max_to_horizon,Oneday
4943,02da898d61,2033-01-31,2033-01-21,10.0,19.0,29.0,1,2,0.068966,9.441293,9.588297,9.514795,1,0.655172,0
5260,0325fa16d1,2033-01-31,2033-01-08,23.0,0.0,23.0,1,1,0.043478,8.927049,8.927049,8.927049,1,0.0,0
5794,039fb73850,2033-01-31,2033-01-31,0.0,7.0,14.0,4,9,0.642857,9.135509,11.35612,10.654439,0,0.5,0
7592,045a1c428b,2033-01-31,2033-01-26,5.0,1.0,12.0,8,25,2.083333,10.494436,12.674049,11.450175,0,0.083333,0
8664,05368eca16,2033-01-31,2033-01-07,24.0,1.0,25.0,1,2,0.08,12.062404,12.062404,12.062404,1,0.04,0


In [18]:
allfeatures = ['thisdistance','this_max_delta_t', 'TtoSnapshot', 'avg_N_per_Pday', 'total_N_to_date', 'lg_min', 'lg_max', 'lg_mean','Oneday']

remainingfeatures = allfeatures
dfinteract = pd.DataFrame()
for i in allfeatures:
    remainingfeatures = [x for x in remainingfeatures if x != i]
    for j in remainingfeatures:
        dfinteract[i+'_'+j] = dataset[i]*dataset[j]
        
dataset = pd.merge(dataset, dfinteract, how = 'outer', left_index = True, right_index = True)

In [19]:
dataset.head()

Unnamed: 0,merchant,thissnapshot,thislastdate,thisdistance,this_max_delta_t,TtoSnapshot,avg_N_per_Pday,total_N_to_date,avg_N_per_Cday,lg_min,...,total_N_to_date_lg_min,total_N_to_date_lg_max,total_N_to_date_lg_mean,total_N_to_date_Oneday,lg_min_lg_max,lg_min_lg_mean,lg_min_Oneday,lg_max_lg_mean,lg_max_Oneday,lg_mean_Oneday
0,0002b63b92,2033-05-31,2033-05-16,15.0,0.0,15.0,1,1,0.066667,8.125335,...,8.12534,8.12534,8.12534,1,66.02107,66.02107,8.125335,66.02107,8.125335,8.125335
1,0002d07bba,2034-10-31,2034-10-17,14.0,6.0,20.0,1,2,0.1,8.621373,...,17.2427,21.0817,19.1622,0,90.876666,82.602369,0.0,100.993174,0.0,0.0
1,0002d07bba,2034-10-31,2034-10-17,14.0,6.0,20.0,1,2,0.1,8.621373,...,25.8641,32.065,29.8506,0,92.148231,85.784323,0.0,106.351118,0.0,0.0
1,0002d07bba,2034-11-30,2034-11-13,17.0,27.0,50.0,1,3,0.06,8.621373,...,17.2427,21.0817,19.1622,0,90.876666,82.602369,0.0,100.993174,0.0,0.0
1,0002d07bba,2034-11-30,2034-11-13,17.0,27.0,50.0,1,3,0.06,8.621373,...,25.8641,32.065,29.8506,0,92.148231,85.784323,0.0,106.351118,0.0,0.0


In [2]:
dataset1 = dataset.copy()
#'thisdistance','this_max_delta_t', 'TtoSnapshot', 
#'avg_N_per_Pday', 'total_N_to_date','avg_N_per_Cday', 
#'lg_min', 'lg_max', 'lg_mean','Oneday'

features_selected = ['thisdistance','this_max_delta_t', 'TtoSnapshot','lg_min', 'Oneday']
X1 = dataset1[features_selected]
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index=X1.columns)
print(vif)

# interact_selected = list()
# remaining = features_selected
# for i in features_selected:
#     remaining = [x for x in remaining if x != i]
#     #print(remaining)
#     for j in remaining:
#         interact_selected.append(i+'_'+j)

features = features_selected
print(features)

NameError: name 'dataset' is not defined

In [21]:
X = dataset1[features]
y = dataset1['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
#*************** classifiers
from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression().fit(X_train_scaled, y_train)

from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

clf.fit(X_train_scaled, y_train)
y_predicted = clf.predict(X_test_scaled)

In [1]:
#*************** evaluations
from sklearn.model_selection import cross_val_score
print(cross_val_score(clf, X_train_scaled, y_train, cv = 5, scoring = 'precision'))
print(cross_val_score(clf, X_train_scaled, y_train, cv = 5, scoring = 'recall'))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted))

In [33]:
print(accuracy_score(y_test, y_predicted))
print(precision_score(y_test, y_predicted)) 
print(recall_score(y_test, y_predicted))
print(f1_score(y_test, y_predicted))

0.9771355214766497
0.9514587420176134
0.8909211709071012
0.9201953739958825


In [34]:
df = df_all[df_all.TransDate <= dt.date(2034,12,31)] 
df['N_of_Date2'] = df.N_of_Date
df_temp = df.groupby('merchant').agg({'TransDate':np.max, 'delta_t':np.max, 'N_of_Date':np.max, 'N_of_Date2':np.sum}) 
df_temp.rename(columns = {'TransDate':'thislastdate','delta_t':'this_max_delta_t',
                              'N_of_Date':'avg_N_per_Pday','N_of_Date2':'total_N_to_date'}, inplace = True)
df_temp['thissnapshot'] = dt.date(2034,12,31)
df_temp['thisdistance'] = (df_temp.thissnapshot - df_temp.thislastdate).dt.total_seconds()/(24*60*60)
df = pd.merge(df, df_temp, how = 'outer', left_on = 'merchant', right_index = True)
df['TtoSnapshot'] = (df.thissnapshot - df.initialdate).dt.total_seconds()/(24*60*60)
df['avg_N_per_Cday'] = df.total_N_to_date/df.TtoSnapshot
    
df_temp1 = df.groupby('merchant')['lg_amount'].agg([np.min, np.max, np.mean])
df_temp1.rename(columns = {'amin':'lg_min','amax':'lg_max','mean':'lg_mean'}, inplace = True)
df = pd.merge(df, df_temp1, how = 'outer', left_on = 'merchant', right_index = True)
df_active = df[cols][df.thisdistance <= 28].drop_duplicates()
df_active.head()

Unnamed: 0,merchant,thissnapshot,thislastdate,thisdistance,this_max_delta_t,TtoSnapshot,avg_N_per_Pday,total_N_to_date,avg_N_per_Cday,lg_min,lg_max,lg_mean
1,0002d07bba,2034-12-31,2034-12-15,16.0,32.0,81.0,1,4,0.049383,7.629004,10.688348,9.369896
9,000ed1585f,2034-12-31,2034-12-06,25.0,97.0,681.0,2,62,0.091043,9.596962,10.946429,10.206914
67,000f8c3297,2034-12-31,2034-12-30,1.0,14.0,458.0,7,290,0.633188,8.16166,11.403144,9.697874
226,002a66c0e1,2034-12-31,2034-12-28,3.0,20.0,353.0,3,102,0.288952,10.13856,11.651278,10.690376
307,003d24c5d3,2034-12-31,2034-12-09,22.0,70.0,162.0,4,8,0.049383,8.557375,10.381242,9.297282


In [35]:
dftomerge = pd.DataFrame(dfmerchant['Oneday'])
df_active = pd.merge(df_active, dftomerge, how = 'left', left_on = 'merchant', right_index = True)
df_active.set_index('merchant', inplace = True)

In [36]:
current_active = df_active[features]
current_X_scaled = scaler.transform(current_active)
current_y = clf.predict(current_X_scaled)

current_active['Label'] = current_y
current_active.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,thisdistance,this_max_delta_t,TtoSnapshot,lg_min,Oneday,Label
merchant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0002d07bba,16.0,32.0,81.0,7.629004,0,0
000ed1585f,25.0,97.0,681.0,9.596962,0,0
000f8c3297,1.0,14.0,458.0,8.16166,0,0
002a66c0e1,3.0,20.0,353.0,10.13856,0,0
003d24c5d3,22.0,70.0,162.0,8.557375,0,1


In [37]:
print('current churn rate = ')
print(np.sum(current_active.Label)/current_active.shape[0])

print('# of current active users = ')
print(current_active.shape[0])

print('# of predicted churn = ')
print(np.sum(current_active.Label))

current churn rate = 
0.25339673913043476
# of current active users = 
5888
# of predicted churn = 
1492


In [146]:
dataset.shape

(691725, 51)

In [39]:
current_active.to_csv("Churn_prediction.csv")

In [None]:
from sklearn.model_selection import cross_val_score
for i in [3,5,7,9]:
    clf = KNeighborsClassifier(n_neighbors=i)
    print(cross_val_score(clf, X_train_scaled, y_train, cv = 5, scoring = 'precision'))
    print(cross_val_score(clf, X_train_scaled, y_train, cv = 5, scoring = 'recall'))

is deprecated and will be removed in a future version
  


Unnamed: 0_level_0,Oneday,Onetime,N,FirstTransDate,LastTransDate,Horizon
merchant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0002b63b92,1,1,1,2033-05-16,2033-05-16,0.0
0002d07bba,0,0,4,2034-10-11,2034-12-15,65.0
00057d4302,0,0,28,2033-05-30,2033-08-04,66.0
000bcff341,1,1,1,2033-08-09,2033-08-09,0.0
000ddbf0ca,1,1,1,2033-06-02,2033-06-02,0.0


In [None]:
# See last payment date distribution of these merchants
#plt.figure()
#plt.hist(dfhorizon.LastTransDate,bins = 48)                                                                                                                                                  

In [1171]:
#******************* Q1 and Q2
# identify merchants with last payment date of more than 2 months ago = churned
dfmerchant['churned'] = dfmerchant['LastTransDate'].apply(lambda x: 1 if x < dt.date(2034,11,1) else 0)
print(np.sum(dfmerchant.churned)/dfmerchant.shape[0])

0.4755069333147516


In [1181]:
#********** find total and average transaction amount for each transation date for each merchant
dfmerchantdate = data.groupby(['merchant','TransDate'])['amount_usd_in_cents'].agg({'TotaloftheDate':np.sum, 'AvgoftheDate':np.mean})
dfmerchantdate = pd.merge(dfmerchantdate, Noftheday, how = 'inner', left_index = True, right_index = True)
dfmerchantdate.reset_index(inplace = True)

#********* for each merchant, find # of days between any two consecutive transaction dates
dfmerchantdate['delta_t'] = dfmerchantdate.groupby('merchant')['TransDate'].diff()
df = dfmerchantdate[['merchant','TransDate','delta_t']].copy()
df.dropna(inplace = True)
df['delta_t'] = [i.total_seconds()/(24 * 60 * 60) for i in df['delta_t']]
dfmerchantdate.drop('delta_t', axis = 1, inplace = True)
dfmerchantdate = pd.merge(dfmerchantdate, df, how = 'outer', left_on = ['merchant', 'TransDate'], right_on = ['merchant', 'TransDate'])
dfmerchantdate.delta_t[pd.isnull(dfmerchantdate.delta_t) == True] = 0

is deprecated and will be removed in a future version
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [1182]:
dfmerchantdate = pd.merge(dfmerchantdate, dfmerchant, how = 'outer', left_on = 'merchant', right_index = True)

Unnamed: 0,merchant,TransDate,TotaloftheDate,AvgoftheDate,Noftheday,delta_t,Oneday,Onetime,N,FirstTransDate,LastTransDate,Horizon,churned
0,0002b63b92,2033-05-16,3379,3379.0,1,0.0,1,1,1,2033-05-16,2033-05-16,0.0,1
1,0002d07bba,2034-10-11,5549,5549.0,1,0.0,0,0,4,2034-10-11,2034-12-15,65.0,0
2,0002d07bba,2034-10-17,37830,37830.0,1,6.0,0,0,4,2034-10-11,2034-12-15,65.0,0
3,0002d07bba,2034-11-13,43842,43842.0,1,27.0,0,0,4,2034-10-11,2034-12-15,65.0,0
4,0002d07bba,2034-12-15,2057,2057.0,1,32.0,0,0,4,2034-10-11,2034-12-15,65.0,0


In [1186]:
df1 = dfmerchantdate.groupby('merchant')['delta_t'].agg({'avg_delta_t':np.mean, 'max_delta_t':np.max})
dfmerchant = pd.merge(df1, dfmerchant, how = 'inner', left_index = True, right_index = True)

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [1231]:
dfmerchantMoredays = dfmerchant[dfmerchant.Oneday != 1]
dfmerchantOneday = dfmerchant[dfmerchant.Oneday == 1]
print(dfmerchantOneday.shape[0]/dfmerchant.shape[0])

0.1376210717023204


In [1532]:
#plt.figure()
#plt.hist(dfmerchantMoredays.LastTransDate, bins = 48)

In [1533]:
#plt.figure()
#plt.hist(dfmerchant.max_delta_t[dfmerchant.churned == 0], bins = 50, label = 'max_delta_t of the not churned', color = 'black')
#plt.hist(dfmerchant.max_delta_t[dfmerchant.churned == 1], bins = 50, label = 'max_delta_t of the churned')
#plt.legend()