In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Printing config:
pd.options.display.float_format = '{:,.4f}'.format

# Load the data
dataFilePath = "./usage_records.csv"
data_df = pd.read_csv(dataFilePath)

# Convert relevant columns to datetime type:
date_cols = ['subscription_date', 'redshifted_at']
for col in date_cols:
    data_df.loc[:, col] = pd.to_datetime(data_df.loc[:, col])

In [2]:
# # # # DADA ANALYSIS # # # #

print("\nData head:\n" + str(data_df.head()))
print("\nData shape: " + str(data_df.shape))  

# Checking cols data type and existence of missing values 
print(("\nData info: \n%s") % data_df.info())  
print(("\nMissing values: \n%s") % data_df.isnull().sum( ))  # no missing values

# Check data validity:
groupby_cols = ['id','renewed']
print("\nValidatiov: each 'id' has only one label - %s" % 
      (len(data_df.groupby(groupby_cols).size()) == len(data_df.id.unique())))
groupby_cols = ['id','subscription_date']
print("Validatiov: each 'id' has only one subscription_date - %s" % 
      (len(data_df.groupby(groupby_cols).size()) == len(data_df.id.unique())))
print("Validatiov: all 'redshifted_at' values are later (or equal to) 'subscription_date' - %s" % 
      (sum(data_df.subscription_date <= data_df.redshifted_at) == data_df.shape[0]))
      
# Basic statistics:
num_of_users = len(data_df.id.unique())
num_of_churned_users = len(data_df[data_df['renewed'] == False].id.unique())
print(("\nThere are %d churned users out of %d users (%.4f percent)") % 
      (num_of_churned_users,
       num_of_users,
       100 * num_of_churned_users / num_of_users))
print(("\nPopular features normalized frequency: \n%s") % 
      data_df.feature_name.value_counts(normalize=True).head())

# Analysis per label:
# Number of actions:
groupby_cols = ['renewed', 'id']
label_id_counts_df = data_df.groupby(groupby_cols).size().reset_index(name='counts')
print("\n\nActivity stats of churned users: \n%s" % 
label_id_counts_df[label_id_counts_df['renewed'] == False].counts.describe())
print("\nActivity stats of renewed users: \n%s" % 
label_id_counts_df[label_id_counts_df['renewed'] == True].counts.describe())
# Number of features being used:
unique_fe_by_label_id_df = data_df.groupby(groupby_cols).agg({"feature_name": lambda x: x.nunique()})
unique_fe_by_label_id_df.reset_index(inplace=True)
print("\n\nAmount of used features stats of churned users: \n%s" % 
      unique_fe_by_label_id_df[unique_fe_by_label_id_df['renewed'] == False].feature_name.describe())
print("\nAmount of used features stats of renewed users: \n%s" % 
      unique_fe_by_label_id_df[unique_fe_by_label_id_df['renewed'] == True].feature_name.describe())
# Min / Max 'subscription_date':
print("\n\nChurned users subscription_date: min - %s, max - %s" % 
      (data_df[data_df['renewed'] == False].subscription_date.min(),
       data_df[data_df['renewed'] == False].subscription_date.max()))
print("Renewed users subscription_date: min - %s, max - %s" % 
      (data_df[data_df['renewed'] == True].subscription_date.min(),
       data_df[data_df['renewed'] == True].subscription_date.max()))
# Min / Max 'redshifted_at':
print("\nChurned users redshifted_at: min - %s, max - %s" % 
      (data_df[data_df['renewed'] == False].redshifted_at.min(),
       data_df[data_df['renewed'] == False].redshifted_at.max()))
print("Renewed users redshifted_at: min - %s, max - %s" % 
      (data_df[data_df['renewed'] == True].redshifted_at.min(),
       data_df[data_df['renewed'] == True].redshifted_at.max()))


Data head:
     id  renewed   subscription_date  feature_name       redshifted_at
0  5147    False 2017-10-13 17:12:41            20 2017-10-21 13:05:45
1  7423    False 2017-10-08 19:18:56            25 2017-10-08 19:52:00
2  3491     True 2017-10-10 15:08:54            18 2017-10-11 16:47:53
3  9891     True 2017-10-15 01:12:10             3 2017-10-15 01:22:11
4  6378     True 2017-10-23 00:31:45            35 2017-10-28 16:57:53

Data shape: (1162830, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162830 entries, 0 to 1162829
Data columns (total 5 columns):
id                   1162830 non-null int64
renewed              1162830 non-null bool
subscription_date    1162830 non-null datetime64[ns]
feature_name         1162830 non-null int64
redshifted_at        1162830 non-null datetime64[ns]
dtypes: bool(1), datetime64[ns](2), int64(2)
memory usage: 36.6 MB

Data info: 
None

Missing values: 
id                   0
renewed              0
subscription_date    0
feature_name   

In [6]:
# Label extraction - 'churned' (!renewed):
label = 'churned'
data_df[label] = (data_df['renewed'] == False)

# FEATURE EXTRACTION
# Compute days since subsrciption (redshifted_at - subscription_date):
data_df['days_since_sub'] = (data_df['redshifted_at'] - data_df['subscription_date']).dt.ceil("D").dt.days
# Compute weeks since subscription (w1:0-7, w2:8-14, w3:15-21, w4:22-28)
data_df['weeks_since_sub'] = pd.to_numeric(np.ceil(data_df['days_since_sub'] / 7), downcast='signed')
data_df.loc[data_df.loc[:, 'weeks_since_sub'] == 0, 'weeks_since_sub'] = 1
data_df.head()

Unnamed: 0,id,renewed,subscription_date,feature_name,redshifted_at,churned,days_since_sub,weeks_since_sub
0,5147,False,2017-10-13 17:12:41,20,2017-10-21 13:05:45,True,8,2
1,7423,False,2017-10-08 19:18:56,25,2017-10-08 19:52:00,True,1,1
2,3491,True,2017-10-10 15:08:54,18,2017-10-11 16:47:53,False,2,1
3,9891,True,2017-10-15 01:12:10,3,2017-10-15 01:22:11,False,1,1
4,6378,True,2017-10-23 00:31:45,35,2017-10-28 16:57:53,False,6,1


In [7]:
# Generate features - combinations of feature_name & weeks from subscription 
# (e.g. w2_f1 represents the usage of feature_name 1 during week 2)
groupby_cols = ['id', label, 'feature_name', 'weeks_since_sub'] 
user_fe_act_per_week_df = data_df.groupby(groupby_cols).size().unstack(['weeks_since_sub', 'feature_name'])
new_col_names = ['w' + str(a) + '_f' + str(b)  for a,b in user_fe_act_per_week_df.columns.get_values()] 
user_fe_act_per_week_df.columns = new_col_names
cols = user_fe_act_per_week_df.columns
for i in range(1,5):
    w_fe_list = cols[np.where([x.startswith('w'+str(i)) for x in cols])[0]] # finds the features of the relevant week
    #DEBUG: print(w_fe_list); print(user_fe_act_per_week_df[w_fe_list].sum(axis=1).head())
    user_fe_act_per_week_df['sum_w'+str(i)] = user_fe_act_per_week_df[w_fe_list].sum(axis=1)
    #user_fe_act_per_week_df[w_fe_list] = user_fe_act_per_week_df[w_fe_list].div(user_fe_act_per_week_df['sum_w'+str(i)], axis=0)
    
    w_fe_len = len(w_fe_list)
    user_fe_act_per_week_df['uniq_fe_w'+str(i)] = w_fe_len - user_fe_act_per_week_df[w_fe_list].isnull().sum(axis=1)
cols = user_fe_act_per_week_df.columns
drop_cols = cols[np.where([x.startswith('sum_w') for x in cols])[0]]
user_fe_act_per_week_df.drop(columns=drop_cols, inplace=True)
user_fe_act_per_week_df.reset_index(label, inplace=True)

user_fe_act_per_week_df.head()

Unnamed: 0_level_0,churned,w1_f3,w1_f15,w1_f16,w1_f31,w2_f31,w1_f32,w2_f32,w1_f6,w1_f7,...,w4_f40,w4_f56,w3_f65,w4_f65,w3_f59,w2_f22,uniq_fe_w1,uniq_fe_w2,uniq_fe_w3,uniq_fe_w4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,1.0,3.0,1.0,1.0,1.0,5.0,5.0,,,...,,,,,,,5,2,0,0
2,False,2.0,,,6.0,,4.0,,3.0,5.0,...,,,,,,,11,4,0,0
3,True,1.0,4.0,1.0,2.0,,5.0,,1.0,2.0,...,,,,,,,20,0,0,0
4,False,,,20.0,,,,,,,...,,,,,,,12,4,1,1
5,True,4.0,2.0,1.0,5.0,,2.0,,5.0,5.0,...,,,,,,,29,0,8,0


In [8]:
# PREPROCESSING:

def generateMCFeatures(df, fe_cols, last_state=2):
    # # # # # # # # # #
    # Discretization of the of the related feature_name usage per week to the Markov Chain states:
    # (0, 1, 2, ..., last_state), where last_state represents all values >= last_state. 
    # Note that the first state must starts at 0.
    # Args:
    #       df - (pandas dataframe), feature_name usage per week
    #       fe_cols - (list of strings), df column names for discretization 
    #                 Note - unlisted col names won't be returned
    #       last_state - (int), the last state name, which represents all values >= last_state
    # Return:
    #       pp_df - (pandas dataframe) preprocessed data
    # # # # # # # # # #
    
    pp_df = df[fe_cols].fillna(0)

    def usg_discretization(n):
        last_ctg = 2
        out = n

        if out > last_ctg: 
            out = last_ctg
        return int(out)


    for f in fe_cols:
        pp_df[f] = pp_df[f].apply(usg_discretization)
        
    return pp_df

In [9]:
feature_name_list = [15, 31, 3, 29] # Most popular feature_names tha also had the largest diff between churned / renewd usage mean
ctg_fe = ['w1', 'w2', 'w3', 'w4']
fe_cols = list()
for i in feature_name_list:
    fe_cols += list(pd.Series(ctg_fe).apply(lambda x: x + '_f' + str(i)))
# DEBUG: print(fe_cols)
df = generateMCFeatures(user_fe_act_per_week_df, fe_cols=fe_cols).join(data_df[['id', label]].drop_duplicates().set_index('id'))
# DEBUG: user_fe_act_per_week_df.iloc[0:5, :][fe_cols]
df.head()

Unnamed: 0_level_0,w1_f15,w2_f15,w3_f15,w4_f15,w1_f31,w2_f31,w3_f31,w4_f31,w1_f3,w2_f3,w3_f3,w4_f3,w1_f29,w2_f29,w3_f29,w4_f29,churned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,False
2,0,1,0,0,2,0,0,0,2,0,0,0,2,2,0,0,False
3,2,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,True
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False
5,2,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,True


In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=label) # design matrix
y = df[label].astype(int)  # label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_df = X_train.join(y_train)

In [11]:
# # # # # HELPER FUNCTIONS FOR TRAINING # # # # # 

def calcFirstStateProbs(df, w1_col):
    # # # # # # # # # #
    # Estimates the probability to be in each state during the first week (w1) based on observations. 
    # Args:
    #       df - (pandas dataframe), MC state per week
    #       w1_col - (string), the column name that represents the state in the first week
    # Return:
    #       (list of floats) Estimated probabilities
    # # # # # # # # # #
    
    return list(df.loc[:, w1_col].value_counts(normalize=True))


def calcTranMatrix(df, fe_cols):
    # # # # # # # # # #
    # Estimates Makov Chain transition probabilities based on observations. Note that the first state must start at 0. 
    # The state names must be sequential (0,1,2,...,last_state), where last_state represents all values >= last_state
    # Args:
    #       df - (pandas dataframe), MC state per week
    #       fe_cols - (list of strings), df MC column names  
    # Return:
    #       M - (list of lists) Estimated transition matrix
    # # # # # # # # # #
    
    n_states = 0
    for f in fe_cols:
        n_states = max(n_states, len(df[f].unique()))
    # DEBUG: print(n_states)
    
    M = [[0]*n_states for _ in range(n_states)]

    for idx, row in df.loc[:, fe_cols].iterrows(): 
        for (i,j) in zip(row, row[1:]):
            M[i][j] += 1
            #print("\nDEBUG: i = %d, j = %d" % (i,j))
            #for row in M: print(' '.join('{0:.2f}'.format(x) for x in row))

    #now convert to probabilities:
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    for row in M: print(' '.join('{0:.2f}'.format(x) for x in row))
    return M


def calcPopulationProbs(df, label_col, pop_lbl, feature_name):
    # # # # # # # # # #
    # Estimates the first state probabilities and trainsition matrix of a population (churned / renewed),
    # by callyng calcFirstStateProbs() and calcTranMatrix().
    # Args:
    #       df - (pandas dataframe), MC state per week and label
    #       label_col - (string), df label column name
    #       pop_lbl - (bool-True/False), the required population (churned / renewed)
    #       feature_name - (int), the required feture_name 
    # Return:
    #       (first_p, tran_p) - (tuple of lists) Estimated probabilities
    # # # # # # # # # #
    
    ctg_fe = ['w1', 'w2', 'w3', 'w4']
    fe_cols = pd.Series(ctg_fe).apply(lambda x: x + '_f' + str(feature_name))
    pop_df = df.loc[df[label_col] == pop_lbl, :]
    
    first_p = calcFirstStateProbs(df=pop_df, w1_col=fe_cols[0])
    tran_p = calcTranMatrix(df=pop_df, fe_cols=fe_cols)
    
    # DEBUG: print("\n"); 
    print(first_p)
    return (first_p, tran_p)

In [12]:
# Estimate the relevant MC probabilities (a MC process per feature_name)

# feature_name = 15
feature_name = feature_name_list[0]
print("Estimating first state probs and transition matrix for churned population of feature_name %d" % feature_name)
(c_first_state_p_1, c_tran_mat_1) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=True, feature_name=feature_name)
print("\nEstimating first state probs and transition matrix for renewed population of feature_name %d" % feature_name)
(r_first_state_p_1, r_tran_mat_1) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=False, feature_name=feature_name)

# feature_name = 31
feature_name = feature_name_list[1]
print("\n\nEstimating first state probs and transition matrix for churned population of feature_name %d" % feature_name)
(c_first_state_p_2, c_tran_mat_2) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=True, feature_name=feature_name)
print("\nEstimating first state probs and transition matrix for renewed population of feature_name %d" % feature_name)
(r_first_state_p_2, r_tran_mat_2) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=False, feature_name=feature_name)

# feature_name = 3
feature_name = feature_name_list[2]
print("\n\nEstimating first state probs and transition matrix for churned population of feature_name %d" % feature_name)
(c_first_state_p_3, c_tran_mat_3) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=True, feature_name=feature_name)
print("\nEstimating first state probs and transition matrix for renewed population of feature_name %d" % feature_name)
(r_first_state_p_3, r_tran_mat_3) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=False, feature_name=feature_name)

# feature_name = 29
feature_name = feature_name_list[3]
print("\n\nEstimating first state probs and transition matrix for churned population of feature_name %d" % feature_name)
(c_first_state_p_4, c_tran_mat_4) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=True, feature_name=feature_name)
print("\nEstimating first state probs and transition matrix for renewed population of feature_name %d" % feature_name)
(r_first_state_p_4, r_tran_mat_4) = calcPopulationProbs(df=train_df, label_col=label, pop_lbl=False, feature_name=feature_name)

c_first_state_p_list = [c_first_state_p_1, c_first_state_p_2, c_first_state_p_3, c_first_state_p_4]
c_tran_mat_list = [c_tran_mat_1, c_tran_mat_2, c_tran_mat_3, c_tran_mat_4]

r_first_state_p_list = [r_first_state_p_1, r_first_state_p_2, r_first_state_p_3, r_first_state_p_4]
r_tran_mat_list = [r_tran_mat_1, r_tran_mat_2, r_tran_mat_3, r_tran_mat_4]

Estimating first state probs and transition matrix for churned population of feature_name 15
0.88 0.05 0.07
0.75 0.09 0.16
0.53 0.10 0.37
[0.5033975084937712, 0.34428086070215175, 0.152321630804077]

Estimating first state probs and transition matrix for renewed population of feature_name 15
0.85 0.06 0.10
0.69 0.11 0.20
0.44 0.10 0.46
[0.5302340191613004, 0.3313962619758128, 0.13836971886288676]


Estimating first state probs and transition matrix for churned population of feature_name 31
0.88 0.06 0.06
0.74 0.13 0.13
0.58 0.13 0.30
[0.4881087202718007, 0.3029445073612684, 0.2089467723669309]

Estimating first state probs and transition matrix for renewed population of feature_name 31
0.83 0.08 0.09
0.70 0.13 0.18
0.47 0.14 0.39
[0.5305481388408984, 0.2797235746819538, 0.1897282864771478]


Estimating first state probs and transition matrix for churned population of feature_name 3
0.90 0.05 0.05
0.78 0.10 0.12
0.65 0.11 0.24
[0.48187995469988676, 0.3187995469988675, 0.1993204983012457

In [14]:
# # # # # HELPER FUNCTIONS FOR PREDICTION # # # # # 

def calcObsProb(obs, tran_mat, first_state_p):
    # # # # # # # # # #
    # Computes the probability of an observation based on MC transition matrix and state probabilities of first week
    # (simply multiplying the relevant probablities)
    # Args:
    #       obs - (pandas Series), MC ordered state per week (w1, w2, ...)
    #       tran_mat - (list of lists), MC transition matrix
    #       first_state_p - (list of floats), the state probabilities of the first week  
    # Return:
    #       (first_p, tran_p) - (tuple of lists) Estimated probabilities
    # # # # # # # # # #
    
    p = first_state_p[obs[0]]
    
    for (i,j) in zip(obs, obs[1:]):
        p = p * tran_mat[i][j] 
    
    return p

In [15]:
#train_probs_df = pd.DataFrame()
test_probs_df = pd.DataFrame()

for i in range(0, len(feature_name_list)):
    # Compute the probability of the observations given a churned population model:
    #train_probs_df['c_p_'+str(feature_name_list[i])] = X_train.apply(lambda x: calcObsProb(x, c_tran_mat_list[i], c_first_state_p_list[i]), axis=1)
    test_probs_df['c_p_'+str(feature_name_list[i])] = X_test.apply(lambda x: calcObsProb(x, c_tran_mat_list[i], c_first_state_p_list[i]), axis=1)
    # Compute the probability of the observations given a renewed population model:
    #train_probs_df['r_p_'+str(feature_name_list[i])] = X_train.apply(lambda x: calcObsProb(x, r_tran_mat_list[i], r_first_state_p_list[i]), axis=1)
    test_probs_df['r_p_'+str(feature_name_list[i])] = X_test.apply(lambda x: calcObsProb(x, r_tran_mat_list[i], r_first_state_p_list[i]), axis=1)

test_probs_df.describe()

Unnamed: 0,c_p_15,r_p_15,c_p_31,r_p_31,c_p_3,r_p_3,c_p_29,r_p_29
count,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0
mean,0.0037,0.0021,0.0038,0.0017,0.0048,0.0029,0.0064,0.0035
std,0.0146,0.0085,0.0152,0.0068,0.0191,0.0113,0.0261,0.0137
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
max,0.0731,0.043,0.0767,0.0347,0.096,0.0571,0.1307,0.0676


In [17]:
# Compute the diff ratio between the probabilities base on churned and renewed MC processes:
for f in feature_name_list:
    cp = 'c_p_'+str(f)
    rp = 'r_p_'+str(f)
    #train_probs_df['diff_ratio_'+str(f)] = (train_probs_df[cp] - train_probs_df[rp]) / train_probs_df[cp]
    test_probs_df['diff_ratio_'+str(f)] = (test_probs_df[cp] - test_probs_df[rp]) / test_probs_df[cp]

#cols = train_probs_df.columns
#train_fe_list = cols[np.where([x.startswith('diff_ratio') for x in cols])[0]] # finds the relevant column names
cols = test_probs_df.columns
test_fe_list = cols[np.where([x.startswith('diff_ratio') for x in cols])[0]] # finds the relevant column names
# DEBUG: print(train_fe_list); print(test_fe_list)
test_probs_df[test_fe_list].head()

Unnamed: 0_level_0,diff_ratio_15,diff_ratio_31,diff_ratio_3,diff_ratio_29
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8566,0.0093,-0.2328,-0.2604,-0.8733
6099,-0.7525,-2.3472,-0.9226,-5.0746
4339,-0.0731,-1.3148,-0.4723,-0.682
2233,-0.2499,-0.6602,-0.4484,-0.4085
9799,-0.1161,-0.7998,-0.7031,-0.572


In [18]:
def evaluation(y_test, y_pred):
    # # # # # # # # # #
    # Evaluate classifier results - conf. matrix, precision, recall, f1
    # Args:
    #       y_test - (pandas series), the test labels
    #       y_pred - (pandas series), the test predictions
    # Return:
    #       Nothing
    # # # # # # # # # #
    #from sklearn import metrics

    print(metrics.classification_report(y_test, y_pred))
    print("\nConfusion matrix:\n%s" % pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['True'], margins=True))#metrics.confusion_matrix(y_test, y_pred))

    return

In [19]:
# Classification model based on the diff ratio of feature_name 15:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_test, test_probs_df.diff_ratio_15)
print("AUC: %f\n" % metrics.auc(fpr, tpr))

diff_threshold = 0 # It is better to learn this threshold as a mean of cross validation
y_pred = (test_probs_df.diff_ratio_15 > diff_threshold).astype(int)
evaluation(y_test, y_pred) # It is better to evaluate our model based on mean scores (precision, recall, etc.) of cross valdation

AUC: 0.581964

             precision    recall  f1-score   support

          0       0.84      0.55      0.66      1645
          1       0.23      0.57      0.33       389

avg / total       0.73      0.55      0.60      2034


Confusion matrix:
True          0    1   All
Predicted                 
0           901  166  1067
1           744  223   967
All        1645  389  2034


In [None]:
'''
Other things I tried but worked even worse:
SVM, logistic regression, decision tree & random forest applied on a design matrix including features, like:
- (log & time decayed) feature_name usage
- usage percentage of feature_name per week since subscription 
- subscription week
- number of unique features used per week
* I used MinMax scaling before running SVM
* The usage hours / day part were not used because from a histogram I made it looked like server time (not local time)

To understand better the design matrix data I used PCA (as a visualization tool). 
It did not find a very 'efficient' projection - the sum (variance) of the first 3 components was about 0.4 
and had no good separation between churned / renewed population.

Then I wanted to check if the usage per week depends on the usage of the previous week, 
so I chose the most popular feature_name that had the greatest usage mean difference between 
churned & renewed populations. For each one of these feature_names the markov chain probabilities were estimated
and were used to compute the probability of each observation given a MC process (churned / renewed). 
Then the difference ratio between these probabilities were used as our design matrix. 
I tried applying logistic regression and random forest to this data but got bad results. 
'''