# Loading and Processing the 30.41GB user_logs.csv File

As the file's extraordinary size requires special measures to load, we created an notebook solely dedicated to that.

In [2]:
# math library
import numpy as np
import pandas as pd
import seaborn as sns
import random
import datetime
from datetime import datetime as dt
from collections import Counter
from scipy.stats.stats import pearsonr # for pearson correlation

# visualization library
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png2x','pdf')
import matplotlib.pyplot as plt
#import mpld3
from pandas.plotting import scatter_matrix

# machine learning library
from sklearn.linear_model import LogisticRegression

# 3d visualization
from mpl_toolkits.mplot3d import axes3d

# computational time
import time

## 1. Functions

Below is a set of functions that we use to create additional features in the user_logs.csv and user_logs_v2.csv file. 
- grouping(dataframe): This function has two objectives. First, it will group the file by user msno's. This means, that, instead there being one row per listening session, we will have one row per user. The remaining features will be grouped by sum. Second, this function adds a new features called 'entries', which gives the amount of entries in the file for each user.
- monthly_entries(dataframe,version): Here we create additional features that give us the amount of entries in the months Mar-17, Feb-17, Jan-17, Dec-16 and Nov-16. This allows us to to quantify changes in user activity in the months prior to month where we should predict churn.
- monthly_secs(dataframe,version): Similar to the function above, we will sum total_secs for the months Nov-16 to Mar-17 to create new features.
- days_since_function(dataframe,version): This function returns a dataframe with a new column called 'days_since' that tells us the how many days passed since the a user's last listening session and the last day of the month where we want to predict churn. While we believe this feature would add value and improve our prediction, we are currently having troubles to apply it on the whole user_logs.csv file due to it taking an immensely long time (approx. 400 hours). Therefore, all the rows related to days_since are #'d.
- date_features(dataframe,version): This function actually applies the feature creation from the functions above to the user_logs files.
- behaviour (dataframe): This function creates the new feature 'behaviour' and immediately applies it to the user_logs files. 'behaviour' is given by: (num_25 + ... + num_98.5)/num_100. Thus, 0 would mean that a user listens to every song entirely. We expect that users with changing listening behaviour will also have a changing likelihood to churn.


In [5]:
# grouping by msno
def grouping(dataframe):
    counts = dataframe.groupby('msno')['num_25'].count().reset_index()
    counts.columns = ['msno','entries']
    sums = dataframe.groupby('msno').sum().reset_index()
    dataframe = sums.merge(counts, how='inner', on='msno')
    
    return dataframe
    
def monthly_entries(dataframe,version):
    
    idx_mar17 = (dataframe['date'] > datetime.date(2017,2,28)) & (dataframe['date'] < datetime.date(2017,4,1))
    msno_counts = dataframe['msno'][idx_mar17].value_counts()
    entries_mar17 = pd.DataFrame({'msno':msno_counts.index, 'entries_mar17':msno_counts.values})
    
    idx_feb17 = (dataframe['date'] > datetime.date(2017,1,31)) & (dataframe['date'] < datetime.date(2017,3,1))
    msno_counts = dataframe['msno'][idx_feb17].value_counts()
    entries_feb17 = pd.DataFrame({'msno':msno_counts.index, 'entries_feb17':msno_counts.values})
    
    idx_jan17 = (dataframe['date'] > datetime.date(2016,12,31)) & (dataframe['date'] < datetime.date(2017,2,1))
    msno_counts = dataframe['msno'][idx_jan17].value_counts()
    entries_jan17 = pd.DataFrame({'msno':msno_counts.index, 'entries_jan17':msno_counts.values})

    idx_dec16 = (dataframe['date'] > datetime.date(2016,11,30)) & (dataframe['date'] < datetime.date(2017,1,1))
    msno_counts = dataframe['msno'][idx_dec16].value_counts()
    entries_dec16 = pd.DataFrame({'msno':msno_counts.index, 'entries_dec16':msno_counts.values})
    
    idx_nov16 = (dataframe['date'] > datetime.date(2016,10,31)) & (dataframe['date'] < datetime.date(2016,12,1))
    msno_counts = dataframe['msno'][idx_nov16].value_counts()
    entries_nov16 = pd.DataFrame({'msno':msno_counts.index, 'entries_nov16':msno_counts.values})
    
    if (version == 1):    
        monthly_entries_df = entries_nov16.merge(entries_dec16,on='msno',how='outer')
        monthly_entries_df = monthly_entries_df.merge(entries_jan17,on='msno',how='outer')
        monthly_entries_df = monthly_entries_df.merge(entries_feb17,on='msno',how='outer')
    elif (version == 2):
        monthly_entries_df = entries_mar17
        
    return monthly_entries_df

def monthly_secs(dataframe,version):
    
    idx_mar17 = (dataframe['date'] > datetime.date(2017,2,28)) & (dataframe['date'] < datetime.date(2017,4,1))
    secs_mar17 = user_logs_datef_secs[idx_mar17].groupby('msno').sum().reset_index()
    secs_mar17 = secs_mar17.rename(index=str, columns={"total_secs": "secs_mar17"})
    
    idx_feb17 = (dataframe['date'] > datetime.date(2017,1,31)) & (dataframe['date'] < datetime.date(2017,3,1))
    secs_feb17 = user_logs_datef_secs[idx_feb17].groupby('msno').sum().reset_index()
    secs_feb17 = secs_feb17.rename(index=str, columns={"total_secs": "secs_feb17"})
    
    idx_jan17 = (dataframe['date'] > datetime.date(2016,12,31)) & (dataframe['date'] < datetime.date(2017,2,1))
    secs_jan17 = user_logs_datef_secs[idx_jan17].groupby('msno').sum().reset_index()
    secs_jan17 = secs_jan17.rename(index=str, columns={"total_secs": "secs_jan17"})
    
    idx_dec16 = (dataframe['date'] > datetime.date(2016,11,30)) & (dataframe['date'] < datetime.date(2017,1,1))
    secs_dec16 = user_logs_datef_secs[idx_dec16].groupby('msno').sum().reset_index()
    secs_dec16 = secs_dec16.rename(index=str, columns={"total_secs": "secs_dec16"})
    
    idx_nov16 = (dataframe['date'] > datetime.date(2016,10,31)) & (dataframe['date'] < datetime.date(2016,12,1))
    secs_nov16 = user_logs_datef_secs[idx_nov16].groupby('msno').sum().reset_index()
    secs_nov16 = secs_nov16.rename(index=str, columns={"total_secs": "secs_nov16"})
    
    if (version == 1):
        monthly_secs_df = secs_dec16.merge(secs_nov16,on='msno',how='outer')
        monthly_secs_df = monthly_secs_df.merge(secs_jan17,on='msno',how='outer')
        monthly_secs_df = monthly_secs_df.merge(secs_feb17,on='msno',how='outer')
    elif (version == 2):
        monthly_secs_df = secs_mar17
            
    return monthly_secs_df

# days since last session (from Mar 2017, but needs to be put to Apr 2017 for test !!!!)
def days_since_function(dataframe,version):
    
    days_since = dataframe.groupby('msno', as_index=False)['date'].max()
    days_since['days_since_mar'] = 0
    days_since['days_since_apr'] = 0
    if (version == 1):
        days_since['days_since_mar'] = datetime.date(2017,3,31) - days_since['date'] 
        days_since['days_since_mar'] = days_since['days_since_mar'].dt.days
    elif (version == 2):
        days_since['days_since_apr'] = datetime.date(2017,4,30) - days_since['date'] 
        days_since['days_since_apr'] = days_since['days_since_apr'].dt.days
    
    return days_since

def date_features(dataframe,version):
    monthly_entries_df = monthly_entries(dataframe,version)
    monthly_secs_df = monthly_secs(dataframe,version)
    #days_since = days_since_function(dataframe,version)
    user_logs_grouped = grouping(dataframe)
    user_logs_eng = user_logs_grouped.merge(monthly_entries_df,on='msno',how='outer')
    user_logs_eng = user_logs_eng.merge(monthly_secs_df,on='msno',how='outer')
    #user_logs_eng = user_logs_eng.merge(days_since,on='msno',how='outer')
    
    return user_logs_eng

def behaviour (dataframe):
    num_columns = ['num_25','num_50','num_75','num_985','num_100']
    sum = 0
    for i in range(0,len(num_columns)-1):
        inpt = num_columns[i]
        sum = sum + dataframe[inpt]
    dataframe['behaviour'] = sum/dataframe['num_100'][dataframe['num_100']!=0]
    dataframe['behaviour'] = dataframe['behaviour'].fillna(value=1)
    
    return dataframe
    

## 2. Loading Files
### 2.1 user_logs.csv by chunks of 10e8 bytes

The user_logs.csv file has 30.51 GB and thus, due to memory restrictions, cannot be read in by normal computers in one go. Therefore, we use the parameter 'chunksize' from pandas' read_csv function to read in the file by chunks of 10^8 bytes and append them iteratively. While features like 'behaviour' can be derived from a grouped-by-msno file, which will be significantly smaller than the original file(around 400 times smaller), some of our features are derived from the whole dataset and thus need to be generated during the loop. This, in particular, includes all date-related features, as the group by msno will remove dates. Hence, all the features given by the functions monthly_entries and monthly_secs will have be generated in the loop. 
While we already group the chunks by msno in the loop, we need to do it again after the loop as msno's may appear in several different chunks.

In [159]:
# actual solution

start = time.time()

chunksize = 10 ** 8

# initialize user_logs_eng_v1 with only columns names for the loop
user_logs_eng_v1 = pd.read_csv('data/user_logs_20m.csv', nrows=0)
user_logs_eng_v1 = user_logs_eng_v1.drop('date',axis=1)
user_logs_eng_v1 = grouping(user_logs_eng_v1)

for chunk in pd.read_csv('/Volumes/TOSHIBA EXT/Files/user_logs.csv', chunksize=chunksize):
    
    # Changing date types
    chunk['date'] = chunk.date.apply(lambda x: dt.strptime(str(int(x)), "%Y%m%d").date() if pd.notnull(x) else "NAN" )

    # date features
    chunk = date_features(chunk,1)
        
    # append
    user_logs_eng_v1 = user_logs_eng_v1.append(chunk)

# group again the msno's, as they may appear in more than one chunk
# days_since = user_logs_eng_v1.groupby('msno', as_index=False)['day_since'].max()
user_logs_eng_v1 = user_logs_eng_v1.groupby('msno').sum().reset_index()
# user_logs_eng_v1['days_since'] = days_since['days_since']

# adding the 'behaviour' column
user_logs_eng_v1 = behaviour(user_logs_eng_v1)

# make this part immortal
user_logs_eng_immortal2 = user_logs_eng_v1.copy()
user_logs_eng_immortal2.to_csv('data/user_logs_eng_immortal.csv', index=False)
print("Done!")

print('Time =',time.time() - start)


Done!
Time = 9628.721282720566


### 2.2 Loading user_logs_v2.csv file

User_logs_v2.csv is the same dataset than user_logs.csv, except that it only has entries from Mar-17. In contrast user_logs.csv has entries from 2015 to Feb-17. Thus the only features that we create in user_logs_v2.csv are 'behaviour' and entries and total_secs for Mar-17. As the file is only 1.43 GB, we don't need to read it in in a loop.

In [91]:
start = time.time()

# loading user_logs_v2
user_logs_v2 = pd.read_csv('data/user_logs_v2.csv')
print(user_logs_v2.shape)
print('unique msnos in the dataset: ',user_logs_v2['msno'].nunique())

# Changing date types
user_logs_v2['date'] = user_logs_v2.date.apply(lambda x: dt.strptime(str(int(x)), "%Y%m%d").date() if pd.notnull(x) else "NAN" )

# date features
user_logs_v2 = date_features(user_logs_v2,2)
print('rows in the new one: ',user_logs_v2.shape[0])

# adding the 'behaviour' column
user_logs_v2 = behaviour(user_logs_v2)

# group to remove NaN values
user_logs_v2 = user_logs_v2.groupby('msno').sum().reset_index()

# removing v1 parts
#user_logs_v2 = user_logs_v2[['msno','entries_mar17','secs_mar17']]
print(user_logs_v2.shape)

user_logs_v2_immortal = user_logs_v2.copy()
user_logs_v2_immortal.to_csv('data/user_logs_v2_immortal.csv', index=False)

print("Done!")
print('Time =',time.time() - start)

print(user_logs_v2.isnull().sum())


(18396362, 9)
unique msnos in the dataset:  1103894
rows in the new one:  1103894
(1103894, 12)
Done!
Time = 425.85778403282166
msno             0
num_25           0
num_50           0
num_75           0
num_985          0
num_100          0
num_unq          0
total_secs       0
entries          0
entries_mar17    0
secs_mar17       0
behaviour        0
dtype: int64


In [97]:
print(user_logs_v2.head())

                                           msno  num_25  num_50  num_75  \
0  +++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=      86      11      10   
1  +++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=     191      90      75   
2  +++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=      43      12      15   
3  +++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=     207     163     100   
4  ++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=     105      24      39   

   num_985  num_100  num_unq  total_secs  entries  entries_mar17  secs_mar17  \
0        5      472      530  117907.425       26             26  117907.425   
1      144      589      885  192527.892       31             31  192527.892   
2       12      485      468  115411.260       28             28  115411.260   
3       64      436      828  149896.558       21             21  149896.558   
4       35      479      230  116433.247       29             29  116433.247   

   behaviour  
0   0.237288  
1   0.848896  
2   0.169072  
3   1.22

In [5]:
user_logs_v1 = pd.read_csv('data/user_logs_eng_immortal.csv')

In [6]:
# the two lines below only had to be implemented because I had a typo in the tab where I load the 30GB file
# in order to not load it again I corrected it below
user_logs_v1['entries_nov16'] = user_logs_v1['entries_nov16'] - user_logs_v1['entries_dec16'] - user_logs_v1['entries_jan17'] - user_logs_v1['entries_feb17']
user_logs_v1['secs_nov16'] = user_logs_v1['secs_nov16'] - user_logs_v1['secs_dec16'] - user_logs_v1['secs_jan17'] - user_logs_v1['secs_feb17']



In [7]:
user_logs_v2 = pd.read_csv('data/user_logs_v2_immortal.csv')

In [8]:
#user_logs_check = pd.read_csv('data/user_logs_20m.csv')

In [9]:
#user_logs_check[user_logs_check['msno'] == '+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=']

### 2.3 Loading Train files

In [10]:
# loading train files
train1 = pd.read_csv('data/train.csv')
train2 = pd.read_csv('data/train_v2.csv')
submission = pd.read_csv('data/sample_submission_v2.csv')

### 2.4 Removing negative total_secs

We had previously seen in the Data Exploration part that there are some entries with negative total_secs. We simply put those entries to 0.

In [11]:
user_logs_v1['total_secs'][user_logs_v1['total_secs'] < 0] = 0
user_logs_v1['secs_nov16'][user_logs_v1['secs_nov16'] < 0] = 0
user_logs_v1['secs_dec16'][user_logs_v1['secs_dec16'] < 0] = 0
user_logs_v1['secs_jan17'][user_logs_v1['secs_jan17'] < 0] = 0
user_logs_v1['secs_feb17'][user_logs_v1['secs_feb17'] < 0] = 0

user_logs_v2['total_secs'][user_logs_v2['total_secs'] < 0] = 0
user_logs_v2['secs_mar17'][user_logs_v2['secs_mar17'] < 0] = 0

print(user_logs_v1['total_secs'].describe())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

count    5.234111e+06
mean     6.033205e+10
std      2.352035e+13
min      0.000000e+00
25%      1.025253e+03
50%      7.664507e+03
75%      1.230398e+05
max      9.223372e+15
Name: total_secs, dtype: float64


In [12]:
print(user_logs_v2.describe())

             num_25        num_50        num_75       num_985       num_100  \
count  1.103894e+06  1.103894e+06  1.103894e+06  1.103894e+06  1.103894e+06   
mean   1.031795e+02  2.514392e+01  1.568800e+01  1.799658e+01  5.046563e+02   
std    1.722185e+02  3.856735e+01  2.221605e+01  6.143512e+01  7.606139e+02   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    1.500000e+01  4.000000e+00  2.000000e+00  2.000000e+00  7.500000e+01   
50%    5.000000e+01  1.300000e+01  9.000000e+00  9.000000e+00  2.710000e+02   
75%    1.260000e+02  3.200000e+01  2.100000e+01  2.200000e+01  6.280000e+02   
max    4.682000e+04  3.163000e+03  1.690000e+03  1.516400e+04  5.767500e+04   

            num_unq    total_secs       entries  entries_mar17    secs_mar17  \
count  1.103894e+06  1.103894e+06  1.103894e+06   1.103894e+06  1.103894e+06   
mean   4.838865e+02  1.317335e+05  1.666497e+01   1.666497e+01  1.317335e+05   
std    5.952788e+02  1.852267e+05  1.030333e+01 

## 3. Additional Processing
We will now create 3 different files. We currently have three different is_churn dates we have to look at. Train.csv gives us is_churn in Feb-17, train_v2.csv in Mar-17 and for the submission we are required to predict Apr-17. As our most up-to-date data is from Mar-17, we can always just use data from the previous month to predict is_churn.

(THIS IS REALLY IMPORTANT TO UNDERSTAND SO I EXPLAIN IT AGAIN)

We are given a dataset with user data that includes user listening session and user transactions from 2015 to Mar-17. We are supposed to use this data to predict is_churn for Apr-17. This means that we must predict is_churn for the month after the month we have our most recent data. 

As we must train our models under exactly the same conditions than we 'test' them on (meaning we create predictions on), we have to make sure that the data we use to train the model is also from the month before the is_churn.

Thus, as we use is_churn from Mar-17 (train_v2.csv) for our train model, we have to use data from 2015 to Feb-17 as the input. 

For testing our prediction without uploading it to Kaggle, we can use is_churn from Feb-17 (train.csv) to train and is_churn from Mar-17 to check our prediction. Thus, for the train model we will data from 2015 to Jan-17. This means that we will have to manually remove all data after Jan-17 from the user_logs.csv and transactions.csv.

- One with is_churn data from Feb-17 and user_logs data from 2015 to Jan-17.
- One with is_churn data from Mar-17 and user_logs data from 2015 to Feb-17.
- One which will be used to predict is_churn in Apr-17 and with user_logs data from 2015 to Mar-17.

### 3.1 Merging with Train files

In [13]:
# JUST NOTICED: FOR VERSION 1 I ALSO NEED TO NOT ADD ABSOLUTE VALUES FROM THE LAST MONTHS #

# first dataset: is_churn from train1
# we drop entries and secs from February 17 due to the reasons given in the explanation above
user_logs_1 = user_logs_v1.drop(columns=['entries_feb17','secs_feb17'],axis=1)
user_logs_1 = user_logs_1.merge(train1,on='msno',how='right')

# second dataset: is_churn from train2
user_logs_2 = user_logs_v1.merge(train2,on='msno',how='right')

# third dataset
# as user_logs_v2 only contains data from Mar-17 we have to merge it with user_logs_v1 to add the data from 2015 to Feb-17.
user_logs_3 = user_logs_v2.merge(user_logs_v1[['msno','secs_dec16','secs_feb17','secs_jan17','secs_nov16','entries_nov16','entries_dec16','entries_feb17','entries_jan17']],on='msno',how='outer')
user_logs_3 = user_logs_3.merge(submission,on='msno',how='right')


In [14]:
print(train1.shape)
print(train2.shape)


(992931, 2)
(970960, 2)


In [15]:
print(user_logs_1.shape)
print(user_logs_2.shape)
print(user_logs_3.shape)

print(user_logs_1.head())
print(user_logs_2.head())
print(user_logs_3.head())

(992931, 17)
(970960, 19)
(907471, 21)
                                           msno  entries  entries_dec16  \
0  +++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=    103.0           31.0   
1  +++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=    609.0           31.0   
2  +++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=    603.0           24.0   
3  ++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=    292.0           26.0   
4  ++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=    445.0           18.0   

   entries_jan17  entries_nov16  num_100  num_25  num_50  num_75  num_985  \
0           30.0           15.0   2231.0   572.0   289.0   284.0    504.0   
1           31.0           29.0  15572.0  1396.0   494.0   420.0    561.0   
2           27.0           17.0  23807.0  6688.0  2213.0  1476.0   1402.0   
3           27.0           26.0   5092.0  1438.0   449.0   350.0    471.0   
4           27.0           19.0   4803.0  1083.0   540.0   299.0    415.0   

   num_unq  secs_dec16  secs_jan17  secs_nov16 

In [16]:
print(user_logs_1.describe())


             entries  entries_dec16  entries_jan17  entries_nov16  \
count  869926.000000  869926.000000  869926.000000  869926.000000   
mean      282.261706      15.383880      15.649394      14.380819   
std       224.093097      10.928904      10.408171      10.781273   
min         1.000000       0.000000       0.000000       0.000000   
25%        83.000000       5.000000       6.000000       3.000000   
50%       234.000000      16.000000      16.000000      15.000000   
75%       453.000000      26.000000      25.000000      25.000000   
max       790.000000      31.000000      31.000000      30.000000   

             num_100         num_25         num_50         num_75  \
count  869926.000000  869926.000000  869926.000000  869926.000000   
mean     8644.731970    1836.855691     456.326159     285.663154   
std     12682.459724    2940.346760     635.687247     386.645811   
min         0.000000       0.000000       0.000000       0.000000   
25%      1225.000000     279.0000

As we created the new features, some entries ended up being Null values. For features that contain the number of entries, seconds or songs, we set Null to 0. For 'behaviour', we set Null to 1, as 0 is not neutral enough.

In [17]:
print(user_logs_1.isnull().sum())
print(user_logs_2.isnull().sum())
print(user_logs_3.isnull().sum())

msno                  0
entries          123005
entries_dec16    123005
entries_jan17    123005
entries_nov16    123005
num_100          123005
num_25           123005
num_50           123005
num_75           123005
num_985          123005
num_unq          123005
secs_dec16       123005
secs_jan17       123005
secs_nov16       123005
total_secs       123005
behaviour        123005
is_churn              0
dtype: int64
msno                  0
entries          120664
entries_dec16    120664
entries_feb17    120664
entries_jan17    120664
entries_nov16    120664
num_100          120664
num_25           120664
num_50           120664
num_75           120664
num_985          120664
num_unq          120664
secs_dec16       120664
secs_feb17       120664
secs_jan17       120664
secs_nov16       120664
total_secs       120664
behaviour        120664
is_churn              0
dtype: int64
msno                  0
num_25           207242
num_50           207242
num_75           207242
num_985       

In [18]:
Null_to_0_columns = ['secs_dec16','secs_feb17','secs_jan17','entries_dec16','entries_feb17','entries_jan17',
                     'entries_mar17','entries_nov16','secs_mar17','secs_nov16','entries','num_25','num_50','num_75','num_985','num_100',
                     'num_unq','total_secs']

for i in range(0,len(Null_to_0_columns)):
    inpt = Null_to_0_columns[i]
    if (inpt in user_logs_1.columns):
        user_logs_1[inpt] = user_logs_1[inpt].fillna(value=0)
    if (inpt in user_logs_2.columns):
        user_logs_2[inpt] = user_logs_2[inpt].fillna(value=0)
    if (inpt in user_logs_3.columns):
        user_logs_3[inpt] = user_logs_3[inpt].fillna(value=0)
 
user_logs_1['behaviour'] = user_logs_1['behaviour'].fillna(value=1)
user_logs_2['behaviour'] = user_logs_2['behaviour'].fillna(value=1)
user_logs_3['behaviour'] = user_logs_3['behaviour'].fillna(value=1)

In [19]:
print(user_logs_1.isnull().sum())
print(user_logs_2.isnull().sum())
print(user_logs_3.isnull().sum())

msno             0
entries          0
entries_dec16    0
entries_jan17    0
entries_nov16    0
num_100          0
num_25           0
num_50           0
num_75           0
num_985          0
num_unq          0
secs_dec16       0
secs_jan17       0
secs_nov16       0
total_secs       0
behaviour        0
is_churn         0
dtype: int64
msno             0
entries          0
entries_dec16    0
entries_feb17    0
entries_jan17    0
entries_nov16    0
num_100          0
num_25           0
num_50           0
num_75           0
num_985          0
num_unq          0
secs_dec16       0
secs_feb17       0
secs_jan17       0
secs_nov16       0
total_secs       0
behaviour        0
is_churn         0
dtype: int64
msno             0
num_25           0
num_50           0
num_75           0
num_985          0
num_100          0
num_unq          0
total_secs       0
entries          0
entries_mar17    0
secs_mar17       0
behaviour        0
secs_dec16       0
secs_feb17       0
secs_jan17       0
secs_

### 3.2 Adding ratios

We will create a bunch of ratios in order to to quantify trends in user behaviour. These ratios are: 
- cVp_entries = current month entries / previous month entries (where current always depicts the month before is_churn is given) (e.g. if is_churn is given for Feb-17, current month will be January)
- pVpp_entries = previous month / previous previous month (FOR LATER!)
- cVp_ and pVpp_secs = same logic than above

In [20]:
print(user_logs_1.shape)
print(user_logs_2.shape)
print(user_logs_3.shape)

(992931, 17)
(970960, 19)
(907471, 21)


In [21]:
# is_churn in Feb
ul_1 = user_logs_1.copy()

ul_1['cVp_entries'] = 0
# avoid NaN or inf values when denominator is 0
idx_1 = (ul_1['entries_dec16'] == 0) 
ul_1['cVp_entries'][idx_1] = ul_1['entries_jan17'][idx_1]
ul_1['cVp_entries'][~idx_1] = ul_1['entries_jan17'][~idx_1]/ul_1['entries_dec16'][~idx_1]

ul_1['pVpp_entries'] = 0
# avoid NaN or inf values when denominator is 0
idx_1 = (ul_1['entries_nov16'] == 0) 
ul_1['pVpp_entries'][idx_1] = ul_1['entries_dec16'][idx_1]
ul_1['pVpp_entries'][~idx_1] = ul_1['entries_dec16'][~idx_1]/ul_1['entries_nov16'][~idx_1]

ul_1['cVp_secs'] = 0
# avoid NaN or inf values when denominator is 0
idx_1 = (ul_1['secs_dec16'] == 0) 
ul_1['cVp_secs'][idx_1] = ul_1['secs_jan17'][idx_1]
ul_1['cVp_secs'][~idx_1] = ul_1['secs_jan17'][~idx_1]/ul_1['secs_dec16'][~idx_1]

ul_1['pVpp_secs'] = 0
# avoid NaN or inf values when denominator is 0
idx_1 = (ul_1['secs_nov16'] == 0) 
ul_1['pVpp_secs'][idx_1] = ul_1['secs_dec16'][idx_1]
ul_1['pVpp_secs'][~idx_1] = ul_1['secs_dec16'][~idx_1]/ul_1['secs_nov16'][~idx_1]
         
# is_churn in Mar
ul_2 = user_logs_2.copy()
         
ul_2['cVp_entries'] = 0
idx_1 = (ul_2['entries_jan17'] == 0) 
ul_2['cVp_entries'][idx_1] = ul_2['entries_feb17'][idx_1]
ul_2['cVp_entries'][~idx_1] = ul_2['entries_feb17'][~idx_1]/ul_2['entries_jan17'][~idx_1]

ul_2['pVpp_entries'] = 0
idx_1 = (ul_2['entries_dec16'] == 0) 
ul_2['pVpp_entries'][idx_1] = ul_2['entries_jan17'][idx_1]
ul_2['pVpp_entries'][~idx_1] = ul_2['entries_jan17'][~idx_1]/ul_2['entries_dec16'][~idx_1]
  
ul_2['cVp_secs'] = 0
idx_1 = (ul_2['secs_jan17'] == 0) 
ul_2['cVp_secs'][idx_1] = ul_2['secs_feb17'][idx_1]
ul_2['cVp_secs'][~idx_1] = ul_2['secs_feb17'][~idx_1]/ul_2['secs_jan17'][~idx_1]

ul_2['pVpp_secs'] = 0
idx_1 = (ul_2['secs_dec16'] == 0) 
ul_2['pVpp_secs'][idx_1] = ul_2['secs_jan17'][idx_1]
ul_2['pVpp_secs'][~idx_1] = ul_2['secs_jan17'][~idx_1]/ul_2['secs_dec16'][~idx_1]

# is_churn in April
ul_3 = user_logs_3.copy()

ul_3['cVp_entries'] = 0
idx_1 = (ul_3['entries_feb17'] == 0) 
ul_3['cVp_entries'][idx_1] = ul_3['entries_mar17'][idx_1]
ul_3['cVp_entries'][~idx_1] = ul_3['entries_mar17'][~idx_1]/ul_3['entries_feb17'][~idx_1]

ul_3['pVpp_entries'] = 0
idx_1 = (ul_3['entries_jan17'] == 0) 
ul_3['pVpp_entries'][idx_1] = ul_3['entries_feb17'][idx_1]
ul_3['pVpp_entries'][~idx_1] = ul_3['entries_feb17'][~idx_1]/ul_3['entries_jan17'][~idx_1]

ul_3['cVp_secs'] = 0
idx_1 = (ul_3['secs_feb17'] == 0) 
ul_3['cVp_secs'][idx_1] = ul_3['secs_mar17'][idx_1]
ul_3['cVp_secs'][~idx_1] = ul_3['secs_mar17'][~idx_1]/ul_3['secs_feb17'][~idx_1]

ul_3['pVpp_secs'] = 0
idx_1 = (ul_3['secs_jan17'] == 0) 
ul_3['pVpp_secs'][idx_1] = ul_3['secs_feb17'][idx_1]
ul_3['pVpp_secs'][~idx_1] = ul_3['secs_feb17'][~idx_1]/ul_3['secs_jan17'][~idx_1]




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying 

In [22]:
print(ul_1.isnull().sum())
print(ul_1[ul_1 == np.inf].count())

msno             0
entries          0
entries_dec16    0
entries_jan17    0
entries_nov16    0
num_100          0
num_25           0
num_50           0
num_75           0
num_985          0
num_unq          0
secs_dec16       0
secs_jan17       0
secs_nov16       0
total_secs       0
behaviour        0
is_churn         0
cVp_entries      0
pVpp_entries     0
cVp_secs         0
pVpp_secs        0
dtype: int64
msno             0
entries          0
entries_dec16    0
entries_jan17    0
entries_nov16    0
num_100          0
num_25           0
num_50           0
num_75           0
num_985          0
num_unq          0
secs_dec16       0
secs_jan17       0
secs_nov16       0
total_secs       0
behaviour        0
is_churn         0
cVp_entries      0
pVpp_entries     0
cVp_secs         0
pVpp_secs        0
dtype: int64


In [23]:
print(ul_1.shape)
print(ul_2.shape)
print(ul_3.shape)

(992931, 21)
(970960, 23)
(907471, 25)


### 3.2 Synchronizing datasets

In a next step, we need to make sure that the date related features in our datasets are not in absolut terms (e.g. feb17) but instead relative to the date where is_churn is derived from (e.g. m1 = 1 month before). Indeed, entries from Feb-17 should have the same meaning for is_churn in Mar-17 than entries from Mar-17 for is_churn in Apr-17.
Thus we will rename the features so that we get the following structure:
- entries_m1 = entries in the month before is_churn is tested
- entries_m2 = entries in the month prior prior to where is_churn is tested
- secs_m1 and secs_m2: same logic as above

Also, to make sure every file has the same amount of features, we had to delete the features of entries and total_secs that where not within three months of the is_churn date.

In [30]:
ul_1_final = ul_1.rename(index=str, columns={"entries_jan17": "entries_m1", 
                                                     "entries_dec16": "entries_m2", 
                                                     "entries_nov16": "entries_m3", 
                                                    "secs_jan17": "secs_m1",
                                                    "secs_dec16": "secs_m2",
                                                    "secs_nov16": "secs_m3"})

ul_2_final = ul_2.drop(columns=['entries_nov16','secs_nov16'],axis=1)
ul_2_final = ul_2_final.rename(index=str, columns={"entries_feb17": "entries_m1", 
                                                     "entries_jan17": "entries_m2",
                                                   "entries_dec16": "entries_m3",
                                                    "secs_feb17": "secs_m1",
                                                    "secs_jan17": "secs_m2",
                                                  "secs_dec16": "secs_m3"})

ul_3_final = ul_3.drop(columns=['entries_dec16','secs_dec16','entries_nov16','secs_nov16'],axis=1)
ul_3_final = ul_3_final.rename(index=str, columns={"entries_mar17": "entries_m1", 
                                                     "entries_feb17": "entries_m2",
                                                   "entries_jan17": "entries_m3",
                                                    "secs_mar17": "secs_m1",
                                                    "secs_feb17": "secs_m2",
                                                  "secs_jan17": "secs_m3"})

Thanks to the procedures above, our three datasets now have exactly the same features and can be used to train/test on each other.

In [31]:
print(ul_1_final.shape)
print(ul_2_final.shape)
print(ul_3_final.shape)

(992931, 21)
(970960, 21)
(907471, 21)


In [32]:
print(ul_3_final.head())
print(ul_2_final.head())
print(ul_1_final.head())

                                           msno  num_25  num_50  num_75  \
0  +++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=   191.0    90.0    75.0   
1  +++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=   207.0   163.0   100.0   
2  ++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=   105.0    24.0    39.0   
3  ++0/NopttBsaAn6qHZA2AWWrDg7Me7UOMs1vsyo4tSI=    21.0     8.0    17.0   
4  ++0BJXY8tpirgIhJR14LDM1pnaRosjD1mdO1mIKxlJA=    27.0    15.0    10.0   

   num_985  num_100  num_unq  total_secs  entries  entries_m1    ...      \
0    144.0    589.0    885.0  192527.892     31.0        31.0    ...       
1     64.0    436.0    828.0  149896.558     21.0        21.0    ...       
2     35.0    479.0    230.0  116433.247     29.0        29.0    ...       
3      7.0    104.0    115.0   28450.268      8.0         8.0    ...       
4      4.0    115.0    163.0   31788.296      9.0         9.0    ...       

   behaviour     secs_m2     secs_m3  entries_m2  entries_m3  is_churn  \
0   0.848896  1565

In a final step, we make sure that the order of the features is the same for every file.

In [34]:
ul_3_final['is_churn'] = 0
ul_2_final = ul_2_final[ul_1_final.columns]
ul_3_final = ul_3_final[ul_1_final.columns]

In [28]:
hmm = ul_2_final.columns == ul_3_final.columns
print(hmm)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


## 4. Downloading Files

In [29]:
# download file
ul_1_final.to_csv('data/user_logs_feb.csv', index=False)
print("Done (1/3)!")
ul_2_final.to_csv('data/user_logs_mar.csv', index=False)
print("Done (2/3)!")
ul_3_final.to_csv('data/user_logs_apr.csv', index=False)
print("Done (3/3)!")


Done (1/3)!
Done (2/3)!
Done (3/3)!
