This Python notebook generates features from "transactions_v2.csv" and exports the result as "final_transactions.csv" into the "data" folder.
Further changes to "transactions_v2" can be made in this notebook.
The notebook "algorithm_solution" then loads "final_transactions.csv".

In [1]:
#Import the relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mpld3
import seaborn as sns
import matplotlib.dates as mdates
import time
from datetime import datetime
from pandas.lib import Timestamp


# Configure pandas: widen the console display to 200 characters
pd.set_option('display.width', 200)

  # This is added back by InteractiveShellApp.init_path()


## 1. Data import and Feature Engineering

In [2]:
# Read the raw transactions table from the local data folder
transactions = pd.read_csv(filepath_or_buffer="data/transactions_v2.csv")

In [3]:
# Preview the first five rows of the raw transactions table
print("Transactions:")
transactions.head(5)

Transactions:


Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


In [4]:
# Size of the raw table as a (rows, columns) tuple
print('Number of rows & columns "TRANSACTIONS": ', (len(transactions.index), len(transactions.columns)))

Number of rows & columns "TRANSACTIONS":  (1431009, 9)


### 1.1. Feature 0. Feature computed as the difference between the "plan price" and "amount paid"

In [5]:
# Feature 0: list price of the plan minus the amount the customer actually paid
transactions['diff_plan_actual'] = transactions['plan_list_price'].sub(transactions['actual_amount_paid'])
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,diff_plan_actual
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1,0
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0,0


In [6]:
# Size check after adding diff_plan_actual, as a (rows, columns) tuple
print('Number of rows & columns "TRANSACTIONS": ', (len(transactions.index), len(transactions.columns)))

Number of rows & columns "TRANSACTIONS":  (1431009, 10)


### 1.2. Feature 1: discount
I create a discount column to see how much discount was offered to the customer.

In [7]:
# Feature 1: discount granted = list price - amount paid.
# NOTE(review): this is the same computation as 'diff_plan_actual', so the two
# columns carry identical values -- consider keeping only one of them.
transactions['discount'] = transactions.plan_list_price - transactions.actual_amount_paid

# To inspect the distinct discount values: transactions['discount'].unique()

In [8]:
# Size check after adding discount, as a (rows, columns) tuple
print('Number of rows & columns "TRANSACTIONS": ', (len(transactions.index), len(transactions.columns)))

Number of rows & columns "TRANSACTIONS":  (1431009, 11)


### 1.3. Feature 2: is_discount
Feature to check whether the customer has availed any discount or not

In [9]:
# Feature 2: 1 when the customer received a discount (discount > 0), else 0.
# The comparison is vectorized instead of calling a Python lambda once per row
# via .apply(); it produces the same 0/1 integer column (missing values compare
# as False and therefore map to 0, exactly as before).
transactions['is_discount'] = (transactions['discount'] > 0).astype('int64')
# To verify that only 0 and 1 occur: transactions['is_discount'].unique()

In [10]:
# Size check after adding is_discount, as a (rows, columns) tuple
print('Number of rows & columns "TRANSACTIONS": ', (len(transactions.index), len(transactions.columns)))

Number of rows & columns "TRANSACTIONS":  (1431009, 12)


### 1.4. Feature 3: membership duration
Difference between transaction_date and membership_expire_date. We find the difference in terms of days and months. We keep this result as integer.

In [11]:
# TODO: NOT DONE YET! Planned feature: membership duration, i.e. the difference
# between transaction_date and membership_expire_date, kept as an integer.

### 1.5. Feature Engineering: "trans1". One-hot encode the "payment_method_id" feature and test algorithm

In [12]:
# One-hot encode payment_method_id.
# Instead of one variable with values from 2-41, every payment method gets its
# own 0/1 column; the algorithm performs better on that representation.

# Indicator columns, named payment_method_id_<value>
payment_method_id_encode = pd.get_dummies(transactions['payment_method_id'], prefix='payment_method_id')

# Replace the original column by its indicator columns.
# drop() returns a new frame, so `transactions` itself is not modified.
trans1 = transactions.drop('payment_method_id', axis='columns').join(payment_method_id_encode)

trans1.head()

Unnamed: 0,msno,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,diff_plan_actual,discount,...,payment_method_id_32,payment_method_id_33,payment_method_id_34,payment_method_id_35,payment_method_id_36,payment_method_id_37,payment_method_id_38,payment_method_id_39,payment_method_id_40,payment_method_id_41
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,90,298,298,0,20170131,20170504,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,30,149,149,1,20150809,20190412,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,30,180,180,1,20170303,20170422,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,30,180,180,1,20170329,20170331,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,30,99,99,1,20170323,20170423,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Compare the table before and after one-hot encoding payment_method_id:
# only the number of columns may change. The second label now names the frame
# it actually reports on (it previously also said "TRANSACTIONS").
print('Number of rows & columns "TRANSACTIONS": ', transactions.shape)
print('Number of rows & columns "TRANS1": ', trans1.shape)
# Same number of rows! Enforced here rather than checked by eye.
assert trans1.shape[0] == transactions.shape[0], 'one-hot encoding changed the number of rows'

Number of rows & columns "TRANSACTIONS":  (1431009, 12)
Number of rows & columns "TRANSACTIONS":  (1431009, 48)


### 1.6. Feature Engineering: "trans2". One-hot encode the "payment_plan_days" feature and test algorithm

In [14]:
# One-hot encode payment_plan_days on top of trans1.

# Indicator columns, named payment_plan_days_<value>
payment_plan_days_encode = pd.get_dummies(trans1['payment_plan_days'], prefix='payment_plan_days')

# Replace payment_plan_days by its indicator columns.
# drop() returns a new frame, so `trans1` itself is not modified.
trans2 = trans1.drop('payment_plan_days', axis='columns').join(payment_plan_days_encode)

trans2.head()

Unnamed: 0,msno,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,diff_plan_actual,discount,is_discount,...,payment_plan_days_230,payment_plan_days_240,payment_plan_days_270,payment_plan_days_360,payment_plan_days_365,payment_plan_days_395,payment_plan_days_400,payment_plan_days_410,payment_plan_days_415,payment_plan_days_450
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,298,298,0,20170131,20170504,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,149,149,1,20150809,20190412,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,180,180,1,20170303,20170422,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,180,180,1,20170329,20170331,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,99,99,1,20170323,20170423,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Compare the original table with the frame after one-hot encoding
# payment_plan_days: only the number of columns may change. The second label
# now names the frame it actually reports on (it previously also said "TRANSACTIONS").
print('Number of rows & columns "TRANSACTIONS": ', transactions.shape)
print('Number of rows & columns "TRANS2": ', trans2.shape)
# Same number of rows! Enforced here rather than checked by eye.
assert trans2.shape[0] == transactions.shape[0], 'one-hot encoding changed the number of rows'

Number of rows & columns "TRANSACTIONS":  (1431009, 12)
Number of rows & columns "TRANSACTIONS":  (1431009, 78)


### 1.7. Feature Engineering: "trans3". One-hot encode the "plan_list_price" feature and test algorithm

In [16]:
# One-hot encode plan_list_price on top of trans2.

# Indicator columns, named plan_list_price_<value>
plan_list_price_encode = pd.get_dummies(trans2['plan_list_price'], prefix='plan_list_price')

# Replace plan_list_price by its indicator columns.
# drop() returns a new frame, so `trans2` itself is not modified.
trans3 = trans2.drop('plan_list_price', axis='columns').join(plan_list_price_encode)

trans3.head()

Unnamed: 0,msno,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,diff_plan_actual,discount,is_discount,payment_method_id_2,...,plan_list_price_1000,plan_list_price_1150,plan_list_price_1200,plan_list_price_1260,plan_list_price_1299,plan_list_price_1300,plan_list_price_1399,plan_list_price_1599,plan_list_price_1788,plan_list_price_2000
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,298,0,20170131,20170504,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,149,1,20150809,20190412,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,180,1,20170303,20170422,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,180,1,20170329,20170331,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,99,1,20170323,20170423,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Compare the original table with the frame after one-hot encoding
# plan_list_price: only the number of columns may change. The second label
# now names the frame it actually reports on (it previously also said "TRANSACTIONS").
print('Number of rows & columns "TRANSACTIONS": ', transactions.shape)
print('Number of rows & columns "TRANS3": ', trans3.shape)
# Same number of rows! Enforced here rather than checked by eye.
assert trans3.shape[0] == transactions.shape[0], 'one-hot encoding changed the number of rows'

Number of rows & columns "TRANSACTIONS":  (1431009, 12)
Number of rows & columns "TRANSACTIONS":  (1431009, 125)


### 1.8. Check dataset

In [19]:
# Column count of the final frame, followed by the column labels themselves
print(trans3.shape[1])
trans3.columns

125


Index(['msno', 'actual_amount_paid', 'is_auto_renew', 'transaction_date', 'membership_expire_date', 'is_cancel', 'diff_plan_actual', 'discount', 'is_discount', 'payment_method_id_2',
       ...
       'plan_list_price_1000', 'plan_list_price_1150', 'plan_list_price_1200', 'plan_list_price_1260', 'plan_list_price_1299', 'plan_list_price_1300', 'plan_list_price_1399', 'plan_list_price_1599',
       'plan_list_price_1788', 'plan_list_price_2000'],
      dtype='object', length=125)

#### 1.8.1. Check for missing values

In [20]:
# Count missing values per column -- every count should be 0!
trans3.isnull().sum(axis=0)

msno                      0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
diff_plan_actual          0
discount                  0
is_discount               0
payment_method_id_2       0
payment_method_id_3       0
payment_method_id_5       0
payment_method_id_6       0
payment_method_id_8       0
payment_method_id_10      0
payment_method_id_11      0
payment_method_id_12      0
payment_method_id_13      0
payment_method_id_14      0
payment_method_id_15      0
payment_method_id_16      0
payment_method_id_17      0
payment_method_id_18      0
payment_method_id_19      0
payment_method_id_20      0
payment_method_id_21      0
payment_method_id_22      0
payment_method_id_23      0
payment_method_id_24      0
payment_method_id_25      0
                         ..
plan_list_price_150       0
plan_list_price_180       0
plan_list_price_210       0
plan_list_price_265       0
plan_list_price_298 

## 2. Export file
Note: this can take a few minutes!

In [21]:
# Write the fully encoded frame to disk for the "algorithm_solution" notebook;
# index=False keeps the row index out of the CSV.
trans3.to_csv(path_or_buf='data/final_transactions.csv', index=False)
print('Done!')

Done!
