This notebook generates features from "members.csv" and exports the result as "final_members" into the folder called "data". Further changes to "members" can be made in this notebook.
In the notebook "algorithm_solution", load "final_members".

In [113]:
#Import the relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mpld3
import seaborn as sns
import matplotlib.dates as mdates
import time
from datetime import datetime


#Configure pandas: set the display width to 200 characters
pd.set_option("display.width", 200)

In [114]:
#Read the raw member table from the data folder into a DataFrame
members = pd.read_csv(filepath_or_buffer="data/members_v3.csv")


In [115]:
#Preview the first rows of members as plain text, below a heading line
print("members:", members.head(), sep="\n")


members:
                                           msno  city  bd  gender  registered_via  registration_init_time
0  Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=     1   0     NaN              11                20110911
1  +tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=     1   0     NaN               7                20110914
2  cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=     1   0     NaN              11                20110915
3  9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=     1   0     NaN              11                20110915
4  WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=     6  32  female               9                20110915


# Feature 1: one-hot encode the city variable and save the result in final1

In [116]:
#One-hot encode the cities.
#Instead of a single variable called city with values from 1-22, the algorithm performs better
#with one 0/1 indicator column per city -> one-hot encoding

#Build one indicator column per city value, named city_<value>
city_encode = pd.get_dummies(members['city'], prefix='city')

#Replace the raw city column (no longer needed) with the indicator columns, joined on the row index
final1 = members.drop('city', axis=1).join(city_encode)

final1.head()


Unnamed: 0,msno,bd,gender,registered_via,registration_init_time,city_1,city_3,city_4,city_5,city_6,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,0,,11,20110911,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,0,,7,20110914,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,0,,11,20110915,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,0,,11,20110915,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,32,female,9,20110915,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Feature 2, save the year and the month as two new columns in final1

In [117]:
#Transform the integer dates (YYYYMMDD) into pandas datetimes.
#Missing values become NaT instead of the string "NAN", so the column keeps a single
#datetime dtype instead of mixing date objects and strings, and the parsing itself is
#done in one vectorized pd.to_datetime call rather than one strptime call per row.
registration_raw = final1['registration_init_time']

#str(int(x)) strips a possible float representation (e.g. 20110911.0) before parsing
registration_text = registration_raw.map(lambda x: str(int(x)) if pd.notnull(x) else None)

final1['registration_init_time'] = pd.to_datetime(registration_text, format="%Y%m%d")



In [118]:
#Give the year and month of registration in two new columns.
#NOTE: the year column is named 'reg_name' (not 'reg_year'); the name is kept unchanged
#because it ends up in the exported final_members file.
registration_index = pd.DatetimeIndex(final1['registration_init_time'])

#Build the helper frame directly on final1's own index, so the join below matches rows
#by label and does not rely on final1 carrying the default 0..n-1 index.
#np.asarray makes the year/month values plain positional arrays.
date = pd.DataFrame(
    {
        'reg_name': np.asarray(registration_index.year),
        'reg_month': np.asarray(registration_index.month),
    },
    index=final1.index,
    columns=['reg_name', 'reg_month'],
)

#Drop variable registration_init_time in final1, as it is no longer needed,
#and join the two new columns
final1 = final1.drop('registration_init_time', axis=1).join(date)

final1.head()



                                           msno  bd  gender  registered_via  city_1  city_3  city_4  city_5  city_6  city_7    ...      city_15  city_16  city_17  city_18  city_19  city_20  city_21  \
0  Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=   0     NaN              11       1       0       0       0       0       0    ...            0        0        0        0        0        0        0   
1  +tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=   0     NaN               7       1       0       0       0       0       0    ...            0        0        0        0        0        0        0   
2  cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=   0     NaN              11       1       0       0       0       0       0    ...            0        0        0        0        0        0        0   
3  9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=   0     NaN              11       1       0       0       0       0       0    ...            0        0        0        0        0        0        

# Final clean-up before exporting the file to the data folder

In [119]:
#Remove bd, gender and registered_via to test whether the city encoding helps the algorithm perform
#(registration_init_time was already dropped when the year/month columns were created)
columns_to_remove = ['bd', 'gender', 'registered_via']
final1 = final1.drop(columns_to_remove, axis=1)
final1.head()

Unnamed: 0,msno,city_1,city_3,city_4,city_5,city_6,city_7,city_8,city_9,city_10,...,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22,reg_name,reg_month
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9


In [120]:
#Count the missing values in every column of final1
null_counts = final1.isnull().sum()
null_counts

msno         0
city_1       0
city_3       0
city_4       0
city_5       0
city_6       0
city_7       0
city_8       0
city_9       0
city_10      0
city_11      0
city_12      0
city_13      0
city_14      0
city_15      0
city_16      0
city_17      0
city_18      0
city_19      0
city_20      0
city_21      0
city_22      0
reg_name     0
reg_month    0
dtype: int64

In [121]:
#Write final1 as CSV into the data folder, without the row index; can take up to 1 min
output_path = 'data/final_members.csv'
final1.to_csv(output_path, index=False)
print("Done!")


Done!
