In [2]:
#!/usr/bin/python3
# -*-coding:utf-8
'''
Created on Fri Dec 1 22:22:35 2017

@author: Ray

'''
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import utils # written by author
from glob import glob
from datetime import datetime, timedelta
import multiprocessing as mp
import gc # for automatic releasing memory

##################################################
# Load the members table and shrink its memory footprint
##################################################
# The path is relative to the notebook's working directory.
members = pd.read_csv('../../input/members_v3.csv')

#==============================================================================
print('reduce memory')
#==============================================================================
# utils is a project-local helper module. The return value is ignored, and the
# members.info() outputs further down show narrower dtypes after such a call, so
# the helper presumably downcasts columns in place -- confirm in utils.
utils.reduce_memory(members)


##################################################
# Gender
##################################################
def gender(x):
    """Encode a raw gender label as an integer code.

    Returns 0 for 'female', 1 for 'male', and 2 for any other value
    (2 is the code used for missing values).
    """
    # The position of the label in this tuple is its code.
    for code, label in enumerate(('female', 'male')):
        if x == label:
            return code
    return 2  # 2: represents missing values
# Replace the raw gender labels with the integer codes returned by gender().
members['gender'] = members['gender'].map(gender)
##################################################
# city
##################################################
# Missing city values are stored as 2; every other value is cast to int.
members['city'] = members['city'].map(
    lambda raw_city: 2 if pd.isnull(raw_city) else int(raw_city)
)

##################################################
# registered_via
##################################################
# Re-encode every distinct registration channel as a 1-based integer, numbered in
# order of first appearance in the column. The codes therefore depend on the row
# order of the input file.
registered_via_dict = {
    original_value: code
    for code, original_value in enumerate(
        members['registered_via'].unique().tolist(), start=1
    )
}
# Mapping recorded by the author for their run
# (key: original registered_via, value: encoded value):
#   -1: 18,
#    1: 15,
#    2: 10,
#    3: 4,
#    4: 6,
#    5: 9,
#    6: 13,
#    7: 2,
#    8: 12,
#    9: 3,
#   10: 17,
#   11: 1,
#   13: 7,
#   14: 14,
#   16: 5,
#   17: 8,
#   18: 16,
#   19: 11
# The author notes that encoded value 18 (original -1) represents NaN. No
# NaN -> -1 substitution is visible in this notebook; presumably it happens
# upstream -- TODO confirm.

# Every value in the column is a key of registered_via_dict by construction, so a
# dict-based Series.map performs the same lookup without a per-row Python lambda.
members['registered_via'] = members['registered_via'].map(registered_via_dict)

##################################################
# bd cleaning
##################################################

# Author's note: missing values (12%) and outliers (about 49%).
# bd looks like an age in years, judging by the 1..100 bounds -- confirm against
# the data dictionary. Only values strictly between 1 and 100 are kept; NaN
# compares False on both sides, so missing values are treated like outliers.
bd_as_float = members['bd'].astype(float)
is_plausible_bd = (bd_as_float > 1) & (bd_as_float < 100)

# The fill value is the mean of the kept values (each truncated to an integer
# first), itself truncated to an integer.
mean_bd = int(bd_as_float[is_plausible_bd].astype('int64').mean())

# Keep plausible values and replace everything else with the mean, in one
# vectorized pass instead of several row-wise apply() calls and a sentinel value.
members['bd'] = bd_as_float.where(is_plausible_bd, mean_bd).astype('int64')
del bd_as_float, is_plausible_bd

reduce memory


100%|██████████| 6/6 [00:00<00:00, 1147.66it/s]


In [3]:
# Show column dtypes and memory usage after the cleaning steps above.
members.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6769473 entries, 0 to 6769472
Data columns (total 6 columns):
msno                      object
city                      int64
bd                        int64
gender                    int64
registered_via            int64
registration_init_time    int32
dtypes: int32(1), int64(4), object(1)
memory usage: 284.1+ MB


In [4]:
#==============================================================================
print('reduce memory')
#==============================================================================
# The cleaning steps rebuilt several columns as int64 (see the preceding
# members.info() output), so the project-local helper is run again; the
# following members.info() output shows those columns as int8 afterwards.
utils.reduce_memory(members)

reduce memory


100%|██████████| 6/6 [00:00<00:00, 628.41it/s]


In [5]:
# Check the dtypes and memory usage again after the second reduce_memory call.
members.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6769473 entries, 0 to 6769472
Data columns (total 6 columns):
msno                      object
city                      int8
bd                        int8
gender                    int8
registered_via            int8
registration_init_time    int32
dtypes: int32(1), int8(4), object(1)
memory usage: 103.3+ MB


In [6]:
# Binary flags for the ORIGINAL registration channels 4 and 7.
# members.registered_via holds re-encoded values at this point, and the encoding
# depends on the row order of the input file (4 -> 6 and 7 -> 2 in the author's
# run). Looking the codes up in registered_via_dict keeps the flags correct when
# the encoding changes. If a channel is absent from the dict, .get() returns None
# and the comparison is False everywhere, so the flag is all zeros.
# The column names keep the existing "registerd" spelling so that any code
# reading these columns still finds them.
members['is_registerd_via_4'] = (
    members['registered_via'] == registered_via_dict.get(4)
).astype('int64')
members['is_registerd_via_7'] = (
    members['registered_via'] == registered_via_dict.get(7)
).astype('int64')


In [7]:
# Preview the first 20 rows, including the two new registered_via flag columns.
members.head(n = 20)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_registerd_via_4,is_registerd_via_7
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,29,2,1,20110911,0,0
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,29,2,2,20110914,0,1
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,29,2,1,20110915,0,0
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,29,2,1,20110915,0,0
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,0,3,20110915,0,0
5,yLkV2gbZ4GLFwqTOXLVHz0VGrMYcgBGgKZ3kj9RiYu8=,4,30,1,3,20110916,0,0
6,jNCGK78YkTyId3H3wFavcBLDmz7pfqlvCfUKf4G1Lw4=,1,29,2,2,20110916,0,1
7,WH5Jq4mgtfUFXh2yz+HrcTXKS4Oess4k4W3qKolAeb0=,5,34,1,3,20110916,0,0
8,tKmbR4X5VXjHmxERrckawEMZ4znVy1lAQIR1vV5rdNk=,5,19,1,3,20110917,0,0
9,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,13,63,1,3,20110918,0,0


In [9]:
def bd_zone(x):
    """Bucket a bd value into one of four zones.

    Returns:
        1 when x <= 18       (younger than university students)
        2 when 18 < x <= 22  (university students)
        3 when 22 < x < 35   (working adults)
        4 otherwise          (35 and above; also any value for which every
                              comparison is False, such as NaN)
    """
    if x <= 18:
        return 1
    if x <= 22:
        return 2
    if x < 35:
        return 3
    return 4
		
# Derive the zone (1-4) of every member from the cleaned bd column.
members['bd_zone'] = members['bd'].map(bd_zone)


In [10]:
# Preview only the first rows instead of asking the notebook to render the
# whole multi-million-row frame; this is enough to check the new bd_zone column.
members.head(n=10)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_registerd_via_4,is_registerd_via_7,bd_zone
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,29,2,1,20110911,0,0,3
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,29,2,2,20110914,0,1,3
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,29,2,1,20110915,0,0,3
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,29,2,1,20110915,0,0,3
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,0,3,20110915,0,0,3
5,yLkV2gbZ4GLFwqTOXLVHz0VGrMYcgBGgKZ3kj9RiYu8=,4,30,1,3,20110916,0,0,3
6,jNCGK78YkTyId3H3wFavcBLDmz7pfqlvCfUKf4G1Lw4=,1,29,2,2,20110916,0,1,3
7,WH5Jq4mgtfUFXh2yz+HrcTXKS4Oess4k4W3qKolAeb0=,5,34,1,3,20110916,0,0,3
8,tKmbR4X5VXjHmxERrckawEMZ4znVy1lAQIR1vV5rdNk=,5,19,1,3,20110917,0,0,2
9,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,13,63,1,3,20110918,0,0,4


In [11]:
# Binary flags for two specific city codes, computed with a vectorized comparison
# rather than a per-row Python lambda.
members['is_living_in_city-21'] = (members['city'] == 21).astype('int64')
# City 2 is the value substituted for missing cities in the city-cleaning step,
# so this flag also marks members whose city was missing. Whether the raw data
# contains a genuine city 2 cannot be told from this notebook -- TODO confirm.
members['is_living_in_city-2'] = (members['city'] == 2).astype('int64')



In [13]:
# Spot-check the rows with city == 21; is_living_in_city-21 should be 1 for all of them.
members[members.city == 21]

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_registerd_via_4,is_registerd_via_7,bd_zone,is_living_in_city-21,is_living_in_city-2
115,nv5dJzjDQjzpX1jiw9y4dF5vJ1w5dwj/BWN24y1Qudc=,21,20,0,4,20141023,0,0,2,1,0
276,Vf/uET/p/GayO3URHAzwCEdwIkW3x4vaA/XBL3CL1qM=,21,33,1,4,20141101,0,0,3,1,0
327,4RYyC9Ba0lfxI3xhN8JF0huBhymQts4vh72I6taMzsI=,21,18,0,4,20141104,0,0,1,1,0
400,qqMaRJurkfuSjLz+T3sigPgUexsEV5qPdnD7DIUs2wg=,21,29,0,4,20141109,0,0,3,1,0
530,O2Q9D3n3FbYyMdfj9O6cbyXLiP2R6c7VNVnA89EzuB4=,21,19,1,6,20161226,1,0,2,1,0
1388,dkcUFJdIPesnMJobPklGy5qD+TCsY4ezuGb3tbDO06k=,21,33,0,3,20060714,0,0,3,1,0
1428,6S5k10UBEzidKpUdJpwCVurEY3B3iRtodzSUT1z7nzY=,21,31,1,3,20061014,0,0,3,1,0
1494,dQvFM86vRVvPZItdmYKKMWM+HDqaxvaEYoBhEANS9c8=,21,29,1,3,20070209,0,0,3,1,0
1500,3kE7mL1nlSTuIhBmXPOrg1/qsTZKR3CSDM0owe/8yIU=,21,25,0,3,20070211,0,0,3,1,0
1607,IzbTmOOBTEs0VIqJ5cPESv+EpEL8faokkdSWNfk6fKI=,21,20,0,3,20070421,0,0,2,1,0


In [23]:
# Lookup table from city code to zone, built once instead of constructing three
# sets on every call (the function is applied to every row of members).
# NOTE(review): the original zone-1 list contained 17 twice. Whether a different
# city was intended cannot be determined from this notebook -- confirm.
_CITY_ZONE_BY_CITY = {
    **dict.fromkeys((1, 2, 16, 17, 20), 1),
    **dict.fromkeys((7, 9, 11, 13, 14, 18), 2),
    **dict.fromkeys((5, 6, 10, 12, 15, 22), 3),
}


def city_zone(x):
    """Map a city code to its zone.

    Returns:
        1 for cities 1, 2, 16, 17, 20 (the zone least likely to churn),
        2 for cities 7, 9, 11, 13, 14, 18,
        3 for cities 5, 6, 10, 12, 15, 22,
        4 for every other value (the zone most likely to churn).
    """
    return _CITY_ZONE_BY_CITY.get(x, 4)

In [24]:
# Assign every member the zone (1-4) of their city.
members['city_zone'] = members['city'].map(city_zone)


In [22]:
# Scratch check of the zone-1 city list used in city_zone: the duplicated 17
# collapses, so the set holds five cities rather than six.
set([2,1,20,16,17,17])

{1, 2, 16, 17, 20}

In [25]:
#==============================================================================
print('reduce memory')
#==============================================================================
# Run the project-local helper once more on the frame with the new feature
# columns; the following members.info() output lists them as int8.
utils.reduce_memory(members)
# Ask the garbage collector to run; the number it returns is the cell's output.
gc.collect()

reduce memory


100%|██████████| 12/12 [00:00<00:00, 492.11it/s]


0

In [26]:
# Final check of the dtypes and memory usage of the full feature table.
members.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6769473 entries, 0 to 6769472
Data columns (total 12 columns):
msno                      object
city                      int8
bd                        int8
gender                    int8
registered_via            int8
registration_init_time    int32
is_registerd_via_4        int8
is_registerd_via_7        int8
bd_zone                   int8
is_living_in_city-21      int8
is_living_in_city-2       int8
city_zone                 int8
dtypes: int32(1), int8(10), object(1)
memory usage: 142.0+ MB
