In [1]:
from scipy import stats
import os
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np  # fundamental package for acientific computing with python
import matplotlib 
from matplotlib import pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

%matplotlib inline

import config as cfg

from plotly import tools

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

init_notebook_mode(connected=True)

In [2]:
# The function to plot the distribution of the categorical values Horizontaly 
def bar_hor(df,  col, title, color, w=None, h=None, lm=0,  limit=100, return_trace=False, rev=False, xlb=False):
    cnt_srs = df[col].value_counts()
    yy = cnt_srs.head(limit).index[::-1]
    xx = cnt_srs.head(limit).values[::-1]
    if rev:
        yy = cnt_srs.tail(limit).index[::-1]
        xx = cnt_srs.tail(limit).values[::-1]
    if xlb:#????
        trace = go.Bar(y=xlb, x=xx,orientation='h', marker=dict(color=color))
    else:
        trace = go.Bar(y=yy, x=xx,orientation='h', marker=dict(color=color))
    if return_trace:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

# The function to get the distribution of the categories according to the target
#(target de dtype=bool? or np.int8?)
def gp(df, col, title):
    df0 = df[df['label']==0]
    df1 = df[df['label']==1]
    a1 = df1[col].value_counts()
    b1 = df0[col].value_counts()
    
    total = dict(df[col].value_counts())
    x0 = a1.index
    x1 = b1.index
    
    y0 = [float(x)*100/total[x0[i]] for i,x in enumerate(a1.values)]
    y1 = [float(x)*100/total[x1[i]] for i,x in enumerate(b1.values)]
    
    trace1 = go.Bar(x=x0, y=y0, name="Target : 0", marker=dict(color="#96D38C"))
    trace2 = go.Bar(x=x1, y=y1, name="Target : 1", marker=dict(color="#FEBFB3"))
    
    return trace1, trace2

def exploreCat(df, col):
    t = df[col].value_counts()
    labels = t.index 
    values = t.values
    colors = ["#96D38C",  "#FEBFB3"]
    trace  = go.Pie(labels=labels, values=values,
                   hoverinfo="all",textinfo='value',
                   textfont=dict(size=12), 
                   marker=dict(colors=colors,
                               line=dict(color='#fff',width=2)))
    layout = go.Layout(title=col, height=400)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)


In [3]:
# the relation between the categorical column and the target
def catAndTrgt(df, col):
    tr0 = bar_hor(df, col, "Distribution of "+col, "#f975ae", w=700, lm=100, return_trace=True)
    tr1, tr2 = gp(df, col, "Distribution of Target with "+col)
    
    fig = tools.make_subplots(rows=1, cols=3, print_grid=False, 
                             subplot_titles=[col+" Distribution", "% of target=0", "% of target=1"])
    fig.append_trace(tr0, 1, 1);
    fig.append_trace(tr1, 1, 2);
    fig.append_trace(tr2, 1, 3);
    fig['layout'].update(height=350, showlegend=False, margin=dict(l=50));
    iplot(fig);

In [4]:
merge_df = pd.read_excel("data/jiebao_all_user.xlsx")
print(merge_df.shape)
merge_df.head(2)

(2961, 18)


Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,post,cars,gas_mileage,car_friend_zone,label,car_like,excellent_post_count,all_post_count
0,oden123,0,0,man,NaT,江西 宜春,1,2018-03-26,普通用户,4430,[],['捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版'],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版', '...",[],1,宝马5系,0,0
1,生活1934626,2,4,man,NaT,甘肃 白银,1,2012-05-07,普通用户,4270,[],[],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 250PS 豪华版', '...","[{'名称': '汽车之家甘肃论坛车友会', '人数': 672, '创建时间': '201...",1,奥迪A6L,0,0


In [5]:
total = merge_df.isnull().sum().sort_values(ascending=False)
percentage = (merge_df.isnull().sum()/merge_df.isnull().count()*100).sort_values(ascending=False)
missing_use_luhu_data = pd.concat([total,percentage], axis=1,keys=['total', 'missing_percentage'])
missing_use_luhu_data.head(18)

Unnamed: 0,total,missing_percentage
birthday,2563,86.558595
car_like,8,0.270179
car_friend_zone,2,0.067545
gas_mileage,2,0.067545
cars,2,0.067545
all_post_count,0,0.0
level,0,0.0
follow_count,0,0.0
fans_count,0,0.0
gender,0,0.0


In [6]:
exploreCat(merge_df, "label")

In [7]:
bar_hor(merge_df, "label", "Distribution of label", color=['#44ff54','#ff4444'], h=350, w=600,
       lm=200, xlb=['label : 0', "label : 1"])

In [8]:
exploreCat(merge_df, "gender")

In [9]:
catAndTrgt(merge_df, "level")

In [10]:
catAndTrgt(merge_df, "mileage")

In [11]:
catAndTrgt(merge_df, "all_post_count")

In [12]:
catAndTrgt(merge_df, "excellent_post_count")

In [13]:
catAndTrgt(merge_df, "car_like")

In [14]:
catAndTrgt(merge_df, "location")

In [15]:
merge_df.head(5)

Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,post,cars,gas_mileage,car_friend_zone,label,car_like,excellent_post_count,all_post_count
0,oden123,0,0,man,NaT,江西 宜春,1,2018-03-26,普通用户,4430,[],['捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版'],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版', '...",[],1,宝马5系,0,0
1,生活1934626,2,4,man,NaT,甘肃 白银,1,2012-05-07,普通用户,4270,[],[],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 250PS 豪华版', '...","[{'名称': '汽车之家甘肃论坛车友会', '人数': 672, '创建时间': '201...",1,奥迪A6L,0,0
2,wangzi1125,0,1,man,NaT,广东 东莞,1,2017-06-18,普通用户,2090,"[{'标题': '请问有混动MKZ的车主吗', 'url': '//club.autohom...",[],[],[],1,帝豪GS,0,1
3,房产专家谢广财,3,2,man,NaT,广东 广州,1,2014-02-27,普通用户,1120,[],[],[],[],1,比亚迪S7,0,0
4,南宫晗笑,4,5,man,NaT,北京,1,2017-07-03,普通用户,740,"[{'标题': '5000公里尊享数据', 'url': '//club.autohome....",[],[],[],1,宝马5系,0,14


In [16]:
merge_df.dtypes

user_id                         object
follow_count                     int64
fans_count                       int64
gender                          object
birthday                datetime64[ns]
location                        object
level                            int64
registration_time       datetime64[ns]
properties                      object
mileage                          int64
post                            object
cars                            object
gas_mileage                     object
car_friend_zone                 object
label                            int64
car_like                        object
excellent_post_count             int64
all_post_count                   int64
dtype: object

In [17]:
# key 'user_id',
# 离散值 'car_like'  'location', 'level', #
# binary 'label', 'gender', 'properties',
# 连续值 'follow_count', 'fans_count','mileage','gas_mileage',
# Time 'registration_time', 'birthday',
# 其他'post','cars',car_friend_zone',

In [18]:
merge_df['gender'].unique()

array(['man', 'woman'], dtype=object)

In [19]:
# creat new df, copy the data we need in 
new_df = pd.DataFrame() 

In [20]:
new_df['user_id'] = merge_df['user_id']
new_df['follow_count']  = merge_df['follow_count']
new_df['fans_count'] =  merge_df['fans_count']
#new_df.head()

In [21]:
new_df['follow_count'].describe()

count    2961.000000
mean       10.340088
std        24.923454
min         0.000000
25%         0.000000
50%         2.000000
75%         9.000000
max       276.000000
Name: follow_count, dtype: float64

In [22]:
new_df['fans_count'].describe()

count    2961.000000
mean       13.227288
std        32.096518
min         0.000000
25%         1.000000
50%         3.000000
75%        12.000000
max       561.000000
Name: fans_count, dtype: float64

In [23]:
new_df['follow_count']=new_df['follow_count'].astype('uint16')
new_df['fans_count']=new_df['fans_count'].astype('uint16')

In [24]:
def gender2bool(row):
    import config as cfg
    return cfg._C.dict_gender[row['gender']] + 1

In [25]:
new_df['gender'] = merge_df['gender']
new_df['gender'] = new_df.apply (lambda row: gender2bool(row), axis=1)
new_df['gender'] = new_df['gender'].astype('uint8')

In [26]:
locations = merge_df['location'].unique()

In [27]:
def loc2value(row):
    import config as cfg
    locs = cfg._C.LOCATIONS 
    len_loc = len(locs)
    dict_loc = dict(zip(locs, range(len_loc)))
    return dict_loc[row['location']] + 1

In [28]:
new_df['location'] = merge_df['location']
new_df['location'] = new_df.apply (lambda row: loc2value(row), axis=1)
new_df['location'] = new_df['location'].astype('uint8')
#new_df.head()

In [29]:
new_df['level'] = merge_df['level'].astype('uint8')
new_df['level'].unique()

array([1, 2, 3, 4, 5], dtype=uint64)

In [30]:
def extract_year(row):
    return row['regis_year'].dt.year.astype('uint16')
def extract_month(row):
    return row['regis_month'].dt.month.astype('uint8')
def extract_day(row):
    return row['regis_day'].dt.day.astype('uint8')

In [31]:
new_df['regis_year'] = merge_df['registration_time'].dt.year.astype('uint16')
new_df['regis_month'] = merge_df['registration_time'].dt.month.astype('uint8')
new_df['regis_day'] = merge_df['registration_time'].dt.month.astype('uint8')
new_df.head()

Unnamed: 0,user_id,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day
0,oden123,0,0,2,1,1,2018,3,3
1,生活1934626,2,4,2,2,1,2012,5,5
2,wangzi1125,0,1,2,3,1,2017,6,6
3,房产专家谢广财,3,2,2,4,1,2014,2,2
4,南宫晗笑,4,5,2,5,1,2017,7,7


In [32]:
merge_df['properties'].unique()

array(['普通用户', '关禁闭', 'SLS赛威论坛版主', '捷豹F-PACE论坛版主', '编辑', '捷豹XF/XFL论坛版主'],
      dtype=object)

In [33]:
def property2value(row):
    import config as cfg
    properties = cfg._C.PROPERTIES
    len_prop = len(properties)
    dict_prop = dict(zip(properties, range(len_prop)))
    return dict_prop[row['properties']] + 1

In [34]:
new_df['properties'] = merge_df['properties']
new_df['properties'] = new_df.apply(lambda row: property2value(row), axis=1)
new_df['properties'] = new_df['properties'].astype('uint8')
#new_df.head()

In [50]:
new_df.dtypes

user_id          object
follow_count     uint16
fans_count       uint16
gender            uint8
location          uint8
level             uint8
regis_year       uint16
regis_month       uint8
regis_day         uint8
properties        uint8
mileage         float32
car_like         uint16
dtype: object

In [36]:
new_df['mileage'] = merge_df['mileage'].astype('float32')

In [44]:
new_df['car_like'] = merge_df['car_like']
# new_df['car_like'].unique()

In [45]:
def cars_like2value(row):
    import config as cfg
    cars = cfg._C.CARS_LIKE
    len_cars = len(cars)
    dict_cars = dict(zip(cars, range(len_cars)))
    if str(row['car_like']) == 'nan':
        return 0
    else:
        return dict_cars[row['car_like']] + 1 

In [48]:
new_df['car_like'] = new_df.apply(lambda row: cars_like2value(row), axis=1)
#new_df.head()

In [49]:
new_df['car_like'] = new_df['car_like'].astype('uint16')

In [51]:
merge_df['excellent_post_count'].describe()

count    2961.000000
mean        0.168524
std         0.871645
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        27.000000
Name: excellent_post_count, dtype: float64

In [52]:
merge_df['all_post_count'].describe()

count    2961.000000
mean        9.336711
std        48.654599
min         0.000000
25%         0.000000
50%         1.000000
75%         7.000000
max      2391.000000
Name: all_post_count, dtype: float64

In [53]:
new_df['excellent_post_count'] = merge_df['excellent_post_count'].astype('uint16')
new_df['all_post_count'] = merge_df['all_post_count'].astype('uint16')

In [55]:
new_df['label'] = merge_df['label'].astype('bool')

In [56]:
new_df.head()

Unnamed: 0,user_id,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day,properties,mileage,car_like,excellent_post_count,all_post_count,label
0,oden123,0,0,2,1,1,2018,3,3,1,4430.0,1,0,0,True
1,生活1934626,2,4,2,2,1,2012,5,5,1,4270.0,2,0,0,True
2,wangzi1125,0,1,2,3,1,2017,6,6,1,2090.0,3,0,1,True
3,房产专家谢广财,3,2,2,4,1,2014,2,2,1,1120.0,4,0,0,True
4,南宫晗笑,4,5,2,5,1,2017,7,7,1,740.0,1,0,14,True


In [57]:
new_df.dtypes

user_id                  object
follow_count             uint16
fans_count               uint16
gender                    uint8
location                  uint8
level                     uint8
regis_year               uint16
regis_month               uint8
regis_day                 uint8
properties                uint8
mileage                 float32
car_like                 uint16
excellent_post_count     uint16
all_post_count           uint16
label                      bool
dtype: object

In [60]:
new_df.to_excel('data/jiebao_all_user_processed.xlsx', encoding='utf-8')

In [61]:
new_df.to_csv('data/jiebao_all_user_processed.csv', encoding='utf-8')