In [1]:
from scipy import stats
import os
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np  # fundamental package for scientific computing with python
import matplotlib 
from matplotlib import pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
# Module-level seaborn palette. Note that bar_hor's `color` parameter shadows
# this name inside that function.
color = sns.color_palette()
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
%matplotlib inline

from plotly import tools

init_notebook_mode(connected=True)

In [2]:
# Plot (or build) a horizontal bar chart of a column's value counts.
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb=False):
    """Show the value-count distribution of ``df[col]`` as horizontal bars.

    Parameters
    ----------
    df : object whose ``df[col].value_counts()`` yields the counts to plot.
    col : name of the column to count.
    title : figure title (only used when the figure is displayed here).
    color : forwarded to the bars as ``marker=dict(color=color)``.
    w, h : figure width and height forwarded to the layout.
    lm : left margin forwarded to the layout.
    limit : maximum number of categories kept.
    return_trace : when true, return the ``go.Bar`` trace instead of
        displaying a figure.
    rev : when true, keep the last ``limit`` entries of ``value_counts()``
        (the least frequent ones) instead of the first ``limit``.
    xlb : optional sequence of bar labels used in place of the category
        values. Labels are applied positionally, in the same order as the
        plotted bars (reversed ``value_counts()`` order). A falsy or empty
        value means "use the category values".

    Returns
    -------
    The ``go.Bar`` trace when ``return_trace`` is true, otherwise ``None``
    (the figure is displayed through ``iplot``).
    """
    counts = df[col].value_counts()
    # Slice only the end that is actually plotted.
    kept = counts.tail(limit) if rev else counts.head(limit)
    # value_counts() is sorted by descending count; the plot uses the reverse.
    yy = kept.index[::-1]
    xx = kept.values[::-1]

    # A bare `if xlb:` raises "truth value is ambiguous" for NumPy arrays,
    # pandas Index/Series, etc. Test sized objects by length and everything
    # else (the False default, None, ...) by plain truthiness.
    has_custom_labels = len(xlb) > 0 if hasattr(xlb, "__len__") else bool(xlb)
    bar_labels = xlb if has_custom_labels else yy

    trace = go.Bar(y=bar_labels, x=xx, orientation='h', marker=dict(color=color))
    if return_trace:
        return trace

    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

# Per-category share of each label value, returned as two bar traces.
# (Open question kept from the original author: is the target dtype bool or
#  np.int8? The `== 0` / `== 1` comparisons below should select the same rows
#  for either - TODO confirm against the real column dtype.)
def gp(df, col, title):
    """Build bar traces with the percentage of each ``col`` category per label.

    For every category of ``df[col]`` the plotted value is
    ``rows with that category and that label / all rows with that category * 100``.

    Parameters
    ----------
    df : DataFrame containing a ``'label'`` column and the column ``col``.
    col : name of the categorical column to break down.
    title : unused; kept so existing callers keep working.

    Returns
    -------
    (trace1, trace2) : ``go.Bar`` for the ``label == 0`` rows (named
        "Target : 0") followed by ``go.Bar`` for the ``label == 1`` rows
        (named "Target : 1").
    """
    total = df[col].value_counts().to_dict()

    def _share(label_value):
        # Categories present for this label, with their percentage of all rows
        # in the same category.
        counts = df.loc[df['label'] == label_value, col].value_counts()
        pct = [float(n) * 100 / total[cat] for cat, n in zip(counts.index, counts.values)]
        return counts.index, pct

    # The first trace must be built from label == 0 and the second from
    # label == 1 so the data agrees with the trace names (and with the
    # "% of target=0" / "% of target=1" panel titles used by catAndTrgt).
    x0, y0 = _share(0)
    x1, y1 = _share(1)

    trace1 = go.Bar(x=x0, y=y0, name="Target : 0", marker=dict(color="#96D38C"))
    trace2 = go.Bar(x=x1, y=y1, name="Target : 1", marker=dict(color="#FEBFB3"))

    return trace1, trace2

def exploreCat(df, col):
    """Display a pie chart of the value counts of ``df[col]``.

    Each slice shows its count (``textinfo='value'``); the figure is titled
    with the column name and rendered through ``iplot``. Returns ``None``.
    """
    counts = df[col].value_counts()

    # Two explicit slice colors and a white outline between slices.
    slice_style = {
        "colors": ["#96D38C", "#FEBFB3"],
        "line": {"color": '#fff', "width": 2},
    }
    pie = go.Pie(
        labels=counts.index,
        values=counts.values,
        hoverinfo="all",
        textinfo='value',
        textfont={"size": 12},
        marker=slice_style,
    )

    iplot(go.Figure(data=[pie], layout=go.Layout(title=col, height=400)))

# Relation between a categorical column and the target, as a 1x3 figure.
def catAndTrgt(df, col):
    """Display three side-by-side panels for ``df[col]``.

    Panel 1 is the trace returned by ``bar_hor`` (value counts of the column).
    Panels 2 and 3 are the two traces returned by ``gp``, in the order it
    returns them, shown under the titles "% of target=0" and "% of target=1".
    The figure is rendered through ``iplot``; nothing is returned.
    """
    dist_trace = bar_hor(df, col, "Distribution of "+col, "#f975ae", w=700, lm=100, return_trace=True)
    first_pct, second_pct = gp(df, col, "Distribution of Target with "+col)

    panel_titles = [col+" Distribution", "% of target=0", "% of target=1"]
    fig = tools.make_subplots(rows=1, cols=3, print_grid=False,
                              subplot_titles=panel_titles)

    # One trace per subplot column, all in the single row.
    for column_no, trace in enumerate((dist_trace, first_pct, second_pct), start=1):
        fig.append_trace(trace, 1, column_no)

    fig['layout'].update(height=350, showlegend=False, margin={"l": 50})
    iplot(fig)

In [3]:
# Load the user table from the Excel workbook (path is relative to the
# notebook's working directory).
merge_df = pd.read_excel("data/jiebao口碑所有用户.xlsx")
print(merge_df.shape)  # (rows, columns)
merge_df.head(2)  # preview the first two rows

(2926, 18)


Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,post,cars,gas_mileage,car_friend_zone,label,car_like,excellent_post_count,all_post_count
0,oden123,0,0,man,NaT,江西 宜春,1,2018-03-26,普通用户,4430,[],['捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版'],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版', '...",[],1,宝马5系,0,0
1,生活1934626,2,4,man,NaT,甘肃 白银,1,2012-05-07,普通用户,4270,[],[],"[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 250PS 豪华版', '...","[{'名称': '汽车之家甘肃论坛车友会', '人数': 672, '创建时间': '201...",1,奥迪A6L,0,0


In [5]:
# Per-column missing-value summary: absolute null count and percentage.
total = merge_df.isnull().sum().sort_values(ascending=False)
# isnull().count() is the number of rows per column, so this is the share of
# nulls in percent.
percentage = (merge_df.isnull().sum()/merge_df.isnull().count()*100).sort_values(ascending=False)
# NOTE(review): the variable name says "luhu" while the workbook loaded into
# merge_df is named "jiebao" - looks like a leftover name; confirm before
# renaming, later cells may reference it.
missing_use_luhu_data = pd.concat([total,percentage], axis=1,keys=['total', 'missing_percentage'])
missing_use_luhu_data.head(18)

Unnamed: 0,total,missing_percentage
birthday,2531,86.500342
car_like,8,0.273411
car_friend_zone,2,0.068353
gas_mileage,2,0.068353
cars,2,0.068353
all_post_count,0,0.0
level,0,0.0
follow_count,0,0.0
fans_count,0,0.0
gender,0,0.0


In [14]:
# Pie chart of how many rows carry each label value.
# NOTE(review): this cell's execution count (14) is out of sequence with the
# cells around it; re-run the notebook top to bottom before sharing.
exploreCat(merge_df, "label")

In [6]:
# bar_hor draws the bars in reversed value_counts() order, i.e. ordered by
# frequency, not by label value. Derive the tick labels and colors from that
# same order so "label : 0" / "label : 1" (and their colors) always sit on the
# matching bar instead of being assigned positionally.
label_order = merge_df["label"].value_counts().index[::-1]
label_colors = {0: '#44ff54', 1: '#ff4444'}
bar_hor(merge_df, "label", "Distribution of label",
        color=[label_colors[value] for value in label_order], h=350, w=600,
        lm=200, xlb=["label : {}".format(value) for value in label_order])

In [7]:
# Pie chart of the row count per gender value.
exploreCat(merge_df, "gender")

In [8]:
# Value counts of `level` next to its per-label percentage breakdown.
catAndTrgt(merge_df, "level")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




In [9]:
# Value counts of `mileage` next to its per-label percentage breakdown.
# NOTE(review): mileage looks numeric in the preview (4430, 4270), so every
# distinct value becomes its own category here - consider binning first.
catAndTrgt(merge_df, "mileage")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




In [10]:
# Value counts of `all_post_count` next to its per-label percentage breakdown.
# NOTE(review): this is a count column, so each distinct count is treated as a
# separate category.
catAndTrgt(merge_df, "all_post_count")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




In [11]:
# Value counts of `excellent_post_count` next to its per-label percentage
# breakdown (each distinct count is treated as a separate category).
catAndTrgt(merge_df, "excellent_post_count")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




In [12]:
# Value counts of `car_like` next to its per-label percentage breakdown.
# NOTE(review): the missing-value summary reports 8 nulls in car_like;
# value_counts() leaves NaN out by default, so those rows are not plotted.
catAndTrgt(merge_df, "car_like")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




In [13]:
# Value counts of `location` next to its per-label percentage breakdown.
# NOTE(review): location values in the preview are "province city" strings, so
# there may be many categories. bar_hor caps the first panel at its default
# limit=100, but gp applies no limit to the two percentage panels.
catAndTrgt(merge_df, "location")


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.


