In [1]:
import re
from scipy import stats
import os
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np  # fundamental package for acientific computing with python
import matplotlib 
from matplotlib import pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

from plotly import tools

init_notebook_mode(connected=True)

import config as cfg
import utils

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 

## Read in data

In [2]:
print(os.listdir('./data'))

['口碑 路虎 总.xlsx', 'luhu_comments.xlsx', '路虎口碑所有用户.xlsx', '.DS_Store', '路虎口碑用户.xlsx', '路虎车商城.xlsx', 'luhu_koubei_user.xlsx', '路虎口碑用户-1.xlsx', '捷豹车商城.xlsx', '路虎口碑评论人.xlsx', '~$捷豹口碑用户.xlsx', '捷豹口碑用户.xlsx', '路虎帖子评论人.xlsx', 'luhu_comment_user.xlsx', '口碑 捷豹 评论.xlsx', 'jiebao_user_koubei.xlsx', 'luhu_all_user_0.xlsx', '.ipynb_checkpoints', '路虎口碑评论人-1.xlsx', 'jiebao_all_user_0.xlsx', 'jiebao_all_user_processed.xlsx', '捷豹口碑评论人.xlsx', '口碑 路虎 评论.xlsx', '~$路虎口碑所有用户.xlsx', 'jlr_all_user_processed.xlsx', '口碑 捷豹 总 252.xlsx', 'luhu_all_user_1.xlsx']


In [3]:
jiepao_path = './data/jiebao_all_user_0.xlsx'
luhu_path = './data/luhu_all_user_1.xlsx'
jiepao_df = pd.read_excel(jiepao_path)
luhu_df = pd.read_excel(luhu_path)

In [4]:
print(jiepao_df.keys() == luhu_df.keys())

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True]


In [5]:
jlr_df = jiepao_df.append(luhu_df)
print(jiepao_df.shape, luhu_df.shape, jlr_df.shape)

jlr_df.to_excel('jiebao_luhu_all_use.xlsx', encoding='utf-8')

(2981, 19) (2574, 19) (5555, 19)


### Frature engineering

In [6]:
print(jlr_df.keys())
jlr_df.head(1)

Index(['user_id', 'follow_count', 'fans_count', 'gender', 'birthday',
       'location', 'level', 'registration_time', 'properties', 'mileage',
       'post', 'cars', 'koubei_post', 'gas_mileage', 'car_friend_zone',
       'label', 'car_like', 'excellent_post_count', 'all_post_count'],
      dtype='object')


Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,post,cars,koubei_post,gas_mileage,car_friend_zone,label,car_like,excellent_post_count,all_post_count
0,oden123,0,0,man,NaT,江西 宜春,1,2018-03-26,普通用户,4430,,['捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版'],"[{'title': '敲黑板了！2018款xfl精英版2个月的使用感受，要考的', 'ur...","[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版', '...",,1,宝马5系,0,0


In [7]:
X = jlr_df.copy()
# X = jlr_df.drop(columns=['user_id',
#                          'birthday',
#                          'post', 
#                          'cars',
#                          'koubei_post',
#                          'car_friend_zone',
#                               ])
# Y = list(jlr_df['label'])

In [8]:
# X.location.unique()
# X.car_like.unique()
X.properties.unique()

array(['普通用户', '关禁闭', 'SLS赛威论坛版主', '捷豹F-PACE论坛版主', '编辑', '捷豹XF/XFL论坛版主',
       'YARiS L 致炫论坛版主', '宝马i3论坛版主', '大7 SUV论坛版主', '神行者2论坛版主'],
      dtype=object)

In [9]:
X['follow_count'] = X['follow_count'].astype('uint16')
X['fans_count'] = X['fans_count'].astype('uint16')

In [10]:
utils.catAndTrgt(df=X, col='follow_count', limit=20)

In [11]:
utils.catAndTrgt(df=X, col='fans_count', limit=20)

In [12]:
X['gender'] = X.apply(lambda row: utils.gender2bool(row), axis=1)
X['gender'] = X['gender'].astype('uint8')

In [13]:
utils.catAndTrgt(jlr_df, 'gender')

In [14]:
X['location'] = X.apply (lambda row: utils.loc2value(row), axis=1)
X['location'] = X['location'].astype('uint16')

In [15]:
utils.catAndTrgt(jlr_df, 'location',10)

In [17]:
X['level'] = X['level'].astype('uint8')
X.level.unique()

array([1, 2, 3, 4, 5, 6], dtype=uint64)

In [18]:
utils.catAndTrgt(jlr_df, 'level')

In [19]:
X['regis_year'] = X['registration_time'].dt.year.astype('uint16')
X['regis_month'] = X['registration_time'].dt.month.astype('uint8')
X['regis_day'] = X['registration_time'].dt.month.astype('uint8')

In [20]:
X['date'] = pd.to_datetime(X['registration_time'])    
X['date_delta'] = (X['date'] - X['date'].min())  / np.timedelta64(1,'D')

In [23]:
utils.catAndTrgt(X, 'date_delta', limit=10)

In [28]:
utils.catAndTrgt(X, 'registration_time', limit=100)

In [None]:
X['regis_time_relative'] = pd.Series(data=relative_time,)

In [None]:
# Group year and month as feature
# X['year_month'] = pd.to_datetime(X[['regis_day','regis_month','regis_year']])

In [None]:
X.regis_time_relative.iloc[0]

In [None]:
np.array(X['regis_time_relative'])/(10.**11)

In [29]:
X['properties'] = X.apply(lambda row: utils.property2value(row), axis=1)
X['properties'] = X['properties'].astype('uint8')
#new_df.head()

In [34]:
utils.catAndTrgt(jlr_df, 'properties')

In [35]:
X['car_like'] = X.apply(lambda row: utils.cars_like2value(row), axis=1)
X['car_like'] = X['car_like'].astype('uint16')

In [36]:
utils.catAndTrgt(jlr_df, 'car_like')

In [None]:
X['excellent_post_count'] = X['excellent_post_count'].astype('uint16')
X['all_post_count'] = X['all_post_count'].astype('uint16')

In [37]:
utils.catAndTrgt(jlr_df, 'excellent_post_count')

In [38]:
utils.catAndTrgt(jlr_df, 'all_post_count')

In [39]:
X['label'] = X['label'].astype('bool')

In [40]:
utils.exploreCat(jlr_df, 'label')

In [41]:
X.to_excel('./data/jlr_all_user_processed.xlsx', encoding='utf-8')

In [42]:
X.head()

Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,...,car_friend_zone,label,car_like,excellent_post_count,all_post_count,regis_year,regis_month,regis_day,date,date_delta
0,oden123,0,0,2,NaT,1,1,2018-03-26,1,4430,...,,True,1,0,0,2018,3,3,2018-03-26,4522.0
1,生活1934626,2,4,2,NaT,2,1,2012-05-07,1,4270,...,"[{'名称': '汽车之家甘肃论坛车友会', '人数': 672, '创建时间': '201...",True,2,0,0,2012,5,5,2012-05-07,2373.0
2,wangzi1125,0,1,2,NaT,3,1,2017-06-18,1,2090,...,,True,3,0,1,2017,6,6,2017-06-18,4241.0
3,房产专家谢广财,3,2,2,NaT,4,1,2014-02-27,1,1120,...,,True,4,0,0,2014,2,2,2014-02-27,3034.0
4,南宫晗笑,4,5,2,NaT,5,1,2017-07-03,1,740,...,,True,1,0,14,2017,7,7,2017-07-03,4256.0
