In [34]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import config as cfg
import utils

%matplotlib inline 

## Read in data

In [35]:
os.listdir('./data')

['口碑 路虎 总.xlsx',
 'luhu_comments.xlsx',
 '路虎口碑所有用户.xlsx',
 '.DS_Store',
 '路虎口碑用户.xlsx',
 '路虎车商城.xlsx',
 'luhu_koubei_user.xlsx',
 '路虎口碑用户-1.xlsx',
 '捷豹车商城.xlsx',
 '路虎口碑评论人.xlsx',
 '~$捷豹口碑用户.xlsx',
 '捷豹口碑用户.xlsx',
 '路虎帖子评论人.xlsx',
 'luhu_comment_user.xlsx',
 '口碑 捷豹 评论.xlsx',
 'jiebao_user_koubei.xlsx',
 'luhu_all_user_0.xlsx',
 '.ipynb_checkpoints',
 '路虎口碑评论人-1.xlsx',
 'jiebao_all_user_0.xlsx',
 'jiebao_all_user_processed.xlsx',
 '捷豹口碑评论人.xlsx',
 '口碑 路虎 评论.xlsx',
 '~$路虎口碑所有用户.xlsx',
 'jlr_all_user_processed.xlsx',
 '口碑 捷豹 总 252.xlsx',
 'luhu_all_user_1.xlsx']

In [36]:
jiepao_path = './data/jiebao_all_user_0.xlsx'
luhu_path = './data/luhu_all_user_1.xlsx'
jiepao_df = pd.read_excel(jiepao_path)
luhu_df = pd.read_excel(luhu_path)

In [37]:
print(jiepao_df.keys() == luhu_df.keys())

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True]


In [38]:
jlr_df = jiepao_df.append(luhu_df)
print(jiepao_df.shape, luhu_df.shape, jlr_df.shape)

jlr_df.to_excel('jiebao_luhu_all_use.xlsx', encoding='utf-8')

(2981, 19) (2574, 19) (5555, 19)


### Frature engineering

In [39]:
print(jlr_df.keys())
jlr_df.head(1)

Index(['user_id', 'follow_count', 'fans_count', 'gender', 'birthday',
       'location', 'level', 'registration_time', 'properties', 'mileage',
       'post', 'cars', 'koubei_post', 'gas_mileage', 'car_friend_zone',
       'label', 'car_like', 'excellent_post_count', 'all_post_count'],
      dtype='object')


Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,post,cars,koubei_post,gas_mileage,car_friend_zone,label,car_like,excellent_post_count,all_post_count
0,oden123,0,0,man,NaT,江西 宜春,1,2018-03-26,普通用户,4430,,['捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版'],"[{'title': '敲黑板了！2018款xfl精英版2个月的使用感受，要考的', 'ur...","[{'车型': '捷豹 捷豹XFL 2018款 XFL 2.0T 200PS 精英版', '...",,1,宝马5系,0,0


In [40]:
X = jlr_df.copy()
# X = jlr_df.drop(columns=['user_id',
#                          'birthday',
#                          'post', 
#                          'cars',
#                          'koubei_post',
#                          'car_friend_zone',
#                               ])
# Y = list(jlr_df['label'])

In [41]:
# X.location.unique()
# X.car_like.unique()
X.properties.unique()

array(['普通用户', '关禁闭', 'SLS赛威论坛版主', '捷豹F-PACE论坛版主', '编辑', '捷豹XF/XFL论坛版主',
       'YARiS L 致炫论坛版主', '宝马i3论坛版主', '大7 SUV论坛版主', '神行者2论坛版主'],
      dtype=object)

In [42]:
X['follow_count'] = X['follow_count'].astype('uint16')
X['fans_count'] = X['fans_count'].astype('uint16')

In [43]:
X['gender'] = X.apply (lambda row: utils.gender2bool(row), axis=1)
X['gender'] = X['gender'].astype('uint8')

In [44]:
X['location'] = X.apply (lambda row: utils.loc2value(row), axis=1)
X['location'] = X['location'].astype('uint16')

In [45]:
X['level'] = X['level'].astype('uint8')
X.level.unique()

array([1, 2, 3, 4, 5, 6], dtype=uint64)

In [46]:
X['regis_year'] = X['registration_time'].dt.year.astype('uint16')
X['regis_month'] = X['registration_time'].dt.month.astype('uint8')
X['regis_day'] = X['registration_time'].dt.month.astype('uint8')

In [56]:
X['date'] = pd.to_datetime(X['registration_time'])    
X['date_delta'] = (X['date'] - X['date'].min())  / np.timedelta64(1,'D')

In [57]:
a = np.array(['2000-01-01'], dtype='M8[D]')
print(a)
b = np.array(X.registration_time, dtype='M8[D]')
print(b[0])
relative_time = b - a
print(relative_time[:3])

['2000-01-01']
2018-03-26
[6659 4510 6378]


In [58]:
X['regis_time_relative'] = pd.Series(data=relative_time,)

In [60]:
# Group year and month as feature
# X['year_month'] = pd.to_datetime(X[['regis_day','regis_month','regis_year']])

In [61]:
X.regis_time_relative.iloc[0]

Timedelta('6659 days 00:00:00')

In [62]:
np.array(X['regis_time_relative'])/(10.**11)

array([5753376, 3896640, 5510592, ..., 5223744, 5760288, 5349024],
      dtype='timedelta64[ns]')

In [63]:
X['properties'] = X.apply(lambda row: utils.property2value(row), axis=1)
X['properties'] = X['properties'].astype('uint8')
#new_df.head()

In [64]:
X['car_like'] = X.apply(lambda row: utils.cars_like2value(row), axis=1)
X['car_like'] = X['car_like'].astype('uint16')

In [65]:
X['excellent_post_count'] = X['excellent_post_count'].astype('uint16')
X['all_post_count'] = X['all_post_count'].astype('uint16')

In [66]:
X['label'] = X['label'].astype('bool')

In [67]:
X.to_excel('./data/jlr_all_user_processed.xlsx', encoding='utf-8')

In [68]:
X.head()

Unnamed: 0,user_id,follow_count,fans_count,gender,birthday,location,level,registration_time,properties,mileage,...,label,car_like,excellent_post_count,all_post_count,regis_year,regis_month,regis_day,regis_time_relative,date,date_delta
0,oden123,0,0,2,NaT,1,1,2018-03-26,1,4430,...,True,1,0,0,2018,3,3,6659 days,2018-03-26,4522.0
1,生活1934626,2,4,2,NaT,2,1,2012-05-07,1,4270,...,True,2,0,0,2012,5,5,4510 days,2012-05-07,2373.0
2,wangzi1125,0,1,2,NaT,3,1,2017-06-18,1,2090,...,True,3,0,1,2017,6,6,6378 days,2017-06-18,4241.0
3,房产专家谢广财,3,2,2,NaT,4,1,2014-02-27,1,1120,...,True,4,0,0,2014,2,2,5171 days,2014-02-27,3034.0
4,南宫晗笑,4,5,2,NaT,5,1,2017-07-03,1,740,...,True,1,0,14,2017,7,7,6393 days,2017-07-03,4256.0
