# Table of Contents
---

1. ### [Data Processing](#data_processing)
    1. ### [User Data Processing](#user_data_processing)
    2. ### [Post Data Processing](#post_data_processing)

In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import deque

<a name="data_processing"/>

# Data Processing

<a name="user_data_processing"/>

## User Data Processing

In [2]:
# Load the pickled user table from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted pickle files.
with open('users_with_content_df.pkl', 'rb') as user_pkl:
    users_with_content = pickle.load(user_pkl)
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censoship
0,6309921628,f,2902,344,403,False,1.0,河南 许昌,0
1,6007931743,m,11,20,163,False,1.0,江苏 南京,0
2,7471743898,m,5,10,265,False,1.0,其他,0
3,5701747600,f,3175,95113,1052,False,1.0,上海 杨浦,0
4,7577162125,m,25,3,190,False,1.0,山东 济南,0


<a name="follower_count_percentile"/>

### Follower Count Percentile

In [3]:
def _follower_decile(follower_counts):
    """Map each follower count to a decile group in 1..10.

    The percentile of a row is the share of rows with a strictly smaller
    follower count (ties share the same percentile), so it lies in [0, 100)
    and ``percentile // 10 + 1`` lies in 1..10.

    Parameters
    ----------
    follower_counts : pandas.Series
        Follower count per user.

    Returns
    -------
    pandas.Series
        int32 decile group, aligned with the index of ``follower_counts``.
        An empty input yields an empty result instead of dividing by zero.
    """
    # rank(method='min') is 1 + the number of strictly smaller values, so
    # subtracting 1 gives the "lesser" count for every row, ties included.
    # na_option='bottom' ranks missing counts last instead of producing NaN,
    # which could not be cast to int32 below.
    lesser_counts = follower_counts.rank(method='min', na_option='bottom') - 1
    percentiles = lesser_counts / len(follower_counts) * 100
    return (percentiles // 10 + 1).astype('int32')


# Keep the frame ordered by follower count; the previews below rely on that order.
users_with_content = users_with_content.sort_values('#follower')
users_with_content["#follower_percentile_grouped"] = _follower_decile(users_with_content['#follower'])
users_with_content.tail()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censoship,#follower_percentile_grouped
426005,2803301701,m,135297,138009780,3061,True,1.0,北京,0,10
258474,1934183965,f,2318,158452899,183,True,1.0,北京,0,10
203828,5878659096,m,10368,181509693,2553,True,1.0,北京,0,10
505034,2016713117,f,15318,183042610,135,True,1.0,北京 海淀,0,10
76225,1642909335,f,5632,197923455,2931,True,1.0,北京,0,10


<a name="location_to_province"/>

### Location to Province

In [4]:
# Substrings that mark a location value as something other than a real place;
# such values are mapped to '其他'.
weird_prefixes = {"昵称", "认证"}

def to_province(location):
    """Reduce a location string to its first space-separated component.

    Parameters
    ----------
    location : str or any
        Location text such as '河南 许昌'. Non-string values (e.g. NaN or None
        for a missing location) are accepted.

    Returns
    -------
    str
        The first component of the location, or '其他' when the value is not a
        string, is empty/blank, or contains one of ``weird_prefixes``.
    """
    # Missing locations are not strings and have no province to extract.
    if not isinstance(location, str):
        return '其他'
    # Strip first so a leading space does not produce an empty province.
    province = location.strip().split(' ')[0]
    if not province:
        return '其他'
    if any(matcher in province for matcher in weird_prefixes):
        return '其他'
    return province
# Derive the province column from the raw location text, one value per user.
province_per_user = users_with_content['location'].apply(to_province)
users_with_content['province'] = province_per_user
users_with_content['province']

536321    其他
278927    其他
405016    其他
234168    其他
151815    河南
          ..
426005    北京
258474    北京
203828    北京
505034    北京
76225     北京
Name: province, Length: 574357, dtype: object

<a name="post_number"/>

### Post Number

In [6]:
from collections import Counter

# Number of 'raw_repost_df_w_id_<n>.pkl' partitions read below.
NUM_REPOST_PARTITIONS = 12

# Count how many rows each user id has across all repost partitions.
user_post_nums = Counter()
for i in tqdm(range(NUM_REPOST_PARTITIONS)):
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted pickle files.
    with open('raw_repost_df_w_id_%d.pkl' % i, 'rb') as f:
        df = pickle.load(f)
    user_post_nums.update(df['user_id'])

# Users with no rows in any partition get 0. Using .get() keeps the lookup
# read-only, so it does not add an entry to the counter for every such user.
users_with_content['#collected_posts'] = users_with_content['id'].map(lambda uid: user_post_nums.get(uid, 0))
users_with_content.head()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:33<00:00,  7.77s/it]


Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censoship,#follower_percentile_grouped,province,#collected_posts
536321,2453175617,f,2,0,172,False,1.0,其他,0,1,其他,1
278927,2123046974,f,8,0,18,False,1.0,其他,0,1,其他,3
405016,1883592641,m,724,0,1215,False,1.0,其他,1,1,其他,58
234168,2639003040,m,32,0,51,False,1.0,其他,14,1,其他,62
151815,2045211094,m,15,0,48,False,1.0,河南,0,1,河南,4


### Province GDP

In [7]:
from collections import defaultdict

# GDP figure per province; provinces that are not listed default to 0.
# NOTE(review): the unit, year and source of these figures are not recorded
# here - presumably a per-capita value; confirm before reporting results.
gdp_by_province = defaultdict(int) # default value for GDP is 0

gdp_by_province.update({
    "北京":23805,
    "上海":23277,
    "江苏":17121,
    "浙江":16358,
    "福建":15531,
    "广东":14223,
    "天津":13569,
    "湖北":10988,
    "重庆":10867,
    "山东":10811,
    "内蒙古":9977,
    "陕西":9239,
    "安徽":8703,
    "湖南":8681,
    "辽宁":8667,
    "海南":8323,
    "河南":8302,
    "四川":8229,
    "新疆":7721,
    "宁夏":7686,
    "江西":7682,
    "青海":6998,
    "西藏":6997,
    "云南":6950,
    "贵州":6828,
    "河北":6797,
    "山西":6735,
    "吉林":6577,
    "广西":6386,
    "黑龙江":5129,
    "甘肃":4624,
    "香港":46700,
    "台湾":28306,
    "澳门":38769,
})

# Look up with .get() so unknown provinces (e.g. '其他') read as 0 without
# being inserted into the reference table as a side effect of the lookup.
users_with_content['province_gdp'] = users_with_content['province'].map(lambda p: gdp_by_province.get(p, 0))
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censoship,#follower_percentile_grouped,province,#collected_posts,province_gdp
536321,2453175617,f,2,0,172,False,1.0,其他,0,1,其他,1,0
278927,2123046974,f,8,0,18,False,1.0,其他,0,1,其他,3,0
405016,1883592641,m,724,0,1215,False,1.0,其他,1,1,其他,58,0
234168,2639003040,m,32,0,51,False,1.0,其他,14,1,其他,62,0
151815,2045211094,m,15,0,48,False,1.0,河南,0,1,河南,4,8302


### Censored Ratio & Renaming

In [8]:
# Give the verification flag and the (misspelled) censorship count clearer names.
users_with_content = users_with_content.rename(columns={
    'verified':"verification",
    '#censoship':'#censored_posts'
})

censored_posts = users_with_content['#censored_posts']
collected_posts = users_with_content['#collected_posts']

# A user counts as censored when at least one of their posts was censored.
users_with_content['censored'] = censored_posts > 0

# Share of collected posts that were censored. Users with no collected posts
# get 0: dividing by zero yields inf (censored > 0) or NaN (0 / 0), and both
# cases are replaced here rather than only the inf one.
users_with_content['censored_ratio'] = (censored_posts / collected_posts).where(collected_posts > 0, 0.0)
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,content,location,#censored_posts,#follower_percentile_grouped,province,#collected_posts,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,1.0,其他,0,1,其他,1,0,False,0.0
278927,2123046974,f,8,0,18,False,1.0,其他,0,1,其他,3,0,False,0.0
405016,1883592641,m,724,0,1215,False,1.0,其他,1,1,其他,58,0,True,0.017241
234168,2639003040,m,32,0,51,False,1.0,其他,14,1,其他,62,0,True,0.225806
151815,2045211094,m,15,0,48,False,1.0,河南,0,1,河南,4,8302,False,0.0


### Unused Fields Cleansing

In [9]:
# Columns that are no longer needed; errors='ignore' lets this cell be re-run
# after the columns are already gone.
unused_columns = ['content', 'location']
users_with_content.drop(columns=unused_columns, inplace=True, errors='ignore')
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,#censored_posts,#follower_percentile_grouped,province,#collected_posts,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,0,1,其他,1,0,False,0.0
278927,2123046974,f,8,0,18,False,0,1,其他,3,0,False,0.0
405016,1883592641,m,724,0,1215,False,1,1,其他,58,0,True,0.017241
234168,2639003040,m,32,0,51,False,14,1,其他,62,0,True,0.225806
151815,2045211094,m,15,0,48,False,0,1,河南,4,8302,False,0.0


In [10]:
# Display the cleaned user table (the rendered output elides the middle rows).
users_with_content

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,#censored_posts,#follower_percentile_grouped,province,#collected_posts,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,0,1,其他,1,0,False,0.000000
278927,2123046974,f,8,0,18,False,0,1,其他,3,0,False,0.000000
405016,1883592641,m,724,0,1215,False,1,1,其他,58,0,True,0.017241
234168,2639003040,m,32,0,51,False,14,1,其他,62,0,True,0.225806
151815,2045211094,m,15,0,48,False,0,1,河南,4,8302,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
426005,2803301701,m,135297,138009780,3061,True,0,10,北京,11,23805,False,0.000000
258474,1934183965,f,2318,158452899,183,True,0,10,北京,51,23805,False,0.000000
203828,5878659096,m,10368,181509693,2553,True,0,10,北京,51,23805,False,0.000000
505034,2016713117,f,15318,183042610,135,True,0,10,北京,54,23805,False,0.000000


In [11]:
# Persist the processed user table; the 'final' directory must already exist.
with open('final/user_df.pkl', 'wb') as user_out:
    pickle.dump(users_with_content, user_out)

In [13]:
# NOTE(review): in this notebook temp_df is first assigned further down, in the
# Post Data Processing section, so this cell raises NameError on a fresh
# Restart & Run All. Move it below that load or remove it.
temp_df.head()

Unnamed: 0,reposting time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_device,#source_weibo_likes,#source_weibo_comments,source_weibo_reposts,source_weibo_content,user_id
0,Sun Aug 01 03:41:33 +0800 2021,iPhone客户端,1.0,0,0,转发微博,6804337464.0,oneneedless,iPhone客户端,105,3,48,该分享给谁呢 我今天经历的事 我此时此刻正在做的事 我看到的风景 我的单曲循环 我的难过 我...,5711341129
1,Sun Dec 27 22:39:03 +0800 2020,iPhone客户端,0.0,0,0,//@廿一几:转发微博,3674428202.0,obliviuforu,,1077,24,94,什么都不敢抱太大的希望 这是阴影也是教训,5711341129
2,Sun May 24 09:08:34 +0800 2020,iPhone客户端,3.0,0,0,转发微博,1719232542.0,那英,iPad客户端,117079,26148,96958,妈的，最烦装逼的人！,5711341129
0,Mon Aug 02 15:54:07 +0800 2021,vivo 全新水滴屏手机,0.0,0,0,转发微博,1407590331.0,复兴生活馆,微博 weibo.com,8189,473,10078,被世界遗忘的角落，只有植物缓缓生长，证明时间的流逝,7412051152
1,Mon Aug 02 11:58:25 +0800 2021,vivo 全新水滴屏手机,0.0,0,0,转发微博,6532230551.0,無端Cathyyy,iPhone客户端,72340,1156,15652,可不可以把运动Bra的代言也给女运动员们啊，女明星那些犹穿外套半遮面的照片又看不出来支撑强度...,7412051152


<a name="post_data_processing"/>

## Post Data Processing

In [34]:
import os
import glob


def findfiles(path):
    """Return the list of filesystem paths matching the glob pattern `path`."""
    matches = glob.glob(path)
    return matches


# Base data directory and the glob pattern for the per-folder tweet directories.
path = '/root/data_general'
working_directory = '/root/data_general/tweets/*/'

# Directories matched by the pattern (with trailing slash) ...
tweets_path_list = findfiles(working_directory)

# ... and the bare entry names found directly under the tweets directory.
tweets_list = os.listdir('/root/data_general/tweets')


In [33]:
# Read the headerless (id, location) CSV and name its two columns.
location_to_join = pd.read_csv(
    '/root/data_general/locations_combined.csv',
    header=None,
    names=['id', 'location'],
)
# Store the ids as text rather than as numbers.
location_to_join['id'] = location_to_join['id'].map(str)
location_to_join

Unnamed: 0,id,location
0,5711341129,其他
1,7412051152,其他
2,7568601498,其他
3,1804028250,广东 广州
4,7448767160,四川
...,...,...
574395,6090644549,北京
574396,5940170360,其他
574397,3949398197,山西 晋中
574398,5266395467,山西 吕梁


In [2]:
# Load partition 0 of the raw repost pickles as a sample frame.
with open('raw_repost_df_%d.pkl' % 0, 'rb') as raw_file:
    temp_df = pickle.load(raw_file)

In [3]:
# Preview the first rows of the sample repost frame.
temp_df.head()

Unnamed: 0,reposting time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_device,#source_weibo_likes,#source_weibo_comments,source_weibo_reposts,source_weibo_content,user_id
0,Sun Aug 01 03:41:33 +0800 2021,iPhone客户端,1.0,0,0,转发微博,6804337464.0,oneneedless,iPhone客户端,105,3,48,该分享给谁呢 我今天经历的事 我此时此刻正在做的事 我看到的风景 我的单曲循环 我的难过 我...,
1,Sun Dec 27 22:39:03 +0800 2020,iPhone客户端,0.0,0,0,//@廿一几:转发微博,3674428202.0,obliviuforu,,1077,24,94,什么都不敢抱太大的希望 这是阴影也是教训,
2,Sun May 24 09:08:34 +0800 2020,iPhone客户端,3.0,0,0,转发微博,1719232542.0,那英,iPad客户端,117079,26148,96958,妈的，最烦装逼的人！,
0,Mon Aug 02 15:54:07 +0800 2021,vivo 全新水滴屏手机,0.0,0,0,转发微博,1407590331.0,复兴生活馆,微博 weibo.com,8189,473,10078,被世界遗忘的角落，只有植物缓缓生长，证明时间的流逝,
1,Mon Aug 02 11:58:25 +0800 2021,vivo 全新水滴屏手机,0.0,0,0,转发微博,6532230551.0,無端Cathyyy,iPhone客户端,72340,1156,15652,可不可以把运动Bra的代言也给女运动员们啊，女明星那些犹穿外套半遮面的照片又看不出来支撑强度...,


In [7]:
# For testing purposes only: load partition 0 of the 'raw_repost_df_w_id' pickles.
with open('raw_repost_df_w_id_%d.pkl' % 0, 'rb') as sample_file:
    temp_df = pickle.load(sample_file)

array([<class 'str'>, <class 'float'>, <class 'numpy.int64'>,
       <class 'numpy.float64'>], dtype=object)

In [20]:
# Flag rows whose source content is a numpy int64 instead of text.
temp_df['is_int'] = [type(content) is np.int64 for content in temp_df['source_weibo_content']]

In [21]:
# Show only the rows flagged above as having integer source content.
temp_df.loc[temp_df['is_int']]

Unnamed: 0,reposting time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_device,#source_weibo_likes,#source_weibo_comments,source_weibo_reposts,source_weibo_content,user_id,is_int
0,,,,Sun Aug 01 06:47:34 +0800 2021,HUAWEI P30 Pro,0,7648424390.0,影视剧本人,https://f.video.weibocdn.com/000FNn11gx07Ox374...,,Mon Jul 26 17:23:44 +0800 2021,微博视频号,18,,True
1,,,,Sun Aug 01 00:09:56 +0800 2021,HUAWEI P30 Pro,0,6216754397.0,世界美食教程,https://f.video.weibocdn.com/DJPM0HPilx07OrWDR...,,Fri Jul 23 14:42:47 +0800 2021,微博视频号,37,,True
2,,,,Sun Aug 01 00:08:41 +0800 2021,HUAWEI P30 Pro,0,2713686834.0,最爱学做饭,https://f.video.weibocdn.com/4LMSA9WKlx07O0qIL...,,Tue Jul 06 06:34:33 +0800 2021,微博视频号,774,,True
3,,,,Fri Jul 30 18:53:56 +0800 2021,HUAWEI P30 Pro,0,6249072737.0,蜜瓜乌龙猹,https://f.video.weibocdn.com/mB6KpAPmlx07OCxhE...,,Fri Jul 30 11:00:14 +0800 2021,微博视频号,584,,True
4,,,,Fri Jul 30 15:28:49 +0800 2021,HUAWEI P30 Pro,0,6185411519.0,NBA篮球动态,https://f.video.weibocdn.com/2pnZ5Xoelx07ODd0c...,,Fri Jul 30 14:04:37 +0800 2021,微博视频号,35267,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,,,,Mon Aug 19 19:44:38 +0800 2019,iPhone客户端,0,5467867079.0,优惠女孩bot,https://f.us.sinaimg.cn/002noi4ilx07vxpmxMB201...,,Fri Jul 19 09:31:08 +0800 2019,微博视频号,3677,,True
35,,,,Sat Aug 10 17:47:42 +0800 2019,iPhone客户端,0,6395351705.0,一个好设计,,,Sat Aug 10 17:41:03 +0800 2019,微博 weibo.com,331,,True
36,,,,Thu Jul 25 17:10:00 +0800 2019,iPhone客户端,0,3011739161.0,女生爱拍照,,,Thu Jul 25 14:00:03 +0800 2019,微博 weibo.com,19,,True
37,,,,Tue Jul 24 14:27:07 +0800 2018,iPhone客户端,0,5153273079.0,野驴叔叔,https://f.us.sinaimg.cn/0037COTplx07mdlQTKeI01...,,Sat Jul 21 23:31:31 +0800 2018,谜之微笑超话,399,,True


### Device Transform Definition

In [4]:
# Brand label -> substrings that identify the brand in a client name.
# Brands are tried in the order listed here, so earlier entries win when a
# client name contains keywords of several brands.
# NOTE(review): the label 'General Andoid' looks like a typo for "Android"; it is
# kept unchanged because it is emitted into the output data.
device_mapping = {
    'Apple':['iPhone','iPad','Mac'],
    'Web':['浏览器', '微博'],
    'Huawei':['Huawei','nova','华为','HUAWEI','nova','Harmony'],
    'Honor':['荣耀'],
    'XiaoMi':['小米', 'Redmi','红米'],
    'vivo':['vivo'],
    'OPPO':['OPPO'],
    'Samsung':['三星','Samsung'],
    'General Andoid':['android','Android'],
    'Realme':['realme','真我'],
    'IQOO':['iQOO'],
    'OnePlus':['一加','OnePlus']
}

def get_reposter_device(df):
    """Map the 'reposter_device' column of `df` to brand labels.

    Returns a Series of pandas 'string' dtype holding, per row, the first brand
    in `device_mapping` whose keyword occurs in the client name, 'other' when no
    keyword matches, or the literal text 'NaN' when the value is not a str.
    """
    def to_brand(client_name):
        # Anything that is not exactly a str (e.g. a missing value) is labelled 'NaN'.
        if type(client_name) is not str:
            return 'NaN'
        for brand, keywords in device_mapping.items():
            for keyword in keywords:
                if keyword in client_name:
                    return brand
        return 'other'

    brands = df['reposter_device'].map(to_brand)
    return brands.astype('string')

# Quick check of the brand mapping on the sample frame.
get_reposter_device(temp_df)

0    Apple
1    Apple
2    Apple
0     vivo
1     vivo
Name: reposter_device, dtype: string

### Censored Transform Definition

In [14]:
# Substrings whose presence in a source post's text marks the post as censored.
censored_patterns = [
    '抱歉，由于作者设置，',
    '该微博因被多人投诉',
    '该账号因被投诉违反',
    '该账号因被投诉',
    '查看帮助： 网页链接'
]

def get_censored(df):
    """Return a boolean Series: True where 'source_weibo_content' contains a censorship pattern.

    Each value is converted with str() before matching, so non-text values are
    matched against their string form.
    """
    def to_censored(content):
        text = str(content)
        for pattern in censored_patterns:
            if pattern in text:
                return True
        return False

    source_content = df['source_weibo_content']
    return source_content.map(to_censored)

# Quick check of the censorship flag on the sample frame.
get_censored(temp_df)

0    False
1    False
2    False
0    False
1    False
Name: source_weibo_content, dtype: bool

### Time Transform Definition

In [15]:
from datetime import datetime

def get_timestamp(df):
    """Convert the 'reposting time' column to month-start POSIX timestamps.

    Each value is parsed as e.g. 'Sun Aug 01 03:41:33 +0800 2021', truncated to
    midnight on the first day of its month in the value's own UTC offset, and
    returned as seconds since the epoch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'reposting time' column.

    Returns
    -------
    pandas.Series
        One timestamp per row; None for values that are not strings.

    Raises
    ------
    ValueError
        If a string value does not match the expected format.
    """
    def to_timestamp(s):
        if not isinstance(s, str):
            return None
        # Spell out %H:%M:%S rather than %X: %X is the locale's time
        # representation and need not be HH:MM:SS outside the C/English locales.
        parsed = datetime.strptime(s, '%a %b %d %H:%M:%S %z %Y')
        month_start = parsed.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        return month_start.timestamp()
    return df['reposting time'].map(to_timestamp)

# Quick check of the month-start timestamps on the sample frame.
get_timestamp(temp_df)

0    1.627747e+09
1    1.606752e+09
2    1.588262e+09
0    1.627747e+09
1    1.627747e+09
Name: reposting time, dtype: float64

### User Information Preparation

In [16]:
# Work on a copy keyed by user id so posts can be joined against the index.
users_df = users_with_content.copy().set_index('id')
users_df.head()

Unnamed: 0_level_0,gender,#tweets,#follower,#following,verification,#censored_posts,#follower_percentile_grouped,province,#collected_posts,province_gdp,censored,censored_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2453175617,f,2,0,172,False,0,1,其他,1,0,False,0.0
2123046974,f,8,0,18,False,0,1,其他,3,0,False,0.0
1883592641,m,724,0,1215,False,1,1,其他,58,0,True,0.017241
2639003040,m,32,0,51,False,14,1,其他,62,0,True,0.225806
2045211094,m,15,0,48,False,0,1,河南,4,8302,False,0.0


### Transformation Pipeline

In [18]:
def pipeline(df, partition):
    """Transform one repost partition and join it with the user table.

    Steps:
      1. Add `general_device`, `censored` and `timestamp_month` columns via
         get_reposter_device, get_censored and get_timestamp.
      2. Drop the raw device, time, source-user and engagement-count columns.
      3. Inner-join with the module-level `users_df` on the post's `user_id`;
         rows whose user is not in `users_df` are dropped. Columns present on
         both sides are suffixed '_post' / '_user'.
      4. Pickle the joined frame (text columns included) to
         'final/post_full_df_<partition>.pkl'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw repost frame; it is not modified.
    partition : int
        Partition number used in the output filename.

    Returns
    -------
    pandas.DataFrame
        The joined frame without the `repost_weibo_comment` and
        `source_weibo_content` text columns.
    """
    df = df.assign(
        general_device = get_reposter_device(df),
        censored = get_censored(df),
        timestamp_month = get_timestamp(df)
    ).drop(columns=[
        "source_user_nickname",
        "source_user_id",
        "reposter_device",
        "source_weibo_device",
        "reposting time",
        "#likes",
        "#comments",
        "#reposts",
        "#source_weibo_likes",
        "#source_weibo_comments",
        "source_weibo_reposts"
    ])
    # users_df is keyed by user id, so join user_id against its index. Passing
    # right_on together with right_index is contradictory; only right_index is kept.
    df = df.merge(users_df, left_on='user_id', right_index=True, how='inner', suffixes=('_post', '_user'))
    with open('final/post_full_df_%d.pkl' % partition, 'wb') as f_out:
        pickle.dump(df, f_out)

    # The returned frame omits the bulky text columns; the pickle above keeps them.
    return df.drop(columns=[
        "repost_weibo_comment",
        "source_weibo_content"
    ])

# Smoke-test the pipeline on the sample frame. The partition number is used in the
# output filename, so this call also writes final/post_full_df_10086.pkl.
pipeline(temp_df, 10086)

Unnamed: 0,user_id,general_device,censored_post,timestamp_month,gender,#tweets,#follower,#following,verification,#censored_posts,#follower_percentile_grouped,province,#collected_posts,province_gdp,censored_user,censored_ratio
0,5711341129,Apple,False,1627747000.0,f,48,193,394,False,0,7,其他,3,0,False,0.0
1,5711341129,Apple,False,1606752000.0,f,48,193,394,False,0,7,其他,3,0,False,0.0
2,5711341129,Apple,False,1588262000.0,f,48,193,394,False,0,7,其他,3,0,False,0.0
0,7412051152,vivo,False,1627747000.0,m,129,1,38,False,1,1,其他,52,0,True,0.019231
1,7412051152,vivo,False,1627747000.0,m,129,1,38,False,1,1,其他,52,0,True,0.019231


In [19]:
# Transform Begin

# Number of 'raw_repost_df_w_id_<n>.pkl' partitions to process.
num_dfs = 12

# One content-free post frame per partition, in partition order.
no_content_post_dfs = []

for i in tqdm(range(num_dfs)):
    with open('raw_repost_df_w_id_%d.pkl' % i, 'rb') as raw_file:
        df = pickle.load(raw_file)
    # pipeline() also writes the full (with content) frame for this partition to disk.
    no_content_post_dfs.append(pipeline(df, i))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [08:23<00:00, 41.94s/it]


In [4]:
# Run this if the kernel ran out of memory and no_content_post_dfs has to be rebuilt from the saved files

# no_content_post_dfs = []
# for i in tqdm(range(12)):
#     with open('final/post_full_df_%d.pkl' %i, 'rb') as f:
#         df = pickle.load(f)
#     df.drop([
#         "repost_weibo_comment",
#         "source_weibo_content"
#     ], axis = 'columns', inplace=True)
#     no_content_post_dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:49<00:00,  4.14s/it]


In [8]:
# Stack all per-partition frames into one table and persist it.
with open('final/post_no_content_df.pkl', 'wb') as combined_out:
    combined_posts = pd.concat(no_content_post_dfs)
    pickle.dump(combined_posts, combined_out)