# Table Of Content
---

1. ### [Data Processing](#data_processing)
    1. ### [User Data Processing](#user_data_processing)
    2. ### [Post Data Processing](#raw_post_data_processing)

    3. ### [Post Data Processing](#post_data_processing)

In [171]:
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import deque
from datetime import *
import re
from glob import glob
import os
from collections import defaultdict

In [194]:
processing_dir = os.getcwd()
path = '/'.join(processing_dir.split('/')[:-1])
tweets_path_list = glob(path+'/data_general/tweets/*/')
tweets_path_list[:5]

['/home/sean/Desktop/CSC440_project/data_general/tweets/小黑-9-30/',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/瓜果梨桃葡萄橙子/',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/鹿家宁浅/',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/夏柔嘉/',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/龍豁成/']

<a name="data_processing"/>

# Data Processing

<a name="user_data_processing"/>

## User Data Processing

In [241]:
# users
users = pd.read_csv(
    path+'/data_general/combined.csv'
).rename(
    columns={
        '用户id':'id',
        '性别': 'gender',
        '微博数':'#tweets',
        '粉丝数': '#follower',
        '关注数': '#following',
        '是否认证':'verified'
    }
)[
    ['id','gender','#tweets','#follower','#following','verified']
].drop_duplicates()

location_to_join = pd.read_csv(path+'/data_general/locations_combined.csv', header = None, names = ['id','location'], encoding='utf-8')
user_id_lst = set(location_to_join['id'])
users['content'] = np.zeros(len(users))
users.loc[users['id'].isin(user_id_lst), 'content'] = 1  
users_with_content = users.merge(
    location_to_join, how='inner', left_on='id', right_on='id'
).drop_duplicates(
    'id'
).reset_index(
    drop=True
).astype(
    {
        'location': 'string',
        'id': 'int64',
        'gender': 'string'
    }
)
users_with_content

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location
0,6309921628,f,2902,344,403,False,1.0,河南 许昌
1,6007931743,m,11,20,163,False,1.0,江苏 南京
2,7471743898,m,5,10,265,False,1.0,其他
3,5701747600,f,3175,95113,1052,False,1.0,上海 杨浦
4,7577162125,m,25,3,190,False,1.0,山东 济南
...,...,...,...,...,...,...,...,...
574352,6505005302,f,36,21,239,False,1.0,其他
574353,1803190527,f,6243,1409,2069,False,1.0,浙江
574354,5343183078,m,418,148,336,False,1.0,北京 海淀
574355,6031048952,f,183,1,1116,False,1.0,海南 海口


<a name="raw_post_data_processing"/>

## Post Data Processing

### Reorganize scrambled columns

In [54]:
tweets_list = os.listdir(path+'/data_general/tweets')
rename_dict = {
    '视频url': 'reposter_device',  
    '原始图片url': 'reposting_time', 
    '位置': '#likes', 
    '日期': '#comments', 
    '工具': '#reposts', 
    '点赞数': 'repost_weibo_comment',
    '源用户id': 'source_user_id', 
    '源用户昵称': 'source_user_nickname', 
    '源微博原始图片url': 'source_weibo_post_time',
    '源微博视频url': 'source_weibo_device', 
    '源微博位置': '#source_weibo_likes', 
    '源微博日期': '#source_weibo_comments', 
    '源微博工具': '#source_weibo_reposts', 
    '源微博点赞数': 'source_weibo_content' 
}

rename_dict2 = {
    "正文": 'repost_weibo_comment',
    '日期': 'reposting_time',
    '工具': 'reposter_device',
    '点赞数': '#likes',
    '评论数':'#comments',
    '转发数': '#reposts',
    '源用户id': 'source_user_id',
    '源用户昵称': 'source_user_nickname',
    '源微博正文': 'source_weibo_content',
    '源微博日期': 'source_weibo_post_time',
    '源微博工具': 'source_weibo_device',
    '源微博点赞数': '#source_weibo_likes',
    '源微博评论数': '#source_weibo_comments',
    '源微博转发数': '#source_weibo_reposts'
}

censor_indications = [
  '抱歉，由于作者设置，',
  '该微博因被多人投诉',
  '该账号因被投诉违反',
  '该账号因被投诉',
  '查看帮助： 网页链接',
]

# resumption from break point?
def tweets_tackling(tweets_list, segment_index):
    censored_repost_list = deque()
    # /data_general/tweets/用户名/user_id.csv
    # count the number of censored reposts for each user
    # put then-censored reposts into one pandas dataframe
    
    for username in tqdm(tweets_list, desc="#posts", unit="posts", total=len(tweets_list), position=0):
        csvlist = glob(username + '/*.csv')
        if len(csvlist) == 0:
            # empty folder case
            continue
        else:
            weibo_csv = csvlist[0]
        id_ = os.path.splitext(os.path.basename(os.listdir(i)[0]))[0]
        user_weibo_df = pd.read_csv(weibo_csv)
        
        # test if need to rename
        need_rename = True
        try:
            datetime.strptime(user_weibo_df['原始图片url'][0],'%a %b %d %X %z %Y')
        except TypeError:
            need_rename = False
        
        if need_rename:
            user_weibo_df = user_weibo_df.drop(columns=['bid','id','正文','头条文章url','源微博@用户','源微博话题','源微博转发数', '源微博bid','源微博id','源微博正文', '源微博头条文章url','评论数', '转发数', '话题', '@用户', '是否原创','源微博评论数']).rename(columns=rename_dict)
        else:
            user_weibo_df = user_weibo_df.drop(columns=['id','bid','头条文章url','原始图片url','视频url','源微博视频url','源微博位置','是否原创','源微博bid','话题','@用户','源微博@用户','位置','源微博id','源微博头条文章url','源微博原始图片url','源微博话题']).rename(columns=rename_dict2)
        censored_repost_list.extend((row for _, row in user_weibo_df.iterrows()))

    censored_repost_df = pd.DataFrame(list(censored_repost_list))

In [9]:
def main():
    segment_length = 20000
    for segment_index in range(len(tweets_path_list)//segment_length + 1):
        path_segment = tweets_path_list[segment_index*segment_length: (segment_index+1)*segment_length]
        tweets_tackling(path_segment, segment_index)

In [10]:
main()

#posts: 100%|████████████████████████████████████████| 20000/20000 [00:58<00:00, 339.19posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 334.66posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 333.58posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 337.90posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 334.45posts/s]
#posts:  45%|██████████████████▍                      | 8979/20000 [00:27<00:32, 337.88posts/s]

/home/sean/Desktop/CSC440_project/data_general/tweets/苏墨熙咩/


#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 335.11posts/s]
#posts:  90%|███████████████████████████████████▊    | 17913/20000 [00:54<00:05, 361.50posts/s]

/home/sean/Desktop/CSC440_project/data_general/tweets/·莓頌困菟Killer·/


#posts: 100%|████████████████████████████████████████| 20000/20000 [01:00<00:00, 331.49posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 336.19posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 335.13posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:58<00:00, 342.39posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 336.56posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 336.24posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:58<00:00, 341.34posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [01:00<00:00, 331.75posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [00:59<00:00, 334.32posts/s]
#posts: 100%|████████████████████████████████████████| 20000/20000 [01:00<00:00, 331.64posts/s]
#posts: 100%|███████████████████████████

<a name="post_number"/>

### Post Number and Censored Number

In [177]:
with open('__raw_repost_df_%s.pkl' %0, 'rb') as f:
    temp_raw_df = pickle.load(f)

In [178]:
temp_raw_df

Unnamed: 0,reposting_time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_post_time,source_weibo_device,#source_weibo_likes,#source_weibo_comments,#source_weibo_reposts,source_weibo_content,user_id,is_censored_repost
0,Thu Aug 05 02:16:37 +0800 2021,iPhone客户端,0,0.0,0.0,转发微博,5.721827e+09,王尼美,Tue Aug 03 13:44:34 +0800 2021,微博视频号,1925,224,980,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523,False
1,Wed Aug 04 13:19:00 +0800 2021,iPhone客户端,0,0.0,0.0,转发微博,1.686137e+09,大战柴柯夫斯基,Sun Aug 01 20:23:46 +0800 2021,iPhone客户端,1080,58,88,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523,False
2,Tue Aug 03 10:17:30 +0800 2021,iPhone客户端,0,0.0,0.0,转发微博,2.844061e+09,全是猫,Tue Jul 27 17:30:22 +0800 2021,iPhone客户端,1283,81,265,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523,False
3,Mon Aug 02 20:57:01 +0800 2021,iPhone客户端,0,0.0,0.0,转发微博,7.390621e+09,矮脚虎凸凸,Wed Jul 28 18:23:59 +0800 2021,矮脚凹凸喵超话,354,67,31,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523,False
4,Sun Aug 01 06:53:28 +0800 2021,iPhone客户端,0,0.0,0.0,转发微博,2.807208e+09,萌宠百科,Thu Jul 29 00:30:03 +0800 2021,微博视频号,941,116,162,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,Fri Jul 30 15:36:11 +0800 2021,小米MIX2S 全面屏,0,0.0,0.0,这款小众品牌又便宜又好用，太可了，感谢你的安利,1.593934e+09,林萍在日本,Fri Jul 30 14:58:37 +0800 2021,iPhone 12,1138,388,321,“因为每个月都会收到各个品牌给我寄的样品，所以家里护肤品是特别多，中间能让我用到空瓶的化妆品...,6355372856,False
51,Fri Jul 30 15:15:47 +0800 2021,小米MIX2S 全面屏,0,0.0,0.0,你要是天天营业，那我就能天天开心了，懂,1.911353e+09,米热MERXAT,Fri Jul 30 14:20:40 +0800 2021,微博视频号,24213,9394,5760,一则姗姗来迟的vlog，祝愿大家天天开心 米热MERXAT的微博视频,6355372856,False
52,Fri Jul 30 14:58:21 +0800 2021,小米MIX2S 全面屏,0,0.0,0.0,娇兰复原蜜的成分太可了，果然贵有贵的道理,1.874306e+09,Guerlain法国娇兰,Thu Jul 29 18:00:03 +0800 2021,微博 weibo.com,12028,3470,4444,看@彭小苒 如何肌肤元气回来，光采倍速绽现！即日起至8月4日，关注@Guerlain法国娇兰...,6355372856,False
53,Fri Jul 30 14:41:51 +0800 2021,小米MIX2S 全面屏,0,0.0,0.0,不知道是张慧雯的皮肤白，还是这款项链显人白,2.911941e+09,Qeelin官方微博,Fri Jul 30 12:00:50 +0800 2021,,551,558,550,七夕将至，演员@张慧雯wen 与#Qeelin小红锁#一起沁入甜蜜。炽热浓烈的红玛瑙，以如意...,6355372856,False


In [126]:
censored_repost_list = deque()
for i in tqdm(tweets_path_list[:10000], desc="#posts", unit="posts", total=len(tweets_list), position=0):
    try:
        weibo_ = findfiles(i + '/*.csv')[0]
        id_ = os.path.splitext(os.path.basename(os.listdir(i)[0]))[0]
        user_weibo_df = pd.read_csv(weibo_)
        try:
            datetime.strptime(user_weibo_df['原始图片url'][0],'%a %b %d %X %z %Y')
            # needs rename
            user_weibo_df = user_weibo_df.drop(columns=['bid','id','正文','头条文章url','源微博@用户','源微博话题','源微博转发数', '源微博bid','源微博id','源微博正文', '源微博头条文章url','评论数', '转发数', '话题', '@用户', '是否原创','源微博评论数']).rename(columns=rename_dict)
            user_weibo_df = user_weibo_df.assign(
                user_id = id_,
                is_censored_repost = user_weibo_df['source_weibo_content'].map(lambda content: any((keyword in str(content) for keyword in censor_indications)))
            )
        except TypeError:
            user_weibo_df = user_weibo_df.drop(columns=['id','bid','头条文章url','原始图片url','视频url','源微博视频url','源微博位置','是否原创','源微博bid','话题','@用户','源微博@用户','位置','源微博id','源微博头条文章url','源微博原始图片url','源微博话题']).rename(columns=rename_dict2)
            user_weibo_df = user_weibo_df.assign(
                user_id = id_,
                is_censored_repost = user_weibo_df['source_weibo_content'].map(lambda content: any((keyword in str(content) for keyword in censor_indications)))
            )
        for time in user_weibo_df['reposting_time']:
            if time == '0':
                print(i)
                print(id_)
        censored_repost_list.extend(row for _, row in user_weibo_df.iterrows())
    except IndexError:
        print(i)
    

#posts:   0%|▏                                                                                          | 933/579768 [00:02<27:34, 349.93posts/s]

/home/sean/Desktop/CSC440_project/data_general/tweets/恩u46/
7562554131
/home/sean/Desktop/CSC440_project/data_general/tweets/恩u46/
7562554131


#posts:   0%|▎                                                                                         | 1707/579768 [00:04<27:35, 349.23posts/s]


KeyboardInterrupt: 

In [85]:
test_df = pd.DataFrame(censored_repost_list)

In [127]:
i = '/home/sean/Desktop/CSC440_project/data_general/tweets/恩u46/'
weibo_ = glob(i + '/*.csv')[0]
pd.read_csv(weibo_)


Unnamed: 0,id,bid,正文,头条文章url,原始图片url,视频url,位置,日期,工具,点赞数,...,源微博原始图片url,源微博视频url,源微博位置,源微博日期,源微博工具,源微博点赞数,源微博评论数,源微博转发数,源微博话题,源微博@用户
0,4614348720574709,K61h62pvv,啊啊啊啊啊啊跳舞了跳舞了！！！,,,,,Sat Mar 13 18:05:28 +0800 2021,vivo Y30 2021,0,...,,https://f.video.weibocdn.com/vehv9evglx07L0UTf...,,Thu Mar 11 23:10:10 +0800 2021,IU超话,927,51.0,82.0,,
1,4612150758736783,K565ZAEPR,期待期待 3.25,,,,,Sun Mar 07 16:31:32 +0800 2021,vivo Y30 2021,0,...,,https://f.video.weibocdn.com/2wd9aJholx07KLLc8...,,Tue Mar 02 11:06:48 +0800 2021,IU超话,0,0.0,1.0,"IU正规五辑预告,娱乐影像力",
2,4614348720574709,K61h62pvv,,,Sat Mar 13 18:05:28 +0800 2021,vivo Y30 2021,0.0,0,0,啊啊啊啊啊啊跳舞了跳舞了！！！,...,Thu Mar 11 23:10:10 +0800 2021,IU超话,927.0,51,82,IU IU新曲《Flu》预告丽丽跳舞了！这个身材我慕了慕了~ 顶LIU音乐的微博视频,,,,
3,4612150758736783,K565ZAEPR,,,Sun Mar 07 16:31:32 +0800 2021,vivo Y30 2021,0.0,0,0,期待期待 3.25,...,Tue Mar 02 11:06:48 +0800 2021,IU超话,0.0,0,1,#IU正规五辑预告##娱乐影像力# 又是被李知恩美哭的一天，iu新专预告好好看！期待正式见面...,,,"IU正规五辑预告,娱乐影像力",


In [None]:
pd.read_csv(glob('/home/sean/Desktop/CSC440_project/data_general/tweets/''))

In [109]:
temp_df.iloc[119246]

reposting_time                                                0
reposter_device                                               0
#likes                                                     转发微博
#comments                                                   NaN
#reposts                                                    NaN
repost_weibo_comment                                        NaN
source_user_id                                     7577330208.0
source_user_nickname                                  爱吃葱油饼的胖娃娃
source_weibo_post_time                                     1601
source_weibo_device                                         758
#source_weibo_likes       每天都点外卖，竟然不知道这个隐藏功能。。。我是错过了几个亿吧！ 惊呆了～ 
#source_weibo_comments                                      NaN
#source_weibo_reposts                                       NaN
source_weibo_content                                        NaN
user_id                                              6191040729
is_censored_repost                      

In [106]:
[i for i, value in enumerate(temp_df['reposting_time']) if value == '0']

[29705,
 29706,
 119246,
 119247,
 119248,
 119249,
 119250,
 119251,
 119252,
 119253,
 119254,
 119255,
 119256,
 119257,
 119258,
 119259,
 119260,
 119261,
 166070,
 166071,
 166072,
 166073,
 166074,
 166076,
 166077,
 166078,
 166079,
 166080,
 166081,
 166082,
 166083,
 166084,
 166085,
 166086,
 166087,
 166088,
 166089,
 261277,
 261278,
 261279,
 261280,
 261281,
 261282,
 261283,
 261284,
 261285,
 261286,
 261287,
 261288,
 261289,
 261290,
 261291,
 261292,
 261293,
 261294,
 261295,
 261296,
 261297,
 261298,
 261299,
 261300,
 261301,
 261302,
 261303,
 261304,
 261306,
 261307,
 261308,
 261310,
 261311,
 261312,
 261313,
 261314,
 261315,
 261316,
 261317,
 261318,
 261319,
 261321,
 382610,
 382611,
 382612,
 382613,
 382614,
 382615,
 382616,
 382617,
 382618,
 382619,
 382620,
 382621,
 382622,
 382624,
 382625,
 382626,
 382627,
 382628,
 382629,
 382630,
 382632,
 382633,
 382634,
 382635,
 382636,
 382637,
 382638,
 382639,
 382640,
 382642,
 382644,
 382645,
 38

In [12]:
censored_num_recorder = defaultdict(int)
collected_repost_num = defaultdict(int)
for i in range(len(tweets_path_list)//20000 + 1):
    with open('__raw_repost_df_%s.pkl' %i, 'rb') as f:
        temp_raw_df = pickle.load(f)
    for key, values in defaultdict(int,temp_raw_df.groupby('user_id')['is_censored_repost'].agg('sum').to_dict()).items():
        censored_num_recorder[key] += values
    for key, values in defaultdict(int,temp_raw_df.groupby('user_id').size().to_dict()).items():
        collected_repost_num[key] += values
with open('user_censorship_num_recorder.pkl','wb') as f:
    pickle.dump(censored_num_recorder, f)
with open('user_collected_repost_num.pkl','wb') as f:
    pickle.dump(collected_repost_num, f)
    

### Before running the cell below, you have to check if the number of keys in the collected_posts matches the number of length of users_with_content. If there is no exact match, you can subset users_with_content with the IDs that are in the dictionary

#### further cleaning of the dictionary

In [9]:
with open('user_censorship_num_recorder.pkl','rb') as f:
    censored_num_recorder = pickle.load(f)
with open('user_collected_repost_num.pkl', 'rb') as f:
    collected_repost_num = pickle.load(f)

In [10]:
translation = {}

for key in censored_num_recorder.keys():
    cleaning = lambda x: re.sub('[^0-9]*','',x)
    translation[key] = cleaning(key)
    
for item, value in translation.items():
    censored_num_recorder[value] = censored_num_recorder.pop(item)

In [11]:
translation = {}
for key in collected_repost_num.keys():
    cleaning = lambda x: re.sub('[^0-9]*','',x)
    translation[key] = cleaning(key)
    
for item, value in translation.items():
    collected_repost_num[value] = collected_repost_num.pop(item)

In [12]:
####################### Manual check here!!!

len(set(users_with_content['id']) & set(collected_repost_num.keys()))

574356

In [14]:
users_with_content['#collected_posts'] = users_with_content['id'].map(lambda x: collected_repost_num[x])
users_with_content['#censored_posts'] = users_with_content['id'].map(lambda x: censored_num_recorder[x])

In [15]:
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#collected_posts,#censored_posts
0,6309921628,f,2902,344,403,False,1.0,河南 许昌,56,0
1,6007931743,m,11,20,163,False,1.0,江苏 南京,2,0
2,7471743898,m,5,10,265,False,1.0,其他,5,0
3,5701747600,f,3175,95113,1052,False,1.0,上海 杨浦,52,0
4,7577162125,m,25,3,190,False,1.0,山东 济南,2,0


<a name="follower_count_percentile"/>

### Follower Count Percentile

In [242]:
users_with_content.sort_values('#follower', inplace=True)
df_len = len(users_with_content)

lesser_num = 0
cur_follower_num = 0
cur_count = 0
percentiles = deque()

for follower_num in users_with_content['#follower']:
    if follower_num == cur_follower_num:
        cur_count += 1
    else:
        percentile = (lesser_num) / df_len * 100
        percentiles.extend([percentile for _ in range(cur_count)])
        cur_follower_num = follower_num
        lesser_num += cur_count 
        cur_count = 1

percentile = (lesser_num) / df_len * 100
percentiles.extend([percentile for _ in range(cur_count)])

users_with_content["#follower_percentile_grouped"] = list(map(lambda x: x//10 + 1, percentiles))
users_with_content["#follower_percentile_grouped"] = users_with_content["#follower_percentile_grouped"].astype('int32')
users_with_content.tail()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#follower_percentile_grouped
426005,2803301701,m,135297,138009780,3061,True,1.0,北京,10
258474,1934183965,f,2318,158452899,183,True,1.0,北京,10
203828,5878659096,m,10368,181509693,2553,True,1.0,北京,10
505034,2016713117,f,15318,183042610,135,True,1.0,北京 海淀,10
76225,1642909335,f,5632,197923455,2931,True,1.0,北京,10


<a name="location_to_province"/>

### Location to Province

In [243]:
weird_prefixes = {"昵称", "认证"}

def to_province(location):
    location = location.split(' ')[0]
    return location if all((matcher not in location for matcher in weird_prefixes)) else '其他'
users_with_content['province'] = users_with_content['location'].map(to_province)
users_with_content['province']

536321    其他
278927    其他
405016    其他
234168    其他
151815    河南
          ..
426005    北京
258474    北京
203828    北京
505034    北京
76225     北京
Name: province, Length: 574357, dtype: object

### Province GDP

In [244]:
from collections import defaultdict

gdp_by_province = defaultdict(int) # default value for GDP is 0

gdp_by_province.update({
    "北京":23805,
    "上海":23277,
    "江苏":17121,
    "浙江":16358,
    "福建":15531,
    "广东":14223,
    "天津":13569,
    "湖北":10988,
    "重庆":10867,
    "山东":10811,
    "内蒙古":9977,
    "陕西":9239,
    "安徽":8703,
    "湖南":8681,
    "辽宁":8667,
    "海南":8323,
    "河南":8302,
    "四川":8229,
    "新疆":7721,
    "宁夏":7686,
    "江西":7682,
    "青海":6998,
    "西藏":6997,
    "云南":6950,
    "贵州":6828,
    "河北":6797,
    "山西":6735,
    "吉林":6577,
    "广西":6386,
    "黑龙江":5129,
    "甘肃":4624,
    "香港":46700,
    "台湾":28306,
    "澳门":38769,
})

users_with_content['province_gdp'] = users_with_content['province'].map(lambda p: gdp_by_province[p])
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#follower_percentile_grouped,province,province_gdp
536321,2453175617,f,2,0,172,False,1.0,其他,1,其他,0
278927,2123046974,f,8,0,18,False,1.0,其他,1,其他,0
405016,1883592641,m,724,0,1215,False,1.0,其他,1,其他,0
234168,2639003040,m,32,0,51,False,1.0,其他,1,其他,0
151815,2045211094,m,15,0,48,False,1.0,河南,1,河南,8302


### Censored Ratio & renaming

In [245]:
users_with_content = users_with_content.rename(columns={
    'verified':"verification"
})
users_with_content['censored'] = users_with_content['#censored_posts'].map(lambda x: x > 0)
users_with_content['censored_ratio'] = users_with_content['#censored_posts']/users_with_content['#collected_posts']
users_with_content['censored_ratio'] = users_with_content['censored_ratio'].map(lambda x: x if x != float('inf') else 0)
users_with_content.head()

KeyError: '#censored_posts'

### Unused Fields Cleansing

In [21]:
users_with_content.drop([
    'content',
    'location'
], axis = 'columns', inplace=True, errors='ignore')
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,#collected_posts,#censored_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,1,0,1,其他,0,False,0.0
278927,2123046974,f,8,0,18,False,3,0,1,其他,0,False,0.0
405016,1883592641,m,724,0,1215,False,58,1,1,其他,0,True,0.017241
234168,2639003040,m,32,0,51,False,62,14,1,其他,0,True,0.225806
151815,2045211094,m,15,0,48,False,4,0,1,河南,8302,False,0.0


In [23]:
with open('./user_df.pkl', 'wb') as f:
    pickle.dump(users_with_content, f)

<a name="post_data_processing"/>

## Post Data Processing

### Reorganize user csv column names

In [219]:
rename_needed_dict = {
    '视频url': 'reposter_device',  
    '原始图片url': 'reposting_time', 
    '位置': '#likes', 
    '日期': '#comments', 
    '工具': '#reposts', 
    '点赞数': 'repost_weibo_comment',
    '源用户id': 'source_user_id', 
    '源用户昵称': 'source_user_nickname', 
    '源微博原始图片url': 'source_weibo_post_time',
    '源微博视频url': 'source_weibo_device', 
    '源微博位置': '#source_weibo_likes', 
    '源微博日期': '#source_weibo_comments', 
    '源微博工具': '#source_weibo_reposts', 
    '源微博点赞数': 'source_weibo_content' 
}

rename_no_need_dict = {
    "正文": 'repost_weibo_comment',
    '日期': 'reposting_time',
    '工具': 'reposter_device',
    '点赞数': '#likes',
    '评论数':'#comments',
    '转发数': '#reposts',
    '源用户id': 'source_user_id',
    '源用户昵称': 'source_user_nickname',
    '源微博正文': 'source_weibo_content',
    '源微博日期': 'source_weibo_post_time',
    '源微博工具': 'source_weibo_device',
    '源微博点赞数': '#source_weibo_likes',
    '源微博评论数': '#source_weibo_comments',
    '源微博转发数': '#source_weibo_reposts'
}

def reorg_column_names(user_weibo_df):
    # test if need to rename
    need_rename = False
    try:
        datetime.strptime(user_weibo_df['日期'][0],'%a %b %d %X %z %Y')
    except TypeError:
        need_rename = True

    if need_rename:
        return user_weibo_df.drop(columns=['bid','id','正文','头条文章url','源微博@用户','源微博话题','源微博转发数', '源微博bid','源微博id','源微博正文', '源微博头条文章url','评论数', '转发数', '话题', '@用户', '是否原创','源微博评论数']).rename(columns=rename_needed_dict)
    else:
        return user_weibo_df.drop(columns=['id','bid','头条文章url','原始图片url','视频url','源微博视频url','源微博位置','是否原创','源微博bid','话题','@用户','源微博@用户','位置','源微博id','源微博头条文章url','源微博原始图片url','源微博话题']).rename(columns=rename_noneed_dict)
        

### Concat user csv's to a single df

In [238]:
def concat_user_csvs_to_df(tweets_list, tqdm_position=0):
    censored_repost_list = deque()
    user_csv_df_list = []
    for username in tqdm(tweets_list, desc="#posts", unit="posts", total=len(tweets_list), position=tqdm_position):
        csvlist = glob(username + '/*.csv')
        if len(csvlist) == 0:
            # empty folder case
            continue
        csv_path = csvlist[0]
        
        user_weibo_df = pd.read_csv(csv_path)
        #print(user_weibo_df.columns)
        user_weibo_df = reorg_column_names(user_weibo_df)
        user_weibo_df['user_id'] = int(csv_path.split('/')[-1].strip('.csv'))
        user_csv_df_list.append(user_weibo_df)
    
    return pd.concat(user_csv_df_list).astype({
            'reposting_time': 'string',
            'reposter_device': 'string',
            'repost_weibo_comment': 'string',
            'source_weibo_post_time': 'string',
            'source_weibo_content': 'string',
            'source_user_nickname': 'string',
            'source_weibo_device': 'string'
        }, copy=False)

concat_user_csvs_to_df(tweets_path_list[:2]).head()

#posts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 446.39posts/s]


Unnamed: 0,reposting_time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_post_time,source_weibo_device,#source_weibo_likes,#source_weibo_comments,#source_weibo_reposts,source_weibo_content,user_id
0,Thu Aug 05 02:16:37 +0800 2021,iPhone客户端,0,0,0,转发微博,5721826695,王尼美,Tue Aug 03 13:44:34 +0800 2021,微博视频号,1925,224,980,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523
1,Wed Aug 04 13:19:00 +0800 2021,iPhone客户端,0,0,0,转发微博,1686137252,大战柴柯夫斯基,Sun Aug 01 20:23:46 +0800 2021,iPhone客户端,1080,58,88,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523
2,Tue Aug 03 10:17:30 +0800 2021,iPhone客户端,0,0,0,转发微博,2844061457,全是猫,Tue Jul 27 17:30:22 +0800 2021,iPhone客户端,1283,81,265,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523
3,Mon Aug 02 20:57:01 +0800 2021,iPhone客户端,0,0,0,转发微博,7390620536,矮脚虎凸凸,Wed Jul 28 18:23:59 +0800 2021,矮脚凹凸喵超话,354,67,31,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523
4,Sun Aug 01 06:53:28 +0800 2021,iPhone客户端,0,0,0,转发微博,2807207672,萌宠百科,Thu Jul 29 00:30:03 +0800 2021,微博视频号,941,116,162,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523


### Device Transform Definition

In [25]:
device_mapping = {
    'Apple':['iPhone','iPad','Mac'],
    'Web':['浏览器', '微博'],
    'Huawei':['Huawei','nova','华为','HUAWEI','nova','Harmony'],
    'Honor':['荣耀'],
    'XiaoMi':['小米', 'Redmi','红米'],
    'vivo':['vivo'],
    'OPPO':['OPPO'],
    'Samsung':['三星','Samsung'],
    'General Andoid':['android','Android'],
    'Realme':['realme','真我'],
    'IQOO':['iQOO'],
    'OnePlus':['一加','OnePlus']
}

def get_reposter_device(df):   
    def to_brand(client_name):
        if type(client_name) is not str:
            return 'NaN'
        for brand, devices in device_mapping.items():
            if any((device in client_name for device in devices)):
                return brand
        return 'other'
    
    return df['reposter_device'].map(to_brand).astype('string')


### Censored Transform Definition

### Time Transform Definition

In [35]:
from datetime import datetime

def get_timestamp(df):
    def to_timestamp(s):
        if type(s) is not str:
            return None
        return datetime.strptime(s, '%a %b %d %X %z %Y').replace(hour=0, minute=0, second=0, microsecond=0, day=1).timestamp()
    return df['reposting_time'].map(to_timestamp)


In [27]:
def get_timestamp2(df):
    def to_timestamp(s):
        if type(s) is not str:
            return None
        return datetime.strptime(s, '%a %b %d %X %z %Y').replace(hour=0, minute=0, second=0, microsecond=0, day=1).timestamp()
    return df['source_weibo_post_time'].map(to_timestamp)


In [43]:
len([i for i in temp_df['reposting_time'] if i == '0'])

348

### User Information Preparation

In [28]:
users_df = users_with_content.copy()
users_df = users_df.set_index('id')
users_df.head()

Unnamed: 0_level_0,gender,#tweets,#follower,#following,verification,#collected_posts,#censored_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2453175617,f,2,0,172,False,1,0,1,其他,0,False,0.0
2123046974,f,8,0,18,False,3,0,1,其他,0,False,0.0
1883592641,m,724,0,1215,False,58,1,1,其他,0,True,0.017241
2639003040,m,32,0,51,False,62,14,1,其他,0,True,0.225806
2045211094,m,15,0,48,False,4,0,1,河南,8302,False,0.0


### Transformation Pipeline

In [228]:
users_with_content.dtypes

id              int64
gender         string
#tweets         int64
#follower       int64
#following      int64
verified         bool
content       float64
location       string
dtype: object

In [240]:
def to_post_full_df(df, users_df):
    df = df.assign(
        general_device = get_reposter_device(df),
        repost_timestamp_month = get_timestamp(df),
        post_timestamp_month = get_timestamp2(df)
    )
    df.drop([
        "source_user_nickname",
        "source_user_id",
        "reposter_device",
        "source_weibo_device",
        "reposting_time",
        "#likes",
        "#comments",
        "#reposts",
        "#source_weibo_likes",
        "#source_weibo_comments",
        "#source_weibo_reposts"
    ], axis = 'columns', inplace=True)
    return df.merge(users_df, left_on='user_id', right_on='id', right_index=True, suffixes=('_post', '_user'))

to_post_full_df(concat_user_csvs_to_df(tweets_path_list[:2]), users_with_content)

#posts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 449.62posts/s]


Unnamed: 0,repost_weibo_comment,source_weibo_post_time,source_weibo_content,user_id,general_device,repost_timestamp_month,post_timestamp_month,id,gender,#tweets,#follower,#following,verified,content,location


In [None]:
def to_post_no_content_df(post_full_df):
    post_full_df.drop([
        "repost_weibo_comment",
        "source_weibo_content"
    ], axis = 'columns', inplace=True)
    return post_full_df

In [222]:
num_dfs = 3
tweets_sections = np.array_split(tweets_list, num_dfs)
tweets_sections

[array(['/home/sean/Desktop/CSC440_project/data_general/tweets/小黑-9-30',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/瓜果梨桃葡萄橙子',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/鹿家宁浅', ...,
        '/home/sean/Desktop/CSC440_project/data_general/tweets/一个小小小小呆桃呀',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/是司白白呀',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/影咋2018'],
       dtype='<U84'),
 array(['/home/sean/Desktop/CSC440_project/data_general/tweets/李根5707',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/86岁含泪守塔',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/冼3岁', ...,
        '/home/sean/Desktop/CSC440_project/data_general/tweets/Bao蓓Bei',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/加点番茄酱吗',
        '/home/sean/Desktop/CSC440_project/data_general/tweets/W乐毅'],
       dtype='<U84'),
 array(['/home/sean/Desktop/CSC440_project/data_general/tweets/用户6038140688',
   

In [36]:
# Transform Begin
no_content_post_dfs = []

for i in tqdm(range(num_dfs)):
    with open('__raw_repost_df_%s.pkl' %i, 'rb') as f:
        df = pickle.load(f)
    no_content_post_df = pipeline(df, i)
    no_content_post_dfs.append(no_content_post_df)

  0%|                                                    | 0/29 [00:01<?, ?it/s]


ValueError: time data '0' does not match format '%a %b %d %X %z %Y'

In [4]:
# Run this in case memory outage and no_content_post get stucked

# no_content_post_dfs = []
# for i in tqdm(range(12)):
#     with open('final/post_full_df_%d.pkl' %i, 'rb') as f:
#         df = pickle.load(f)
#     df.drop([
#         "repost_weibo_comment",
#         "source_weibo_content"
#     ], axis = 'columns', inplace=True)
#     no_content_post_dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:49<00:00,  4.14s/it]


In [8]:
with open('final/post_no_content_df.pkl', 'wb') as f_out:
    pickle.dump(pd.concat(no_content_post_dfs), f_out)