# Table of Contents
---

1. ### [Data Processing](#data_processing)
    1. ### [User Data Processing](#user_data_processing)
    2. ### [Post Data Processing](#post_data_processing)

In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import deque
from datetime import *
import re
from glob import glob
import os
from collections import defaultdict

In [2]:
# Locate the project root (the parent of the working directory) and collect
# one CSV per user directory under data_general/tweets/.
processing_dir = os.getcwd()
path = '/'.join(processing_dir.split('/')[:-1])
tweets_path_list = glob(path+'/data_general/tweets/*/')
# For every user directory keep the first CSV that glob reports; directories
# without any CSV are skipped.
csv_path_list = [
    csv_matches[0]
    for csv_matches in (glob(user_dir + '/*.csv') for user_dir in tweets_path_list)
    if len(csv_matches) > 0
]
csv_path_list[:5]

['/home/sean/Desktop/CSC440_project/data_general/tweets/小黑-9-30/6195560523.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/瓜果梨桃葡萄橙子/6912965703.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/鹿家宁浅/3208770254.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/夏柔嘉/6626298334.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/龍豁成/7419036979.csv']

In [3]:
# Substrings searched for in a source post's text to decide whether the post
# counts as censored (see the `any(keyword in str(content) ...)` check later).
censor_indications = [
    '抱歉，由于作者设置，',   # "Sorry, due to the author's settings, ..."
    '该微博因被多人投诉',     # "This post, having been reported by many people ..."
    '该账号因被投诉违反',     # "This account, having been reported for violating ..."
    '该账号因被投诉',         # "This account, having been reported ..." (already matches the entry above)
    '查看帮助： 网页链接'     # "View help: web link"
]

### Reorganize user csv column names

In [4]:
# Header -> canonical name for CSVs whose values do not sit under their own
# headers (e.g. the repost timestamp is stored under '原始图片url' and the
# comment count under '日期'). Keys are headers as found in the file; values
# name the data that is actually stored under them.
rename_needed_dict = {
    '视频url': 'reposter_device',  
    '原始图片url': 'reposting_time', 
    '位置': '#likes', 
    '日期': '#comments', 
    '工具': '#reposts', 
    '点赞数': 'repost_weibo_comment',
    '源用户id': 'source_user_id', 
    '源用户昵称': 'source_user_nickname', 
    '源微博原始图片url': 'source_weibo_post_time',
    '源微博视频url': 'source_weibo_device', 
    '源微博位置': '#source_weibo_likes', 
    '源微博日期': '#source_weibo_comments', 
    '源微博工具': '#source_weibo_reposts', 
    '源微博点赞数': 'source_weibo_content' 
}

# Header -> canonical name for CSVs whose values sit under the right headers.
rename_no_need_dict = {
    "正文": 'repost_weibo_comment',
    '日期': 'reposting_time',
    '工具': 'reposter_device',
    '点赞数': '#likes',
    '评论数':'#comments',
    '转发数': '#reposts',
    '源用户id': 'source_user_id',
    '源用户昵称': 'source_user_nickname',
    '源微博正文': 'source_weibo_content',
    '源微博日期': 'source_weibo_post_time',
    '源微博工具': 'source_weibo_device',
    '源微博点赞数': '#source_weibo_likes',
    '源微博评论数': '#source_weibo_comments',
    '源微博转发数': '#source_weibo_reposts'
}

# Columns discarded before renaming, for misaligned and aligned files respectively.
_MISALIGNED_DROP_COLUMNS = ['bid','id','正文','头条文章url','源微博@用户','源微博话题','源微博转发数', '源微博bid','源微博id','源微博正文', '源微博头条文章url','评论数', '转发数', '话题', '@用户', '是否原创','源微博评论数']
_ALIGNED_DROP_COLUMNS = ['id','bid','头条文章url','原始图片url','视频url','源微博视频url','源微博位置','是否原创','源微博bid','话题','@用户','源微博@用户','位置','源微博id','源微博头条文章url','源微博原始图片url','源微博话题']

# Timestamp layout used by the CSVs, e.g. 'Thu Aug 05 02:16:37 +0800 2021'.
_WEIBO_TIME_FORMAT = '%a %b %d %X %z %Y'


def reorg_column_names(user_weibo_df):
    """Return a copy of one user's raw CSV frame with canonical column names.

    The first value of the '日期' (date) column decides which layout the file
    uses: if it parses as a timestamp the headers are trusted and
    ``rename_no_need_dict`` is applied; otherwise the file is treated as
    misaligned and ``rename_needed_dict`` is applied. Unused columns are
    dropped in both cases, and both layouts yield the same 14 column names.

    Parameters
    ----------
    user_weibo_df : pandas.DataFrame
        Frame read from a user CSV, carrying the full set of raw headers.

    Returns
    -------
    pandas.DataFrame
        New frame with the canonical columns; the input is not modified.

    Raises
    ------
    KeyError
        If an expected raw column is missing from ``user_weibo_df``.
    """
    need_rename = False
    # An empty frame has no value to probe; both layouts produce the same
    # column names, so the aligned mapping is used for it.
    if len(user_weibo_df) > 0:
        try:
            # Positional access so the probe does not depend on the index labels.
            datetime.strptime(user_weibo_df['日期'].iloc[0], _WEIBO_TIME_FORMAT)
        except (TypeError, ValueError):
            # TypeError: the cell holds a number or NaN rather than text.
            # ValueError: the cell holds text that is not a timestamp.
            need_rename = True

    if need_rename:
        return user_weibo_df.drop(columns=_MISALIGNED_DROP_COLUMNS).rename(columns=rename_needed_dict)
    return user_weibo_df.drop(columns=_ALIGNED_DROP_COLUMNS).rename(columns=rename_no_need_dict)

In [48]:
# Scratch cell: reload partition 0 of the full post frame. The file is written
# by the transform loop near the end of this notebook, so this cell only works
# after that loop has been run at least once.
with open('final_post_full_%d.df' % 0, 'rb') as partition_file:
    df_test = pickle.load(partition_file)

In [51]:
# Scratch cell: evaluates an empty list literal; it defines nothing that other cells use.
[]

[]

In [46]:
# Peek at the first raw row of the first user CSV to see how the values line
# up with the headers; '日期' is the column reorg_column_names probes.
for _, row in pd.read_csv(csv_path_list[0]).iterrows():
    print(row)
    print(row['日期'])
    break

id                                             4666656175685987
bid                                                   KrY1XnRbt
正文                                                          NaN
头条文章url                                                     NaN
原始图片url                          Thu Aug 05 02:16:37 +0800 2021
视频url                                                 iPhone客户端
位置                                                            0
日期                                                            0
工具                                                            0
点赞数                                                        转发微博
评论数                                                         NaN
转发数                                                         NaN
话题                                                          NaN
@用户                                                         NaN
是否原创                                                      False
源用户id                                   

<a name="data_processing"/>

# Data Processing

<a name="user_data_processing"/>

## User Data Processing

In [5]:
# Load the user table, translate its headers to English, keep the analysed
# columns and drop exact duplicate rows.
users = (
    pd.read_csv(path+'/data_general/combined.csv')
    .rename(columns={
        '用户id':'id',
        '性别': 'gender',
        '微博数':'#tweets',
        '粉丝数': '#follower',
        '关注数': '#following',
        '是否认证':'verified'
    })
    [['id','gender','#tweets','#follower','#following','verified']]
    .drop_duplicates()
)

# The location file has no header row: column 0 is the user id, column 1 the location text.
location_to_join = pd.read_csv(
    path+'/data_general/locations_combined.csv',
    header=None,
    names=['id','location'],
    encoding='utf-8',
)
user_id_lst = set(location_to_join['id'])

# 'content' is 1.0 for users that have a location row and 0.0 otherwise.
users['content'] = users['id'].isin(user_id_lst).astype('float64')

# Keep only users with a location, one row per id, with explicit dtypes.
users_with_content = (
    users
    .merge(location_to_join, how='inner', on='id')
    .drop_duplicates('id')
    .reset_index(drop=True)
    .astype({
        'location': 'string',
        'id': 'int64',
        'gender': 'string'
    })
)
users_with_content

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location
0,6309921628,f,2902,344,403,False,1.0,河南 许昌
1,6007931743,m,11,20,163,False,1.0,江苏 南京
2,7471743898,m,5,10,265,False,1.0,其他
3,5701747600,f,3175,95113,1052,False,1.0,上海 杨浦
4,7577162125,m,25,3,190,False,1.0,山东 济南
...,...,...,...,...,...,...,...,...
574352,6505005302,f,36,21,239,False,1.0,其他
574353,1803190527,f,6243,1409,2069,False,1.0,浙江
574354,5343183078,m,418,148,336,False,1.0,北京 海淀
574355,6031048952,f,183,1,1116,False,1.0,海南 海口


### Get Censored & Number of Collected Posts for each user

In [6]:
# Number of worker processes used by the multiprocessing pool in the next cell.
pool_num = 8 # Should be number of real CPU cores

In [7]:
from multiprocessing import Pool, Lock
from functools import reduce


def calculate_post_stats(csv_path_list):
    """Count censored and collected posts for each user CSV in ``csv_path_list``.

    Parameters
    ----------
    csv_path_list : iterable of str
        Paths of the form ``.../<user id>.csv``.

    Returns
    -------
    dict
        ``{user_id: (censored_count, collected_count)}`` where
        ``censored_count`` is the number of rows whose source post text
        contains any phrase from ``censor_indications`` and
        ``collected_count`` is the number of rows in the file.
    """
    censored_vs_collected_post_num = {}
    for csv_path in csv_path_list:
        user_weibo_df = reorg_column_names(pd.read_csv(csv_path))
        # splitext removes exactly the extension; str.strip('.csv') would strip
        # any of the characters '.', 'c', 's', 'v' from both ends of the name.
        user_id = int(os.path.splitext(os.path.basename(csv_path))[0])

        is_censored = user_weibo_df['source_weibo_content'].map(
            lambda content: any(keyword in str(content) for keyword in censor_indications)
        )
        censored_vs_collected_post_num[user_id] = (sum(is_censored), len(user_weibo_df))
    return censored_vs_collected_post_num


# concat_user_csvs_to_df (defined later) accepts an optional lock, so the name is kept available.
lock = Lock()

# More sections than workers so the progress bar advances in small steps and
# a slow section does not leave the other workers idle.
csv_sections = np.array_split(csv_path_list, pool_num * 16)
censored_vs_collected_post_num_list = []
censored_vs_collected_post_num = {}

# Progress is reported from the parent process as each section finishes;
# an update made inside a worker only changes that worker's copy of the bar.
with Pool(pool_num) as p, tqdm(total=len(csv_path_list)) as bar:
    for csv_section, section_stats in zip(csv_sections, p.imap(calculate_post_stats, csv_sections)):
        censored_vs_collected_post_num_list.append(section_stats)
        censored_vs_collected_post_num.update(section_stats)
        bar.update(len(csv_section))

len(censored_vs_collected_post_num)

0it [03:02, ?it/s]8.18it/s]


579676

In [8]:
# Attach the per-user counts computed above. Each dictionary value is a
# (censored, collected) pair; users without an entry get 0 for both.
users_with_content['#censored_posts'] = users_with_content['id'].map(
    lambda user_id: censored_vs_collected_post_num.get(user_id, (0, 0))[0]
)

users_with_content['#collected_posts'] = users_with_content['id'].map(
    lambda user_id: censored_vs_collected_post_num.get(user_id, (0, 0))[1]
)

In [9]:
# Preview the first rows to confirm the two count columns were attached.
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censored_posts,#collected_posts
0,6309921628,f,2902,344,403,False,1.0,河南 许昌,0,56
1,6007931743,m,11,20,163,False,1.0,江苏 南京,0,2
2,7471743898,m,5,10,265,False,1.0,其他,0,5
3,5701747600,f,3175,95113,1052,False,1.0,上海 杨浦,0,52
4,7577162125,m,25,3,190,False,1.0,山东 济南,0,2


<a name="follower_count_percentile"/>

### Follower Count Percentile

In [10]:
# Order users by follower count (ascending); later previews rely on this order.
users_with_content = users_with_content.sort_values('#follower')
df_len = len(users_with_content)

# rank(method='min') gives every user in a tie the rank of the first one, so
# rank - 1 is the number of users with strictly fewer followers.
lesser_counts = users_with_content['#follower'].rank(method='min') - 1

# Percentage of users with strictly fewer followers; always below 100.
percentiles = lesser_counts / df_len * 100

# Bucket into deciles numbered 1..10.
users_with_content["#follower_percentile_grouped"] = (percentiles // 10 + 1).astype('int32')
users_with_content.tail()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censored_posts,#collected_posts,#follower_percentile_grouped
426005,2803301701,m,135297,138009780,3061,True,1.0,北京,0,11,10
258474,1934183965,f,2318,158452899,183,True,1.0,北京,0,51,10
203828,5878659096,m,10368,181509693,2553,True,1.0,北京,0,51,10
505034,2016713117,f,15318,183042610,135,True,1.0,北京 海淀,0,54,10
76225,1642909335,f,5632,197923455,2931,True,1.0,北京,0,55,10


<a name="location_to_province"/>

### Location to Province

In [11]:
# Markers of malformed location text ("昵称" = nickname, "认证" = verification).
# They are matched anywhere in the first token, not only at its start.
weird_prefixes = {"昵称", "认证"}

def to_province(location):
    """Reduce a location string such as '河南 许昌' to its province.

    Parameters
    ----------
    location : str or missing value
        Space-separated location text; the first token is taken as the province.

    Returns
    -------
    str
        The first token, or '其他' ("other") when the value is missing or the
        token contains one of ``weird_prefixes``.
    """
    # Missing values (None, NaN, pd.NA) have no .split; classify them as "other".
    if not isinstance(location, str):
        return '其他'
    province = location.split(' ')[0]
    if any(marker in province for marker in weird_prefixes):
        return '其他'
    return province
# Derive the province from each user's location text and display the new column.
users_with_content['province'] = users_with_content['location'].map(to_province)
users_with_content['province']

536321    其他
278927    其他
405016    其他
234168    其他
151815    河南
          ..
426005    北京
258474    北京
203828    北京
505034    北京
76225     北京
Name: province, Length: 574357, dtype: object

### Province GDP

In [12]:
from collections import defaultdict

gdp_by_province = defaultdict(int) # default value for GDP is 0

gdp_by_province.update({
    "北京":23805,
    "上海":23277,
    "江苏":17121,
    "浙江":16358,
    "福建":15531,
    "广东":14223,
    "天津":13569,
    "湖北":10988,
    "重庆":10867,
    "山东":10811,
    "内蒙古":9977,
    "陕西":9239,
    "安徽":8703,
    "湖南":8681,
    "辽宁":8667,
    "海南":8323,
    "河南":8302,
    "四川":8229,
    "新疆":7721,
    "宁夏":7686,
    "江西":7682,
    "青海":6998,
    "西藏":6997,
    "云南":6950,
    "贵州":6828,
    "河北":6797,
    "山西":6735,
    "吉林":6577,
    "广西":6386,
    "黑龙江":5129,
    "甘肃":4624,
    "香港":46700,
    "台湾":28306,
    "澳门":38769,
})

# Look up each user's province in the GDP table; unknown provinces yield 0
# through the defaultdict.
users_with_content['province_gdp'] = [
    gdp_by_province[province_name] for province_name in users_with_content['province']
]
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,content,location,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp
536321,2453175617,f,2,0,172,False,1.0,其他,0,1,1,其他,0
278927,2123046974,f,8,0,18,False,1.0,其他,0,3,1,其他,0
405016,1883592641,m,724,0,1215,False,1.0,其他,1,58,1,其他,0
234168,2639003040,m,32,0,51,False,1.0,其他,14,62,1,其他,0
151815,2045211094,m,15,0,48,False,1.0,河南,0,4,1,河南,8302


### Censored Ratio & renaming

In [13]:
users_with_content = users_with_content.rename(columns={
    'verified':"verification"
})
# A user counts as censored when at least one collected post was censored.
users_with_content['censored'] = users_with_content['#censored_posts'] > 0

# Share of collected posts that were censored. Users with no collected posts
# would produce 0/0 = NaN (not inf), so the zero denominators are masked and
# the resulting gaps are filled with 0.
collected_posts = users_with_content['#collected_posts']
users_with_content['censored_ratio'] = (
    users_with_content['#censored_posts']
    .div(collected_posts.where(collected_posts > 0))
    .fillna(0.0)
)
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,content,location,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,1.0,其他,0,1,1,其他,0,False,0.0
278927,2123046974,f,8,0,18,False,1.0,其他,0,3,1,其他,0,False,0.0
405016,1883592641,m,724,0,1215,False,1.0,其他,1,58,1,其他,0,True,0.017241
234168,2639003040,m,32,0,51,False,1.0,其他,14,62,1,其他,0,True,0.225806
151815,2045211094,m,15,0,48,False,1.0,河南,0,4,1,河南,8302,False,0.0


### Unused Fields Cleansing

In [14]:
# Remove the helper columns that are no longer needed; errors='ignore' keeps
# the cell re-runnable after the columns are already gone.
users_with_content = users_with_content.drop(
    columns=['content', 'location'],
    errors='ignore',
)
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
536321,2453175617,f,2,0,172,False,0,1,1,其他,0,False,0.0
278927,2123046974,f,8,0,18,False,0,3,1,其他,0,False,0.0
405016,1883592641,m,724,0,1215,False,1,58,1,其他,0,True,0.017241
234168,2639003040,m,32,0,51,False,14,62,1,其他,0,True,0.225806
151815,2045211094,m,15,0,48,False,0,4,1,河南,8302,False,0.0


In [16]:
# Persist the finished user table next to the notebook.
with open('./user_df.pkl', 'wb') as user_pickle:
    pickle.dump(users_with_content, user_pickle)

<a name="raw_post_data_processing"/>

## Raw Post Data Processing

<a name="post_number"/>

### Post Number and Censored Number

In [17]:
# Accumulate, per user, the number of censored reposts and the number of
# collected reposts across all pickled raw-repost partitions.
censored_num_recorder = defaultdict(int)
collected_repost_num = defaultdict(int)

# NOTE(review): presumably the raw partitions hold 20000 user directories each — confirm against the cell that wrote them.
partition_total = len(tweets_path_list)//20000 + 1
for i in range(partition_total):
    with open('__raw_repost_df_%s.pkl' % i, 'rb') as partition_file:
        temp_raw_df = pickle.load(partition_file)

    posts_by_user = temp_raw_df.groupby('user_id')
    censored_per_user = posts_by_user['is_censored_repost'].agg('sum').to_dict()
    for user_key, censored_count in censored_per_user.items():
        censored_num_recorder[user_key] += censored_count
    for user_key, repost_count in posts_by_user.size().to_dict().items():
        collected_repost_num[user_key] += repost_count

# Save both counters so the cells below can reload them without recomputing.
for pickle_name, recorder in (
    ('user_censorship_num_recorder.pkl', censored_num_recorder),
    ('user_collected_repost_num.pkl', collected_repost_num),
):
    with open(pickle_name, 'wb') as recorder_file:
        pickle.dump(recorder, recorder_file)
    

KeyboardInterrupt: 

### Before running the cells below, check that the number of keys in `collected_repost_num` matches the number of rows in `users_with_content`. If the two do not match exactly, subset `users_with_content` to the IDs that are present in the dictionary.

#### further cleaning of the dictionary

In [None]:
# Reload the two per-user counters written by the aggregation cell.
with open('user_censorship_num_recorder.pkl', 'rb') as censored_file:
    censored_num_recorder = pickle.load(censored_file)
with open('user_collected_repost_num.pkl', 'rb') as collected_file:
    collected_repost_num = pickle.load(collected_file)

In [None]:
# Normalise the keys of censored_num_recorder to integer user ids.
# - Non-digit characters are stripped from each key.
# - The result is converted to int because users_with_content['id'] is int64;
#   string keys would never match it in the lookups below.
# - Keys that clean to the same id have their counts added together rather
#   than overwriting one another.
translation = {}
cleaned_censored_num = defaultdict(int)

for key, count in censored_num_recorder.items():
    digits = re.sub('[^0-9]', '', str(key))
    # A key without any digit cannot be an id; it is kept under its original key.
    translation[key] = int(digits) if digits else key
    cleaned_censored_num[translation[key]] += count

censored_num_recorder = cleaned_censored_num

In [None]:
# Normalise the keys of collected_repost_num to integer user ids.
# - Non-digit characters are stripped from each key.
# - The result is converted to int because users_with_content['id'] is int64;
#   string keys would never match it in the lookups below.
# - Keys that clean to the same id have their counts added together rather
#   than overwriting one another.
translation = {}
cleaned_collected_num = defaultdict(int)

for key, count in collected_repost_num.items():
    digits = re.sub('[^0-9]', '', str(key))
    # A key without any digit cannot be an id; it is kept under its original key.
    translation[key] = int(digits) if digits else key
    cleaned_collected_num[translation[key]] += count

collected_repost_num = cleaned_collected_num

In [None]:
####################### Manual check here!!!
# Number of user ids present both in the user table and in the repost counter.
len(set(users_with_content['id']).intersection(collected_repost_num.keys()))

In [None]:
# Overwrite the count columns from the reloaded counters; ids missing from a
# counter read as 0 because both counters are defaultdict(int).
users_with_content['#collected_posts'] = [
    collected_repost_num[user_id] for user_id in users_with_content['id']
]
users_with_content['#censored_posts'] = [
    censored_num_recorder[user_id] for user_id in users_with_content['id']
]

In [None]:
# Number of users in the table, for comparison with the overlap count from the manual check.
len(users_with_content)

<a name="post_data_processing"/>

## Post Data Processing

### Concatenate user CSVs into a single DataFrame

In [18]:
def concat_user_csvs_to_df(csv_list, bar=None, lock=None):
    """Read several user CSVs and stack them into one frame.

    Each file is normalised with ``reorg_column_names`` and tagged with a
    ``user_id`` column taken from its file name (``<user id>.csv``).

    Parameters
    ----------
    csv_list : iterable of str
        Paths of the user CSV files to read.
    bar : optional
        Progress bar exposing ``update()``; advanced once per file.
    lock : optional
        Context manager guarding ``bar.update()``. The bar is only advanced
        when both ``bar`` and ``lock`` are given.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files (original per-file index labels
        are kept), with the text columns cast to the ``string`` dtype.

    Raises
    ------
    ValueError
        If ``csv_list`` is empty (``pd.concat`` has nothing to concatenate).
    """
    user_csv_df_list = []
    for csv_path in csv_list:
        user_weibo_df = reorg_column_names(pd.read_csv(csv_path))
        # splitext removes exactly the extension; str.strip('.csv') would strip
        # any of the characters '.', 'c', 's', 'v' from both ends of the name.
        user_weibo_df['user_id'] = int(os.path.splitext(os.path.basename(csv_path))[0])
        user_csv_df_list.append(user_weibo_df)
        if bar is not None and lock is not None:
            with lock:
                bar.update()

    return pd.concat(user_csv_df_list).astype({
            'reposting_time': 'string',
            'reposter_device': 'string',
            'repost_weibo_comment': 'string',
            'source_weibo_post_time': 'string',
            'source_weibo_content': 'string',
            'source_user_nickname': 'string',
            'source_weibo_device': 'string'
        }, copy=False)

# Smoke test: combine the first two user CSVs and preview the result.
concat_user_csvs_to_df(csv_path_list[:2]).head()

Unnamed: 0,reposting_time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_post_time,source_weibo_device,#source_weibo_likes,#source_weibo_comments,#source_weibo_reposts,source_weibo_content,user_id
0,Thu Aug 05 02:16:37 +0800 2021,iPhone客户端,0,0,0,转发微博,5721826695,王尼美,Tue Aug 03 13:44:34 +0800 2021,微博视频号,1925,224,980,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523
1,Wed Aug 04 13:19:00 +0800 2021,iPhone客户端,0,0,0,转发微博,1686137252,大战柴柯夫斯基,Sun Aug 01 20:23:46 +0800 2021,iPhone客户端,1080,58,88,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523
2,Tue Aug 03 10:17:30 +0800 2021,iPhone客户端,0,0,0,转发微博,2844061457,全是猫,Tue Jul 27 17:30:22 +0800 2021,iPhone客户端,1283,81,265,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523
3,Mon Aug 02 20:57:01 +0800 2021,iPhone客户端,0,0,0,转发微博,7390620536,矮脚虎凸凸,Wed Jul 28 18:23:59 +0800 2021,矮脚凹凸喵超话,354,67,31,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523
4,Sun Aug 01 06:53:28 +0800 2021,iPhone客户端,0,0,0,转发微博,2807207672,萌宠百科,Thu Jul 29 00:30:03 +0800 2021,微博视频号,941,116,162,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523


In [35]:
# Scratch check: show the normalised frame for the first user CSV only.
concat_user_csvs_to_df(csv_path_list[:1])

Unnamed: 0,reposting_time,reposter_device,#likes,#comments,#reposts,repost_weibo_comment,source_user_id,source_user_nickname,source_weibo_post_time,source_weibo_device,#source_weibo_likes,#source_weibo_comments,#source_weibo_reposts,source_weibo_content,user_id
0,Thu Aug 05 02:16:37 +0800 2021,iPhone客户端,0,0,0,转发微博,5721826695,王尼美,Tue Aug 03 13:44:34 +0800 2021,微博视频号,1925,224,980,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523
1,Wed Aug 04 13:19:00 +0800 2021,iPhone客户端,0,0,0,转发微博,1686137252,大战柴柯夫斯基,Sun Aug 01 20:23:46 +0800 2021,iPhone客户端,1080,58,88,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523
2,Tue Aug 03 10:17:30 +0800 2021,iPhone客户端,0,0,0,转发微博,2844061457,全是猫,Tue Jul 27 17:30:22 +0800 2021,iPhone客户端,1283,81,265,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523
3,Mon Aug 02 20:57:01 +0800 2021,iPhone客户端,0,0,0,转发微博,7390620536,矮脚虎凸凸,Wed Jul 28 18:23:59 +0800 2021,矮脚凹凸喵超话,354,67,31,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523
4,Sun Aug 01 06:53:28 +0800 2021,iPhone客户端,0,0,0,转发微博,2807207672,萌宠百科,Thu Jul 29 00:30:03 +0800 2021,微博视频号,941,116,162,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523
5,Sat Jul 31 03:41:28 +0800 2021,iPhone客户端,0,0,0,转发微博,2844061457,全是猫,Fri Jul 30 12:53:49 +0800 2021,iPhone客户端,3712,271,1157,会游泳但不完全会 全是猫的微博视频,6195560523
6,Fri Jul 30 02:21:46 +0800 2021,iPhone客户端,0,0,0,转发微博,5129369274,今天可以吃主人了吗,Thu Jul 29 00:34:46 +0800 2021,微博视频号,7840,626,767,那就看看小mer吧 今天可以吃主人了吗的微博视频,6195560523
7,Thu Jul 29 10:19:57 +0800 2021,iPhone客户端,0,0,0,转发微博,7390620536,矮脚虎凸凸,Mon Jun 21 14:34:38 +0800 2021,矮脚凹凸喵超话,286,36,25,jiojio：“我想开了”#小猫咪的肉垫有多可爱#矮脚凹凸喵#萌宠星视频# 矮脚虎凸凸的微博视频,6195560523
8,Wed Jul 28 20:45:08 +0800 2021,iPhone客户端,0,0,0,转发微博,5224441745,阿司匹喵,Tue Jul 13 07:36:23 +0800 2021,微博视频号,278,41,46,小奶猫困到头都点地了还不肯睡，在坚持什么呢，这也太可爱了吧 #视频创意话题大...,6195560523
9,Tue Jul 27 20:50:43 +0800 2021,iPhone客户端,0,0,0,转发微博,3146606464,厨娘小米粒,Mon Jul 19 19:01:15 +0800 2021,微博视频号,3359,106,1106,#番茄肥牛泡菜烩饭# 不知道吃什么的时候来一碗番茄泡菜肥牛饭吧，给你整的妥妥的，拯救没食欲的...,6195560523


### Device Transform Definition

In [19]:
# Brand label -> substrings that identify it in a client/device name.
# Brands are tried in this insertion order and the first match wins.
# NOTE(review): 'General Andoid' looks like a typo for "Android"; it is an
# output label, so renaming it would change the produced data.
device_mapping = {
    'Apple':['iPhone','iPad','Mac'],
    'Web':['浏览器', '微博'],
    'Huawei':['Huawei','nova','华为','HUAWEI','nova','Harmony'],
    'Honor':['荣耀'],
    'XiaoMi':['小米', 'Redmi','红米'],
    'vivo':['vivo'],
    'OPPO':['OPPO'],
    'Samsung':['三星','Samsung'],
    'General Andoid':['android','Android'],
    'Realme':['realme','真我'],
    'IQOO':['iQOO'],
    'OnePlus':['一加','OnePlus']
}

def get_reposter_device(df):
    """Map each value of ``df['reposter_device']`` to a brand label.

    Returns a ``string``-dtype Series holding the first brand in
    ``device_mapping`` with a substring contained in the device name,
    'other' when no brand matches, and the text 'NaN' when the value is not
    a plain ``str``.
    """
    def to_brand(client_name):
        # Exact type check: anything that is not precisely `str` is treated as missing.
        if type(client_name) is not str:
            return 'NaN'
        matching_brands = (
            brand
            for brand, devices in device_mapping.items()
            if any(device in client_name for device in devices)
        )
        return next(matching_brands, 'other')

    brand_labels = df['reposter_device'].map(to_brand)
    return brand_labels.astype('string')


### Censored Transform Definition

### Time Transform Definition

In [20]:
from datetime import datetime

def get_timestamp(df, col_name):
    """Convert a column of timestamp strings to month-start POSIX timestamps.

    Each string such as 'Thu Aug 05 02:16:37 +0800 2021' is parsed, moved to
    midnight on the first day of its month in the string's own UTC offset,
    and returned as seconds since the epoch. Values that are not plain
    ``str`` or do not parse map to ``None``.
    """
    month_start_fields = dict(hour=0, minute=0, second=0, microsecond=0, day=1)

    def to_timestamp(s):
        if type(s) is str:
            try:
                parsed = datetime.strptime(s, '%a %b %d %X %z %Y')
                return parsed.replace(**month_start_fields).timestamp()
            except ValueError:
                return None
        return None

    return df[col_name].map(to_timestamp)


### User Information Preparation

In [21]:
# Independent copy of the user table keyed by user id, used as the right-hand
# side when user attributes are merged into the post frames.
users_df = users_with_content.copy().set_index('id')
users_df.head()

Unnamed: 0_level_0,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2453175617,f,2,0,172,False,0,1,1,其他,0,False,0.0
2123046974,f,8,0,18,False,0,3,1,其他,0,False,0.0
1883592641,m,724,0,1215,False,1,58,1,其他,0,True,0.017241
2639003040,m,32,0,51,False,14,62,1,其他,0,True,0.225806
2045211094,m,15,0,48,False,0,4,1,河南,8302,False,0.0


### Transformation Pipeline

In [22]:
# Inspect the column dtypes of the user table.
users_with_content.dtypes

id                                int64
gender                           string
#tweets                           int64
#follower                         int64
#following                        int64
verification                       bool
#censored_posts                   int64
#collected_posts                  int64
#follower_percentile_grouped      int32
province                         object
province_gdp                      int64
censored                           bool
censored_ratio                  float64
dtype: object

In [25]:
def to_post_full_df(df):
    """Derive post-level features and attach the reposting user's attributes.

    Adds ``general_device`` (brand label), ``repost_timestamp_month`` and
    ``post_timestamp_month`` (month-start timestamps), removes the raw
    device, count and source-user columns, then joins ``users_df`` on the
    post's ``user_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame produced by ``concat_user_csvs_to_df``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Inner join of the derived post columns with ``users_df``; posts whose
        ``user_id`` is absent from ``users_df`` are not included.
    """
    with_features = df.assign(
        general_device = get_reposter_device(df),
        repost_timestamp_month = get_timestamp(df, 'reposting_time'),
        post_timestamp_month = get_timestamp(df, 'source_weibo_post_time')
    )
    trimmed = with_features.drop(columns=[
        "source_user_nickname",
        "source_user_id",
        "reposter_device",
        "source_weibo_device",
        "reposting_time",
        "#likes",
        "#comments",
        "#reposts",
        "#source_weibo_likes",
        "#source_weibo_comments",
        "#source_weibo_reposts"
    ])
    # users_df is indexed by user id, so the join key on the right is its
    # index; a right_on column must not be given together with right_index.
    return trimmed.merge(users_df, left_on='user_id', right_index=True, suffixes=('_post', '_user'))

# Smoke test on the first 100 user CSVs.
to_post_full_df(concat_user_csvs_to_df(csv_path_list[:100])).head()

Unnamed: 0,repost_weibo_comment,source_weibo_post_time,source_weibo_content,user_id,general_device,repost_timestamp_month,post_timestamp_month,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
0,转发微博,Tue Aug 03 13:44:34 +0800 2021,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523,Apple,1627747000.0,1627747000.0,f,138,7,521,False,0,59,3,其他,0,False,0.0
1,转发微博,Sun Aug 01 20:23:46 +0800 2021,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523,Apple,1627747000.0,1627747000.0,f,138,7,521,False,0,59,3,其他,0,False,0.0
2,转发微博,Tue Jul 27 17:30:22 +0800 2021,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523,Apple,1627747000.0,1625069000.0,f,138,7,521,False,0,59,3,其他,0,False,0.0
3,转发微博,Wed Jul 28 18:23:59 +0800 2021,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523,Apple,1627747000.0,1625069000.0,f,138,7,521,False,0,59,3,其他,0,False,0.0
4,转发微博,Thu Jul 29 00:30:03 +0800 2021,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523,Apple,1627747000.0,1625069000.0,f,138,7,521,False,0,59,3,其他,0,False,0.0


In [26]:
def to_post_no_content_df(post_full_df):
    """Return a copy of ``post_full_df`` without the two free-text columns.

    The input frame is left untouched, so it can still be used (for example
    pickled with its text) after this function has been called.

    Raises
    ------
    KeyError
        If either text column is missing from ``post_full_df``.
    """
    return post_full_df.drop(columns=[
        "repost_weibo_comment",
        "source_weibo_content"
    ])

In [27]:
# Number of output partitions (one pickled full-post frame per partition).
num_dfs = 12
# Number of worker processes for the transform pool.
cpu_num = 8
# The CSV paths split into num_dfs partitions; the transform loop below
# recomputes the same split instead of reading this variable.
csv_path_sections = np.array_split(csv_path_list, num_dfs)

In [29]:
# Transform Begin
# For each partition of the CSV list: read and normalise the CSVs, build the
# full post frame, pickle it, then strip the text columns and keep the result
# in memory for one final combined pickle.
no_content_post_dfs = []

for partition_id, csv_list_sub_section in enumerate(np.array_split(csv_path_list, num_dfs)):
    print('partition %d'%partition_id)
    # Ten tasks per worker within a partition.
    csv_list_sub_section_tasks = np.array_split(csv_list_sub_section, cpu_num*10)
    with Pool(cpu_num) as p:
        # Stage 1: CSV files -> one normalised frame per task.
        dfs = p.map(concat_user_csvs_to_df, tqdm(csv_list_sub_section_tasks, desc="csv to df"))

        # Stage 2: add derived columns and user attributes, then save this
        # partition with the text columns still present.
        dfs = p.map(to_post_full_df, tqdm(dfs, desc="df to full post df"))
        with open('final_post_full_%d.df' % partition_id, 'wb') as f:
            pickle.dump(pd.concat(dfs), f)

        # Stage 3: drop the text columns and keep the slimmer frames.
        dfs = p.map(to_post_no_content_df, tqdm(dfs, desc="full post df to no content df"))
        no_content_post_dfs.append(pd.concat(dfs))

# All partitions, without text columns, in a single pickle.
with open('final_post_no_content_.df', 'wb') as f:
    pickle.dump(pd.concat(no_content_post_dfs), f)

csv to df: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:17<00:00,  4.52it/s]
df to full post df: 100%|████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:04<00:00, 18.34it/s]
full post df to no content df: 100%|█████████████████████████████████████████████████████████████████████████████| 80/80 [00:01<00:00, 69.49it/s]
csv to df: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:18<00:00,  4.38it/s]
df to full post df: 100%|████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:04<00:00, 18.16it/s]
full post df to no content df: 100%|█████████████████████████████████████████████████████████████████████████████| 80/80 [00:01<00:00, 68.45it/s]
csv to df: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00