# Table Of Content
---

1. ### [Data Processing](#data_processing)
    1. ### [User Data Processing](#user_data_processing)
    3. ### [Post Data Processing](#post_data_processing)

In [2]:
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import deque
from datetime import *
import re
from glob import glob
import os
from collections import defaultdict

### Input Path Definition

In [3]:
processing_dir = os.getcwd()
base_path = '/'.join(processing_dir.split('/')[:-1])
base_path

'/home/sean/Desktop/CSC440_project'

In [4]:
data_path = base_path + '/data_general'
data_path

'/home/sean/Desktop/CSC440_project/data_general'

In [5]:
posts_path = data_path + '/tweets'
posts_path

'/home/sean/Desktop/CSC440_project/data_general/tweets'

In [6]:
csv_path_list = map(lambda x: glob(x + '/*.csv'), glob(posts_path+'/*/'))
csv_path_list = filter(lambda x : len(x) > 0, csv_path_list)
csv_path_list = list(map(lambda x:x[0], csv_path_list))
csv_path_list[:3]

['/home/sean/Desktop/CSC440_project/data_general/tweets/小黑-9-30/6195560523.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/瓜果梨桃葡萄橙子/6912965703.csv',
 '/home/sean/Desktop/CSC440_project/data_general/tweets/鹿家宁浅/3208770254.csv']

### Output Path Definition

In [7]:
output_path = data_path + '/df'
output_path

'/home/sean/Desktop/CSC440_project/data_general/df'

In [8]:
users_df_output_path = output_path+'/user_df.pkl'
posts_full_df_output_path = output_path+'/post_full_%d.pkl'
posts_no_content_df_output_path = output_path+'/post_no_content.pkl'

<a name="data_processing"/>

# Data Processing

<a name="user_data_processing"/>

## User Data Processing

### Read Raw users.csv

In [9]:
users = pd.read_csv(data_path+'/users.csv')
users = users.rename(columns={
    '用户id':'id',
    '性别': 'gender',
    '微博数':'#tweets',
    '粉丝数': '#follower',
    '关注数': '#following',
    '是否认证':'verified'
})
users = users[['id','gender','#tweets','#follower','#following','verified']]
users.drop_duplicates('id',inplace=True)
users.set_index('id', inplace=True)
def to_int_follower_count(follower_count_str):
    if type(follower_count_str) is int:
        return follower_count_str
    if '万' in follower_count_str:
        return int(float(follower_count_str[:-1])*10000)
    if '亿' in follower_count_str:
        return int(float(follower_count_str[:-1])*100000000)
    else:
        return int(follower_count_str)
users['#follower'] = users['#follower'].map(to_int_follower_count)
users.head()

Unnamed: 0_level_0,gender,#tweets,#follower,#following,verified
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6309921628,f,2902,344,403,False
6007931743,m,11,20,163,False
7471743898,m,5,10,265,False
5701747600,f,3175,95113,1052,False
7577162125,m,25,3,190,False


### Read locations.csv

In [10]:
locations = pd.read_csv(data_path + '/locations_combined.csv', header = None, names = ['id','location'])
locations.drop_duplicates('id',inplace=True)
locations.set_index('id', inplace=True)
locations.astype({
    'location' : 'string'
}, copy = False)
locations.head()

Unnamed: 0_level_0,location
id,Unnamed: 1_level_1
5711341129,其他
7412051152,其他
7568601498,其他
1804028250,广东 广州
7448767160,四川


### Merge Users and Locations

In [11]:
users = users.merge(locations, left_index=True, right_index=True, how='inner')
users = users.reset_index()
users.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,location
0,6309921628,f,2902,344,403,False,河南 许昌
1,6007931743,m,11,20,163,False,江苏 南京
2,7471743898,m,5,10,265,False,其他
3,5701747600,f,3175,95113,1052,False,上海 杨浦
4,7577162125,m,25,3,190,False,山东 济南


### Remove users without collected content

In [12]:
user_ids_with_content = set(map(lambda x: int(x.rsplit('/',maxsplit=1)[1].strip('.csv')), csv_path_list))
users_with_content = users[users['id'].isin(user_ids_with_content)]
del user_ids_with_content
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,location
0,6309921628,f,2902,344,403,False,河南 许昌
1,6007931743,m,11,20,163,False,江苏 南京
2,7471743898,m,5,10,265,False,其他
3,5701747600,f,3175,95113,1052,False,上海 杨浦
4,7577162125,m,25,3,190,False,山东 济南


In [13]:
users_with_content = users_with_content.astype(
    {
        'location': 'string',
        'id': 'int64',
        'gender': 'string'
    }
)
users_with_content.dtypes

id             int64
gender        string
#tweets        int64
#follower      int64
#following     int64
verified        bool
location      string
dtype: object

### Get Censored & Number of Collected Posts for each user

In [14]:
censor_indications = [
    '抱歉，由于作者设置，',
    '该微博因被多人投诉',
    '该账号因被投诉违反',
    '该账号因被投诉',
    '查看帮助： 网页链接'
]

rename_needed_dict = {
    '视频url': 'reposter_device',  
    '原始图片url': 'reposting_time', 
    '位置': '#likes', 
    '日期': '#comments', 
    '工具': '#reposts', 
    '点赞数': 'repost_weibo_comment',
    '源用户id': 'source_user_id', 
    '源用户昵称': 'source_user_nickname', 
    '源微博原始图片url': 'source_weibo_post_time',
    '源微博视频url': 'source_weibo_device', 
    '源微博位置': '#source_weibo_likes', 
    '源微博日期': '#source_weibo_comments', 
    '源微博工具': '#source_weibo_reposts', 
    '源微博点赞数': 'source_weibo_content' 
}

rename_no_need_dict = {
    "正文": 'repost_weibo_comment',
    '日期': 'reposting_time',
    '工具': 'reposter_device',
    '点赞数': '#likes',
    '评论数':'#comments',
    '转发数': '#reposts',
    '源用户id': 'source_user_id',
    '源用户昵称': 'source_user_nickname',
    '源微博正文': 'source_weibo_content',
    '源微博日期': 'source_weibo_post_time',
    '源微博工具': 'source_weibo_device',
    '源微博点赞数': '#source_weibo_likes',
    '源微博评论数': '#source_weibo_comments',
    '源微博转发数': '#source_weibo_reposts'
}

keep_columns = rename_no_need_dict.values()

def reorg_column_names(user_weibo_df):
    # need to iterate by row and then check each row 
    # create 2 dataframes, one holding non-change, other holding change, then merge
    df_no_change = []
    df_w_change = []
#     user_weibo_df = user_weibo_df.astype({
#         '日期': 'string'
#     })
    for _, row in user_weibo_df.iterrows():
        try:
            datetime.strptime(str(row['源微博原始图片url']),'%a %b %d %X %z %Y')
            df_w_change.append(row)
        except ValueError:
            try:
                datetime.strptime(str(row['日期']),'%a %b %d %X %z %Y')
                df_no_change.append(row)
            except ValueError:
                # the column cannot be parsed so it will be dropped
                pass
            
    if df_no_change:
        df_no_change = pd.DataFrame(df_no_change).rename(columns=rename_no_need_dict)[keep_columns]
    if df_w_change:
        df_w_change = pd.DataFrame(df_w_change).rename(columns=rename_needed_dict)[keep_columns]
    return pd.concat([pd.DataFrame(df_no_change),pd.DataFrame(df_w_change)]).astype(
        {
            "repost_weibo_comment":"string",
            "reposting_time":"string",
            "reposter_device":"string",
            "#likes":"int64",
            "#comments":"int64",
            "#reposts":"int64",
            "source_user_id":"float64",
            "source_user_nickname":"string",
            "source_weibo_content":"string",
            "source_weibo_post_time":"string",
            "source_weibo_device":"string",
            "#source_weibo_likes":"int64",
            "#source_weibo_comments":"int64",
            "#source_weibo_reposts":"int64",  
        },
        copy=False
    )

# reorg_column_names(pd.read_csv('/home/sean/Desktop/CSC440_project/data_general/tweets/JadeRing-玉儿/5797519917.csv'))

In [15]:
pool_num = 8 # Should be number of real CPU cores

In [16]:
from multiprocessing import Pool, Lock
from functools import reduce

with tqdm() as bar:
    lock = Lock()
    def calculate_post_stats(csv_path_list):
        censored_vs_collected_post_num = {}
        for csv_path in csv_path_list:
            
            user_id = int(csv_path.split('/')[-1].strip('.csv'))
            user_weibo_df = reorg_column_names(pd.read_csv(csv_path))
            
            
            censored_vs_collected_post_num[user_id] = (
                sum(user_weibo_df['source_weibo_content'].map(lambda content: any((keyword in str(content) for keyword in censor_indications)))),
                len(user_weibo_df)
            )
            with lock:
                bar.update()
        return censored_vs_collected_post_num
    
    with Pool(pool_num) as p:
        csv_sections = np.array_split(csv_path_list, pool_num)
        censored_vs_collected_post_num_list = p.map(calculate_post_stats, csv_sections)
        censored_vs_collected_post_num = reduce(lambda a,b: a | b, censored_vs_collected_post_num_list)

len(censored_vs_collected_post_num)

0it [09:11, ?it/s]9.82it/s]


579676

In [17]:
users_with_content['#censored_posts'] = users_with_content['id'].map(
    lambda id: censored_vs_collected_post_num[id][0] if id in censored_vs_collected_post_num else 0
)

users_with_content['#collected_posts'] = users_with_content['id'].map(
    lambda id: censored_vs_collected_post_num[id][1] if id in censored_vs_collected_post_num else 0
)

In [18]:
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,location,#censored_posts,#collected_posts
0,6309921628,f,2902,344,403,False,河南 许昌,0,56
1,6007931743,m,11,20,163,False,江苏 南京,0,2
2,7471743898,m,5,10,265,False,其他,0,5
3,5701747600,f,3175,95113,1052,False,上海 杨浦,0,52
4,7577162125,m,25,3,190,False,山东 济南,0,2


In [19]:
print("censored post = ",sum([a for a,_ in censored_vs_collected_post_num.values()]))
print("all post = ",sum([b for _,b in censored_vs_collected_post_num.values()]))

censored post =  558817
all post =  19601910


<a name="follower_count_percentile"/>

### Follower Count Percentile

In [20]:
users_with_content.sort_values('#follower', inplace=True)
df_len = len(users_with_content)

lesser_num = 0
cur_follower_num = 0
cur_count = 0
percentiles = deque()

for follower_num in users_with_content['#follower']:
    if follower_num == cur_follower_num:
        cur_count += 1
    else:
        percentile = (lesser_num) / df_len * 100
        percentiles.extend([percentile for _ in range(cur_count)])
        cur_follower_num = follower_num
        lesser_num += cur_count 
        cur_count = 1

percentile = (lesser_num) / df_len * 100
percentiles.extend([percentile for _ in range(cur_count)])

users_with_content["#follower_percentile_grouped"] = list(map(lambda x: x//10 + 1, percentiles))
users_with_content["#follower_percentile_grouped"] = users_with_content["#follower_percentile_grouped"].astype('int32')
users_with_content.tail()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,location,#censored_posts,#collected_posts,#follower_percentile_grouped
426005,2803301701,m,135297,138009780,3061,True,北京,0,11,10
258474,1934183965,f,2318,158452899,183,True,北京,0,51,10
203828,5878659096,m,10368,181509693,2553,True,北京,0,51,10
505034,2016713117,f,15318,183042610,135,True,北京 海淀,0,54,10
76225,1642909335,f,5632,197923455,2931,True,北京,0,55,10


<a name="location_to_province"/>

### Location to Province

In [22]:
weird_prefixes = {"昵称", "认证"}

def to_province(location):
    location = location.split(' ')[0]
    return location if all((matcher not in location for matcher in weird_prefixes)) else '其他'
users_with_content['province'] = users_with_content['location'].map(to_province).astype('string')
users_with_content['province']

309726    北京
165141    湖南
482160    其他
441120    广东
81609     河北
          ..
426005    北京
258474    北京
203828    北京
505034    北京
76225     北京
Name: province, Length: 574356, dtype: string

### Province GDP

In [23]:
from collections import defaultdict

gdp_by_province = defaultdict(int) # default value for GDP is 0

gdp_by_province.update({
    "北京":23805,
    "上海":23277,
    "江苏":17121,
    "浙江":16358,
    "福建":15531,
    "广东":14223,
    "天津":13569,
    "湖北":10988,
    "重庆":10867,
    "山东":10811,
    "内蒙古":9977,
    "陕西":9239,
    "安徽":8703,
    "湖南":8681,
    "辽宁":8667,
    "海南":8323,
    "河南":8302,
    "四川":8229,
    "新疆":7721,
    "宁夏":7686,
    "江西":7682,
    "青海":6998,
    "西藏":6997,
    "云南":6950,
    "贵州":6828,
    "河北":6797,
    "山西":6735,
    "吉林":6577,
    "广西":6386,
    "黑龙江":5129,
    "甘肃":4624,
    "香港":46700,
    "台湾":28306,
    "澳门":38769,
})

users_with_content['province_gdp'] = users_with_content['province'].map(lambda p: gdp_by_province[p])
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verified,location,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp
309726,1724875232,f,8,0,282,False,北京 朝阳,0,2,1,北京,23805
165141,1705438357,f,2,0,8,False,湖南 益阳,0,1,1,湖南,8681
482160,2138418962,f,5910,0,675,False,其他,2,60,1,其他,0
441120,2286345830,f,31,0,205,False,广东,0,25,1,广东,14223
81609,2179748131,m,27,0,36,False,河北 保定,0,3,1,河北,6797


### Censored Ratio & renaming

In [24]:
users_with_content = users_with_content.rename(columns={
    'verified':"verification"
})
users_with_content['censored'] = users_with_content['#censored_posts'].map(lambda x: x > 0)
users_with_content['censored_ratio'] = users_with_content['#censored_posts']/users_with_content['#collected_posts']
users_with_content['censored_ratio'] = users_with_content['censored_ratio'].map(lambda x: x if x != float('inf') else 0)
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,location,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio
309726,1724875232,f,8,0,282,False,北京 朝阳,0,2,1,北京,23805,False,0.0
165141,1705438357,f,2,0,8,False,湖南 益阳,0,1,1,湖南,8681,False,0.0
482160,2138418962,f,5910,0,675,False,其他,2,60,1,其他,0,True,0.033333
441120,2286345830,f,31,0,205,False,广东,0,25,1,广东,14223,False,0.0
81609,2179748131,m,27,0,36,False,河北 保定,0,3,1,河北,6797,False,0.0


### Censored Ratio Percentile

In [25]:
censored_users = users_with_content[users_with_content['censored']].copy()
uncensored_users = users_with_content[np.logical_not(users_with_content['censored'])].copy()

censored_users['censored_ratio_percentile'] = pd.qcut(censored_users['censored_ratio'],3,labels=[1,2,3])
uncensored_users['censored_ratio_percentile'] = 0

users_with_content = pd.concat([uncensored_users, censored_users])

del censored_users
del uncensored_users
users_with_content

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,location,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio,censored_ratio_percentile
309726,1724875232,f,8,0,282,False,北京 朝阳,0,2,1,北京,23805,False,0.000000,0
165141,1705438357,f,2,0,8,False,湖南 益阳,0,1,1,湖南,8681,False,0.000000,0
441120,2286345830,f,31,0,205,False,广东,0,25,1,广东,14223,False,0.000000,0
81609,2179748131,m,27,0,36,False,河北 保定,0,3,1,河北,6797,False,0.000000,0
168736,1898386165,f,3,0,17,False,其他,0,2,1,其他,0,False,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381105,3125046087,m,1146,52521601,1924,True,北京,1,43,10,北京,23805,True,0.023256,2
494713,1815418641,m,1210,53897766,344,True,北京,2,106,10,北京,23805,True,0.018868,1
352277,1712539910,f,4972,71591411,470,True,台湾,1,49,10,台湾,28306,True,0.020408,1
117507,1223178222,m,3963,71823990,725,True,上海 徐汇,1,57,10,上海,23277,True,0.017544,1


### Unused Fields Cleansing

In [24]:
users_with_content.drop([
    'content',
    'location'
], axis = 'columns', inplace=True, errors='ignore')
users_with_content.head()

Unnamed: 0,id,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio,censored_ratio_percentile
309726,1724875232,f,8,0,282,False,0,2,1,北京,23805,False,0.0,0
165141,1705438357,f,2,0,8,False,0,1,1,湖南,8681,False,0.0,0
441120,2286345830,f,31,0,205,False,0,25,1,广东,14223,False,0.0,0
81609,2179748131,m,27,0,36,False,0,3,1,河北,6797,False,0.0,0
168736,1898386165,f,3,0,17,False,0,2,1,其他,0,False,0.0,0


In [25]:
with open(users_df_output_path, 'wb') as f:
    pickle.dump(users_with_content, f)

### Loading data (Run only when needed)

In [26]:
# with open(users_df_output_path, 'rb') as f:
#     users_with_content = pickle.load(f)

<a name="post_data_processing"/>

## Post Data Processing

### Concat user csv's to a single df

In [27]:
def concat_user_csvs_to_df(csv_list, bar=None, lock=None):
    censored_repost_list = deque()
    user_csv_df_list = []
    for csv_path in csv_list:        
        user_weibo_df = pd.read_csv(csv_path)
        user_weibo_df = reorg_column_names(user_weibo_df)
        user_weibo_df['user_id'] = int(csv_path.split('/')[-1].strip('.csv'))
        user_csv_df_list.append(user_weibo_df)
        if bar is not None and lock is not None:
            with lock:
                bar.update()
    
    return pd.concat(user_csv_df_list).astype({
            'reposting_time': 'string',
            'reposter_device': 'string',
            'repost_weibo_comment': 'string',
            'source_weibo_post_time': 'string',
            'source_weibo_content': 'string',
            'source_user_nickname': 'string',
            'source_weibo_device': 'string'
        }, copy=False)

concat_user_csvs_to_df(csv_path_list[:2]).head()

Unnamed: 0,repost_weibo_comment,reposting_time,reposter_device,#likes,#comments,#reposts,source_user_id,source_user_nickname,source_weibo_content,source_weibo_post_time,source_weibo_device,#source_weibo_likes,#source_weibo_comments,#source_weibo_reposts,user_id
0,转发微博,Thu Aug 05 02:16:37 +0800 2021,iPhone客户端,0,0,0,5721827000.0,王尼美,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,Tue Aug 03 13:44:34 +0800 2021,微博视频号,1925,224,980,6195560523
1,转发微博,Wed Aug 04 13:19:00 +0800 2021,iPhone客户端,0,0,0,1686137000.0,大战柴柯夫斯基,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,Sun Aug 01 20:23:46 +0800 2021,iPhone客户端,1080,58,88,6195560523
2,转发微博,Tue Aug 03 10:17:30 +0800 2021,iPhone客户端,0,0,0,2844061000.0,全是猫,当你喜欢上一只猫猫主播 全是猫的微博视频,Tue Jul 27 17:30:22 +0800 2021,iPhone客户端,1283,81,265,6195560523
3,转发微博,Mon Aug 02 20:57:01 +0800 2021,iPhone客户端,0,0,0,7390621000.0,矮脚虎凸凸,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,Wed Jul 28 18:23:59 +0800 2021,矮脚凹凸喵超话,354,67,31,6195560523
4,转发微博,Sun Aug 01 06:53:28 +0800 2021,iPhone客户端,0,0,0,2807208000.0,萌宠百科,迷你版的小猫咪！！ 萌宠百科的微博视频,Thu Jul 29 00:30:03 +0800 2021,微博视频号,941,116,162,6195560523


### Device Transform Definition

In [28]:
device_mapping = {
    'Apple':['iPhone','iPad','Mac'],
    'Web':['浏览器', '微博'],
    'Huawei':['Huawei','nova','华为','HUAWEI','nova','Harmony'],
    'Honor':['荣耀'],
    'XiaoMi':['小米', 'Redmi','红米'],
    'vivo':['vivo'],
    'OPPO':['OPPO'],
    'Samsung':['三星','Samsung'],
    'General Andoid':['android','Android'],
    'Realme':['realme','真我'],
    'IQOO':['iQOO'],
    'OnePlus':['一加','OnePlus']
}

def get_reposter_device(df):   
    def to_brand(client_name):
        if type(client_name) is not str:
            return 'NaN'
        for brand, devices in device_mapping.items():
            if any((device in client_name for device in devices)):
                return brand
        return 'other'
    
    return df['reposter_device'].map(to_brand).astype('string')


### Censored Transform Definition

In [29]:
def get_censored(series):
    return series.map(lambda content: any((keyword in str(content) for keyword in censor_indications)))

### Time Transform Definition

In [30]:
from datetime import datetime

def get_timestamp(df, col_name):
    def to_timestamp(s):
        if type(s) is not str:
            return None
        try:
            return datetime.fromtimestamp(datetime.strptime(s, '%a %b %d %X %z %Y').timestamp()).strftime("%Y-%m")
        except ValueError:
            return None
    return df[col_name].map(to_timestamp)


### User Information Preparation

In [31]:
users_df = users_with_content.copy()
users_df = users_df.set_index('id')
users_df.head()

Unnamed: 0_level_0,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio,censored_ratio_percentile
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1724875232,f,8,0,282,False,0,2,1,北京,23805,False,0.0,0
1705438357,f,2,0,8,False,0,1,1,湖南,8681,False,0.0,0
2286345830,f,31,0,205,False,0,25,1,广东,14223,False,0.0,0
2179748131,m,27,0,36,False,0,3,1,河北,6797,False,0.0,0
1898386165,f,3,0,17,False,0,2,1,其他,0,False,0.0,0


### Transformation Pipeline

In [32]:
def to_post_full_df(df):
    df = df.assign(
        general_device = get_reposter_device(df),
        repost_timestamp_month = get_timestamp(df, 'reposting_time'),
        post_timestamp_month = get_timestamp(df, 'source_weibo_post_time'),
        post_censored = get_censored(df['source_weibo_content'])
    )
    df.drop([
        "source_user_nickname",
        "source_user_id",
        "reposter_device",
        "source_weibo_device",
        "reposting_time",
        "source_weibo_post_time",
        "#likes",
        "#comments",
        "#reposts",
        "#source_weibo_likes",
        "#source_weibo_comments",
        "#source_weibo_reposts"
    ], axis = 'columns', inplace=True)
    return df.merge(users_df, left_on='user_id', right_on='id', right_index=True, suffixes=('_post', '_user'))

to_post_full_df(concat_user_csvs_to_df(csv_path_list[:100])).head()

Unnamed: 0,repost_weibo_comment,source_weibo_content,user_id,general_device,repost_timestamp_month,post_timestamp_month,post_censored,gender,#tweets,#follower,#following,verification,#censored_posts,#collected_posts,#follower_percentile_grouped,province,province_gdp,censored,censored_ratio,censored_ratio_percentile
0,转发微博,发现一只乖乖拍证件照的纯白波斯猫！好像一朵云哦#猫咪的毛发有多蓬松# 王尼美的微博视频,6195560523,Apple,2021-08,2021-08,False,f,138,7,521,False,0,59,3,其他,0,False,0.0,0
1,转发微博,猫狗一起药浴了！#萌宠一夏##萌宠星视频# 大战柴柯夫斯基的微博视频,6195560523,Apple,2021-08,2021-08,False,f,138,7,521,False,0,59,3,其他,0,False,0.0,0
2,转发微博,当你喜欢上一只猫猫主播 全是猫的微博视频,6195560523,Apple,2021-08,2021-07,False,f,138,7,521,False,0,59,3,其他,0,False,0.0,0
3,转发微博,怎么会有这么又菜又爱玩的小猫咪呢矮脚凹凸喵#新星V计划##萌宠星视频# 矮脚虎凸凸的微博视频,6195560523,Apple,2021-08,2021-07,False,f,138,7,521,False,0,59,3,其他,0,False,0.0,0
4,转发微博,迷你版的小猫咪！！ 萌宠百科的微博视频,6195560523,Apple,2021-07,2021-07,False,f,138,7,521,False,0,59,3,其他,0,False,0.0,0


In [33]:
def to_post_no_content_df(post_full_df):
    post_full_df.drop([
        "repost_weibo_comment",
        "source_weibo_content"
    ], axis = 'columns', inplace=True)
    return post_full_df

In [34]:
num_dfs = 12
cpu_num = 8
csv_path_sections = np.array_split(csv_path_list, num_dfs)

In [35]:
# Transform Begin
no_content_post_dfs = []

for partition_id, csv_list_sub_section in enumerate(np.array_split(csv_path_list, num_dfs)):
    print('partition %d'%partition_id)
    csv_list_sub_section_tasks = np.array_split(csv_list_sub_section, cpu_num*10) # arbitrary number to show progress
    with Pool(cpu_num) as p:
        dfs = p.map(concat_user_csvs_to_df, tqdm(csv_list_sub_section_tasks, desc="csv to df"))

        dfs = p.map(to_post_full_df, tqdm(dfs, desc="df to full post df"))
        with open(posts_full_df_output_path % partition_id, 'wb') as f:
            pickle.dump(pd.concat(dfs), f)

        dfs = p.map(to_post_no_content_df, tqdm(dfs, desc="full post df to no content df"))
        no_content_post_dfs.append(pd.concat(dfs))

with open(posts_no_content_df_output_path, 'wb') as f:
    pickle.dump(pd.concat(no_content_post_dfs), f)

partition 0


csv to df: 100%|████████████████████████████████| 80/80 [00:39<00:00,  2.04it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.68it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 59.09it/s]


partition 1


csv to df: 100%|████████████████████████████████| 80/80 [00:39<00:00,  2.03it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.63it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 55.66it/s]


partition 2


csv to df: 100%|████████████████████████████████| 80/80 [00:39<00:00,  2.02it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.04it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 50.05it/s]


partition 3


csv to df: 100%|████████████████████████████████| 80/80 [00:41<00:00,  1.95it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.25it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 53.25it/s]


partition 4


csv to df: 100%|████████████████████████████████| 80/80 [00:40<00:00,  1.96it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 14.51it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 51.18it/s]


partition 5


csv to df: 100%|████████████████████████████████| 80/80 [00:40<00:00,  1.97it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.74it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 48.50it/s]


partition 6


csv to df: 100%|████████████████████████████████| 80/80 [00:41<00:00,  1.92it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.81it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 51.98it/s]


partition 7


csv to df: 100%|████████████████████████████████| 80/80 [00:42<00:00,  1.89it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.06it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 53.31it/s]


partition 8


csv to df: 100%|████████████████████████████████| 80/80 [00:40<00:00,  1.95it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.55it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 54.27it/s]


partition 9


csv to df: 100%|████████████████████████████████| 80/80 [00:41<00:00,  1.93it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.82it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 48.93it/s]


partition 10


csv to df: 100%|████████████████████████████████| 80/80 [00:41<00:00,  1.95it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.46it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 48.30it/s]


partition 11


csv to df: 100%|████████████████████████████████| 80/80 [00:41<00:00,  1.93it/s]
df to full post df: 100%|███████████████████████| 80/80 [00:05<00:00, 15.07it/s]
full post df to no content df: 100%|████████████| 80/80 [00:01<00:00, 49.20it/s]
