In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from linearmodels import RandomEffects
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('final_data.csv')
print(df.shape)

(190848, 44)


In [27]:
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S')
df['year'] = df['publish_time'].dt.strftime('%Y')
df['hour'] = df['publish_time'].dt.strftime('%H')
df['month'] = df['publish_time'].dt.strftime('%m')
#time feats

In [28]:
group1 = df.groupby(['labels'])
count1 = group1.size().reset_index(name='counts')
count1.sort_values(by='labels', inplace=True)
count1.reset_index(drop=True,inplace=True)
count1['labels']+=1
count1['ratio'] = count1['counts'] / count1['counts'].sum()
mapping = {1:'Seek Advice',2:'Share Knowledge',3:'Seek Support',4:'Offer Support',5:'Express Emotion',6:'Others'}
count1['name'] = count1['labels'].map(mapping)
count1

Unnamed: 0,labels,counts,ratio,name
0,1,18163,0.09517,Seek Advice
1,2,97185,0.509227,Share Knowledge
2,3,20764,0.108799,Seek Support
3,4,31768,0.166457,Offer Support
4,5,12416,0.065057,Express Emotion
5,6,10552,0.05529,Others


In [5]:
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.globals import CurrentConfig, OnlineHostType
c = (
    Pie(init_opts=opts.InitOpts(width="1600px", height="1000px")) 
    .add(
        series_name="",
        data_pair=[list(z) for z in zip(count1['name'], count1['counts'])],
        radius=["25%", "70%"],   
        center=["40%", "50%"],  
        label_opts=opts.LabelOpts(is_show=True),   
    )
    .set_global_opts(legend_opts=opts.LegendOpts(pos_right="5%", pos_bottom='10%',orient="vertical",textstyle_opts=opts.TextStyleOpts(font_size=20)),)   # 图例在左边和垂直显示
    .set_series_opts(
        label_opts = opts.LabelOpts(formatter="{b}: {c} ({d}%)",font_size=20),
    )
)
c.render_notebook()
# pyecharts rending pie plot

In [16]:
c.render('pie.html')

'c:\\Users\\seanx\\Desktop\\Work\\数据科学相关\\帕金森\\pie.html'

In [6]:
group2 = df.groupby(['year'])
count2 = group2.size().reset_index(name='counts')
count2.sort_values(by='year', inplace=True)
count2.reset_index(drop=True,inplace=True)
count3 = df.groupby(['hour']).size().reset_index(name='counts')
count4 = df.loc[df['year'].isin(['2020','2021','2022'])].groupby(['month']).size().reset_index(name='counts')

In [7]:
count3.to_csv('data/0-24时发帖数量.csv')

In [9]:
table = {
    'labels':[],
    'counts':[]
}
table = pd.DataFrame(table)
for i in range(1,4):
    string = str(i+2019)
    tmp = df.loc[df['year']==string]
    count = tmp.groupby('labels').size().reset_index(name='counts')
    count['LABEL'] = count['labels']
    count['Proportion']=count['counts']/count['counts'].sum()
    count['year']=[string]*len(count)
    table = pd.concat([table, count])

tmp = df.loc[df['year']=='2023']
count = tmp.groupby('labels').size().reset_index(name='counts')
count['LABEL'] = count['labels']
count['Proportion']=count['counts']/count['counts'].sum()
count['year']=['2023']*len(count)
table = pd.concat([table, count])
table['LABEL']=np.int32(table['labels'])
mapping = {0: 'Seek Advice', 1: 'Share Knowledge', 2: 'Seek Support', 3: 'Offer Support', 4: 'Express Emotion',5:'Others'}
table['LABEL'] = table['LABEL'].map(mapping)
table

Unnamed: 0,labels,counts,LABEL,Proportion,year
0,0.0,1286.0,Seek Advice,0.056867,2020
1,1.0,9511.0,Share Knowledge,0.42058,2020
2,2.0,1351.0,Seek Support,0.059742,2020
3,3.0,7760.0,Offer Support,0.34315,2020
4,4.0,1430.0,Express Emotion,0.063235,2020
5,5.0,1276.0,Others,0.056425,2020
0,0.0,3255.0,Seek Advice,0.078347,2021
1,1.0,19174.0,Share Knowledge,0.461513,2021
2,2.0,2702.0,Seek Support,0.065036,2021
3,3.0,9157.0,Offer Support,0.220406,2021


In [10]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

bar = (
    Bar(init_opts=opts.InitOpts(width="1200px", height="1000px"))
    .add_xaxis(list(table['LABEL'].iloc[0:5]))
    .add_yaxis("2020", table[table['year'] == '2020']['Proportion'].tolist())
    .add_yaxis("2021", table[table['year'] == '2021']['Proportion'].tolist())
    .add_yaxis("2022", table[table['year'] == '2022']['Proportion'].tolist())
    .add_yaxis("2023", table[table['year'] == '2023']['Proportion'].tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Distribution of six categories of posts",title_textstyle_opts=opts.TextStyleOpts(font_size=20)),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0,font_size=16)),
        legend_opts=opts.LegendOpts(textstyle_opts=opts.TextStyleOpts(font_size=20)),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
bar.render_notebook()
# pyecharts bar plot

In [11]:
bar.render('bar.html')

'c:\\Users\\seanx\\Desktop\\Work\\数据科学相关\\帕金森\\bar.html'

In [29]:
lst = ['北京','天津','河北','山西','内蒙古','辽宁','吉林','黑龙江','上海','江苏','浙江','安徽','福建','江西','山东','河南','湖北','湖南','广东','广西','海南','重庆','四川','贵州','云南','西藏','陕西','甘肃','青海','宁夏','新疆']
df1 = df.loc[df['region'].isin(lst)]
df1['publish_time'] = pd.to_datetime(df1['publish_time'], format='%Y-%m-%dT%H:%M:%S')
df1['year'] = df1['publish_time'].dt.strftime('%Y')
# data with region
data1 = df1.loc[(df1.year == '2022') | (df1.year == '2023') ]

In [13]:
data1.sort_values(by='publish_time')['publish_time']
# time range

109122   2022-05-19 09:02:30+08:00
108004   2022-05-19 09:11:07+08:00
106785   2022-05-19 09:19:41+08:00
106759   2022-05-19 09:19:55+08:00
105612   2022-05-19 09:28:13+08:00
                    ...           
32542    2023-03-23 15:05:27+08:00
34146    2023-03-23 15:05:28+08:00
32524    2023-03-23 15:05:52+08:00
30502    2023-03-23 15:07:11+08:00
36712    2023-03-23 15:23:11+08:00
Name: publish_time, Length: 79540, dtype: datetime64[ns, pytz.FixedOffset(480)]

In [14]:
group1 = data1.groupby(["region"])
count1 = group1.size().reset_index(name='counts')
count1.sort_values(by='counts', inplace=True, ascending=False)
count1.reset_index(drop=True,inplace=True)
count1['ratio'] = count1['counts'] / 79540
count1
#provincial posts

Unnamed: 0,region,counts,ratio
0,北京,46443,0.583895
1,广东,4466,0.056148
2,河北,3898,0.049007
3,重庆,2741,0.034461
4,山东,2639,0.033178
5,天津,2363,0.029708
6,江苏,2267,0.028501
7,浙江,1859,0.023372
8,上海,1773,0.022291
9,河南,1253,0.015753


In [15]:
from pyecharts import options as opts
from pyecharts.charts import Sunburst
from pyecharts.commons.utils import JsCode
data = [
    {
        "name": "Tier 1",
        "children": [
            {"name": "Beijing", 
             "children": [
             {"name": "58.3%","value": 46443},
             ],

            },
        ],
    },
    {
        "name": "Tier 2",
        "children": [
            {"name": "Hebei",
             "children": [
             {"name": "4.9%","value": 3898},
            ],
            },
            {"name": "Guangdong", 
             "children": [
             {"name": "5.6%","value": 4466},
            ],
            },
        ],
    },
    {
        "name": "Tier 3",
        "children": [
            {"name": "Chongqing",
             "children": [
             {"name": "3.4%","value": 2741},
            ],
            },
            {"name": "Shandong", 
             "children": [
             {"name": "3.3%","value": 2639},
            ],
            },
            {"name": "Tianjin", 
             "children": [
             {"name": "2.9%","value": 2363},
            ],
            },
            {"name": "Jiangsu", 
             "children": [
             {"name": "2.8%","value": 2267},
            ],
            },
            {"name": "Zhejiang", 
             "children": [
             {"name": "2.3%","value": 1859},
            ],
            },
            {"name": "Shanghai", 
             "children": [
             {"name": "2.2%","value": 1773},
            ],
            },
        ],
    },
    
    {
        "name": "Tier 4",
        "children": [
            {"name": "Henan",
             "children": [
             {"name": "1.5%","value": 1253},
            ],
            },
            {"name": "Liaoning", 
             "children": [
             {"name": "1.4%","value": 1129},
            ],
            },
            {"name": "Sichuan", 
             "children": [
             {"name": "1.3%","value": 1097},
            ],
            },
            {"name": "Hubei", 
             "children": [
             {"name": "1.2%","value": 966},
            ],
            },
            {"name": "Fujian", 
             "children": [
             {"name": "1%","value": 826},
            ],
            },
            {"name": "Anhui", 
             "children": [
             {"name": "0.9%","value": 718},
            ],
            },
        ],
    },
    {
        "name": "Tier 5",
        "children": [
            {"name": "Other Provinces","children": [
             {"name": "6.4%","value": 5102},
            ],
            },
        ],
    },
]

sunburst = (
    Sunburst(init_opts=opts.InitOpts(width="600px", height="600px"))
    .add(series_name="", data_pair=data, radius=[0, "90%"], sort_= JsCode('null'),
        levels=[
            {},
            {
                "r0": "15%",
                "r": "35%",
                "itemStyle": {"borderWidth": 2},
                "label": {"rotate": "tangential"},
            },
            {"r0": "35%", "r": "70%", "label": {"align": "right"}},
            {
                "r0": "70%",
                "r": "72%",
                "label": {"position": "outside", "padding": 3, "silent": False},
                "itemStyle": {"borderWidth": 3},
            },
        ],
         )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}"))
    
)

sunburst.render_notebook()
sunburst.render('sunburst.html')

'c:\\Users\\seanx\\Desktop\\Work\\数据科学相关\\帕金森\\sunburst.html'

In [120]:
data2 = df1.loc[(df1.year == '2022')]
data2['month-day'] = data2['publish_time'].dt.strftime('%m-%d')
count4 = data2.groupby(['month-day','region']).size().reset_index(name='counts')
count4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['month-day'] = data2['publish_time'].dt.strftime('%m-%d')


Unnamed: 0,month-day,region,counts
0,05-19,上海,1
1,05-19,北京,16
2,05-19,山东,1
3,05-19,广东,1
4,05-19,江苏,3
...,...,...,...
4796,12-31,湖南,2
4797,12-31,福建,3
4798,12-31,辽宁,1
4799,12-31,重庆,8


In [29]:
count4.to_csv('count_day_region.csv')

In [30]:
data2 = df1.loc[(df1.year == '2022')]
data2['month'] = data2['publish_time'].dt.strftime('%m')
count_month = data2.groupby(['region','month']).size().reset_index(name='counts')
count_month.columns = ['地区','month','counts']
count_month['month'] = np.int32(count_month['month'])
count_month

Unnamed: 0,地区,month,counts
0,上海,5,59
1,上海,6,164
2,上海,7,126
3,上海,8,197
4,上海,9,169
...,...,...,...
237,黑龙江,8,50
238,黑龙江,9,72
239,黑龙江,10,50
240,黑龙江,11,73


In [31]:
df1 = pd.read_csv('Covid_Data_Month.csv')
df1.drop(columns='Unnamed: 0',inplace=True)
df1['month'] = np.int32(df1['month'])
df1

Unnamed: 0,地区,month,confirmedCount,suspectedCount,deadCount,curedCount
0,上海,1,586,0.0,0,387
1,上海,2,518,0.0,0,329
2,上海,3,1738,0.0,0,1109
3,上海,4,51098,0.0,384,29593
4,上海,5,4662,0.0,166,24029
...,...,...,...,...,...,...
271,黑龙江,8,81,0.0,0,42
272,黑龙江,9,300,0.0,0,213
273,黑龙江,10,110,0.0,0,220
274,黑龙江,11,894,0.0,0,154


In [32]:
region = df1['地区'].drop_duplicates()
count_month = count_month.loc[count_month['地区'].isin(region)]
count_month

Unnamed: 0,地区,month,counts
0,上海,5,59
1,上海,6,164
2,上海,7,126
3,上海,8,197
4,上海,9,169
...,...,...,...
237,黑龙江,8,50
238,黑龙江,9,72
239,黑龙江,10,50
240,黑龙江,11,73


In [33]:
merge = pd.merge(df1, count_month)
merge['region'] = merge['地区']
merge.set_index(['地区', 'month'],inplace=True)
merge
# panel data

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmedCount,suspectedCount,deadCount,curedCount,counts,region
地区,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
上海,5,4662,0.0,166,24029,59,上海
上海,6,243,0.0,0,800,164,上海
上海,7,339,0.0,0,299,126,上海
上海,8,231,0.0,0,264,197,上海
上海,9,316,0.0,0,253,169,上海
...,...,...,...,...,...,...,...
黑龙江,8,81,0.0,0,42,50,黑龙江
黑龙江,9,300,0.0,0,213,72,黑龙江
黑龙江,10,110,0.0,0,220,50,黑龙江
黑龙江,11,894,0.0,0,154,73,黑龙江


In [24]:
import statsmodels.formula.api as smf
model = smf.ols(formula='counts ~ confirmedCount + deadCount  + C(region)', data=merge)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,counts,R-squared:,0.862
Model:,OLS,Adj. R-squared:,0.841
Method:,Least Squares,F-statistic:,41.33
Date:,"Tue, 26 Sep 2023",Prob (F-statistic):,4.0100000000000002e-56
Time:,13:29:49,Log-Likelihood:,-1340.0
No. Observations:,184,AIC:,2730.0
Df Residuals:,159,BIC:,2810.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,165.0032,143.121,1.153,0.251,-117.660,447.667
C(region)[T.云南],-154.9634,195.935,-0.791,0.430,-541.933,232.007
C(region)[T.内蒙古],-180.7144,196.039,-0.922,0.358,-567.890,206.461
C(region)[T.北京],4075.2615,198.433,20.537,0.000,3683.357,4467.166
C(region)[T.吉林],-131.2016,195.951,-0.670,0.504,-518.203,255.800
C(region)[T.四川],-126.1085,196.112,-0.643,0.521,-513.429,261.212
C(region)[T.天津],75.4454,195.939,0.385,0.701,-311.534,462.425
C(region)[T.安徽],-107.8213,195.943,-0.550,0.583,-494.807,279.165
C(region)[T.山东],59.2052,195.852,0.302,0.763,-327.601,446.012

0,1,2,3
Omnibus:,188.017,Durbin-Watson:,1.067
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13784.02
Skew:,-3.383,Prob(JB):,0.0
Kurtosis:,44.858,Cond. No.,63200.0


In [35]:
df_montht = df.loc[df['year'].isin(['2021','2022'])]
df_montht['month'] = np.int16(df_montht['month'])
df_montht = pd.DataFrame(df_montht.groupby(['year','month']).size())
df_montht.columns=['counts']
df_montht.reset_index(drop=True,inplace=True)
df_montht['counts']

0      2685
1      3421
2      2409
3     10654
4      2608
5      2030
6      2160
7      2723
8      2962
9      3625
10     3268
11     3001
12     2486
13     2075
14     5025
15    28705
16     7436
17     5901
18     5983
19     9409
20    10253
21     8042
22     9659
23    10922
Name: counts, dtype: int64

In [38]:
df2 = pd.read_csv('data/Covid_Data_Month_Total.csv')
df2['counts'] = df_montht['counts']
df2.set_index(['year','month'],inplace=True)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,month1,year1,confirmedCount_month,suspectedCount_month,curedCount_month,deadCount_month,counts
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021,1,1,2021,2110.0,14.0,853.0,2.0,2685
2021,2,2,2021,109.0,0.0,1356.0,0.0,3421
2021,3,3,2021,22.0,0.0,12.0,0.0,2409
2021,4,4,2021,277.0,0.0,224.0,0.0,10654
2021,5,5,2021,416.0,0.0,426.0,0.0,2608
2021,6,6,2021,135.0,10.0,74.0,0.0,2030
2021,7,7,2021,496.0,50.0,121.0,0.0,2160
2021,8,8,2021,1233.0,0.0,316.0,0.0,2723
2021,9,9,2021,22.0,0.0,404.0,0.0,2962
2021,10,10,2021,825.0,0.0,745.0,0.0,3625


In [39]:
df2.to_csv('data/quasipoisson.csv')
# data for R