In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

### JSON FILES
creating list of filenames + loading data to dataframes

In [2]:
#filelist
filelist = []
filename_part = 'data/covid19_2020_02_'

for i in range(15,22):
#for i in range(15,17):
    filelist.append(filename_part + str(i) + '.json')

filelist

['data/covid19_2020_02_15.json',
 'data/covid19_2020_02_16.json',
 'data/covid19_2020_02_17.json',
 'data/covid19_2020_02_18.json',
 'data/covid19_2020_02_19.json',
 'data/covid19_2020_02_20.json',
 'data/covid19_2020_02_21.json']

In [3]:
#read files into pandas
#df = pd.read_json('data/covid19_2020_02_15.json', lines=True) # 15-21
df_list = [pd.read_json(file, lines=True) for file in tqdm(filelist)]

100%|██████████| 7/7 [02:14<00:00, 19.21s/it]


In [4]:
df_list[0]

Unnamed: 0,tweet_id,user_id,date,keywords,location
0,1228544443910959105,4328344283,2020-02-15 04:59:55+00:00,[coronavirus],
1,1228544444539924480,808810829797670912,2020-02-15 04:59:55+00:00,"[coronavirus, covid19]",
2,1228544445978562560,71977357,2020-02-15 04:59:55+00:00,[coronavirus],
3,1228544446830018560,1219954720397152257,2020-02-15 04:59:56+00:00,[2019ncov],
4,1228544447450796032,944505310806360064,2020-02-15 04:59:56+00:00,[coronavirus],
...,...,...,...,...,...
463124,1228906827196186624,1125735079412502530,2020-02-16 04:59:54+00:00,[coronavirus],"{'country': 'Hong Kong', 'state': 'Hong Kong I..."
463125,1228906827691237376,1942312542,2020-02-16 04:59:54+00:00,"[wuhan, coronavirus]",
463126,1228906828760698880,93835628,2020-02-16 04:59:54+00:00,[coronavirus],"{'country': 'Hong Kong', 'state': 'Hong Kong I..."
463127,1228906829461090304,860664355297181696,2020-02-16 04:59:54+00:00,[coronavirus],


### COUNT COVID-RELATED HASHTAGS OVER COUNTRIES
filtering the data content + separating country information for each data instance + counting the number of COVID-related tweets for each country + new dataframe with counts divided into dates

In [5]:
for i, df in enumerate(df_list):
    print(i, ' - ', df.shape)

#df_list[0].head(10)

0  -  (463129, 5)
1  -  (454233, 5)
2  -  (497281, 5)
3  -  (752972, 5)
4  -  (494639, 5)
5  -  (586811, 5)
6  -  (845044, 5)


In [6]:
for i, df in enumerate(df_list):
    location_not_none = df['location'].notnull()
    df_list[i] = df[location_not_none]
    print(i, ' - ', df.shape)

#df_list[0].head(10)

0  -  (463129, 5)
1  -  (454233, 5)
2  -  (497281, 5)
3  -  (752972, 5)
4  -  (494639, 5)
5  -  (586811, 5)
6  -  (845044, 5)


In [7]:
for i, df in enumerate(df_list):
    df_list[i]['country'] = df['location'].apply(lambda x: x['country'])
    print(i, ' - ', df.shape)

#df_list[0].head(10)

0  -  (155270, 6)
1  -  (157225, 6)
2  -  (179835, 6)
3  -  (272787, 6)
4  -  (173622, 6)
5  -  (209851, 6)
6  -  (263832, 6)


In [8]:
counts_df_list = []

for i, df in enumerate(df_list):
    counts_df_list.append(df['country'].value_counts().rename_axis('country').reset_index(name='counts'))
    print(i, ' - ', counts_df_list[i].shape)
    
#counts_df_list[2]

0  -  (178, 2)
1  -  (178, 2)
2  -  (178, 2)
3  -  (180, 2)
4  -  (177, 2)
5  -  (176, 2)
6  -  (180, 2)


In [9]:
counts_df = reduce(lambda left,right: pd.merge(left, right, on='country', how='outer'), counts_df_list)

counts_df.columns = ['country', '2020-02-15', '2020-02-16', '2020-02-17', 
                     '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21']


#df['column name'] = df['column name'].fillna(0).astype(np.int64)
#counts_df.dtypes

counts_df = counts_df.fillna(0).astype({'2020-02-15': np.int64, '2020-02-16': np.int64, 
                                        '2020-02-17': np.int64, '2020-02-18': np.int64,
                                        '2020-02-19': np.int64, '2020-02-20': np.int64,
                                        '2020-02-21': np.int64})

counts_df

Unnamed: 0,country,2020-02-15,2020-02-16,2020-02-17,2020-02-18,2020-02-19,2020-02-20,2020-02-21
0,United States,52144,60264,68712,111692,61063,79299,91301
1,United Kingdom,9822,8953,10266,16386,9688,10860,13590
2,France,7058,9870,8850,11339,6598,9438,10391
3,India,7032,7694,8172,13261,7360,7494,9318
4,Canada,5846,5864,6538,9476,5739,7763,9490
...,...,...,...,...,...,...,...,...
179,Seychelles,0,2,3,2,4,2,2
180,Saint Vincent and the Grenadines,0,1,1,3,1,0,4
181,Netherlands Antilles,0,1,0,4,0,1,2
182,South Georgia and the South Sandwich Islands,0,0,5,3,1,8,7


In [299]:
#SAVE COUNTS DATAFRAME TO CSV
counts_df.to_csv('out/count_of_COVID_hashtags_over_countries_.csv')

### MOST FREQUENT KEYWORDS
listing keywords for each country + finding the most common keyword for each country

In [10]:
df = pd.concat(df_list, ignore_index=True)

print(df.shape)

keywords_df = pd.DataFrame(df.groupby(['country']).keywords.apply(list).reset_index(name='keywords'))

print(keywords_df.shape)

keywords_df

(1412422, 6)
(184, 2)


Unnamed: 0,country,keywords
0,Afghanistan,"[[coronavirus], [coronavirus], [coronavirus], ..."
1,Albania,"[[coronavirus], [coronavirus], [coronavirus], ..."
2,Algeria,"[[coronavirus], [coronavirus], [coronavirus, w..."
3,Andorra,"[[coronavirus], [coronavirus], [coronavirus], ..."
4,Angola,"[[coronavirus], [covid], [coronavirus], [coron..."
...,...,...
179,Venezuela,"[[coronavirus], [coronavirus], [coronavirus], ..."
180,Vietnam,"[[wuhan], [covid], [coronavirus], [covid19, co..."
181,Yemen,"[[coronavirus], [coronavirus], [coronavirus], ..."
182,Zambia,"[[coronavirus], [coronavirus], [wuhan], [coron..."


In [11]:
keywords_list = keywords_df['keywords'].to_list()

keywords_list_flaten = []

for sublist in keywords_list:
    keywords_list_flaten.append(list([y for x in sublist for y in x]))

keywords_modes = [Counter(l).most_common(1)[0][0] for l in keywords_list_flaten]

len(keywords_modes)

184

In [12]:
keywords_df['mode_keyword'] = np.array(keywords_modes)

#botswana_hashtags = keywords_df['keywords'].loc[keywords_df['country'] == 'Botswana'].to_list()
#botswana_hashtags

#keywords_df

mode_keywords_df = keywords_df.drop('keywords', 1)
print(mode_keywords_df['mode_keyword'].value_counts())
mode_keywords_df

coronavirus    182
wuhan            2
Name: mode_keyword, dtype: int64


Unnamed: 0,country,mode_keyword
0,Afghanistan,coronavirus
1,Albania,coronavirus
2,Algeria,coronavirus
3,Andorra,coronavirus
4,Angola,coronavirus
...,...,...
179,Venezuela,coronavirus
180,Vietnam,coronavirus
181,Yemen,coronavirus
182,Zambia,coronavirus


In [29]:
#SAVE MODES DATAFRAME TO CSV
mode_keywords_df.to_csv('out/prefered_hashtags_over_countries.csv')

### OTHER

In [None]:
mode_df = df[['country']]
mode_df = mode_df.drop_duplicates(subset=['country'])
mode_df.sort_values(by=['country'], inplace=True)
mode_df.reset_index(drop=True, inplace=True)
mode_df['mode_keyword'] = np.array(keywords_mode)

mode_df['mode_keyword'].value_counts()
mode_df.head()

In [None]:
df['country'] = df['location'].apply(lambda x: x['country'])
#df1 = pd.DataFrame(df['country'].value_counts())
#df1.columns = ['country', 'occurences']

df1 = df['country'].value_counts().rename_axis('country').reset_index(name='counts')
df1