# Setup

In [None]:
# Standard library
import csv
import math
import os
import pickle
import random
import re
import string

# Utility
import numpy as np
from numpy import array       
from numpy import asarray
from numpy import zeros

# DataFrame
import pandas as pd

# Tweepy
import tweepy

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag

# Fetch the NLTK resources this notebook downloads up front.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
  nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Retrieve Data from Twitter (50,000 tweets each day)

In [None]:
# Twitter API credentials are taken from the environment so that secrets are
# never stored in (or committed with) the notebook. Each value falls back to
# an empty string, which matches the previous placeholder behaviour.
consumerKey = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumerSecret = os.environ.get("TWITTER_CONSUMER_SECRET", "")

accessToken = os.environ.get("TWITTER_ACCESS_TOKEN", "")
accessTokenSecret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

# OAuth 1a handshake: application key/secret first, then the user access token.
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)

# wait_on_rate_limit=True is passed so the client waits instead of failing
# when the rate limit is hit.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Page through the search results for English #COVID19 tweets (extended
# mode, dated before 2020-09-18) and keep at most 50,000 of them.
search_cursor = tweepy.Cursor(api.search,
                              q="#COVID19",
                              lang='en',
                              tweet_mode='extended',
                              until='2020-09-18')

tweets = []
# extend() consumes the cursor item by item, so tweets fetched before an
# interruption stay in the list.
tweets.extend(search_cursor.items(50000))

In [None]:
# For retweets, rebuild full_text from the original (retweeted) status:
# "RT @<author>: <original full text>".
for status in tweets:
  status_fields = vars(status)

  # Only retweets carry a 'retweeted_status' attribute; leave the rest as is.
  if 'retweeted_status' not in status_fields:
    continue

  source_status = status_fields['retweeted_status']
  author_name = source_status.user.screen_name
  original_text = source_status.full_text

  status_fields['full_text'] = "RT @" + author_name + ": " + original_text

In [None]:
# One row per tweet, one column per attribute of the status object.
tweets_df = pd.DataFrame([vars(status) for status in tweets])

In [None]:
# Persist the collected tweets to CSV (the DataFrame index is written too).
# NOTE(review): the concat step reads '<day>_feb_covid19_tweets.csv' files;
# presumably this output is renamed per day by hand -- confirm.
tweets_df.to_csv('covid19_tweets.csv')

# Concatenate the daily February files (15–29 Feb)

In [None]:
# Load one single-column frame per day, 15 to 29 February inclusive.
# usecols=[6] keeps only the column at position 6 of each daily file
# (presumably 'full_text' -- confirm against the CSV header).
list_df = [
  pd.read_csv('{}_feb_covid19_tweets.csv'.format(str(day)), usecols=[6])
  for day in range(15, 30)
]

In [None]:
# Stack the per-day frames into one frame with a fresh 0..n-1 index.
data_df = pd.concat(list_df, ignore_index=True)

# Save (the index is written as an unnamed first column of the CSV)
data_df.to_csv('Feb_Covid_Tweets.csv')

# Preview the data

In [None]:
# Reload the combined file and discard the index column that to_csv wrote
# (it comes back as 'Unnamed: 0').
feb_df = (
  pd.read_csv('Feb_Covid_Tweets.csv')
  .drop(columns=['Unnamed: 0'])
)

In [None]:
# Preview the combined tweets (the notebook renders the cell's last expression).
feb_df

Unnamed: 0,full_text
0,RT @aHEMandias: I wonder if locking people in ...
1,RT @IsChinar: ⚠️normal⚠️\n\nMore infected peop...
2,RT @ezracheungtoto: #BREAKING: South China Uni...
3,RT @The_Mask_For_Pr: 😱😱😱😱😱😱😱😱😱\nSomebody is te...
4,RT @The_Mask_For_Pr: 😱😱😱😱😱😱😱😱😱\nSomebody is te...
...,...
663263,RT @FreeeIran: Appalling situation: \n\nPrison...
663264,"RT @SamRo: ""The average American does not need..."
663265,RT @Laurie_Garrett: This is priceless -- how t...
663266,RT @Cara_TXZEAL: MSC Meraviglia cruise ship cl...


# Setup training dataset

In [None]:
def get_only_cases_and_deaths(feb_df):
  """Keep only the tweets that mention a case- or death-related keyword.

  A tweet matches when its lower-cased text, split on single spaces,
  contains a word that is exactly one of the keywords below. Words glued to
  punctuation or separated by newlines therefore do not match.

  Args:
    feb_df: DataFrame with a 'full_text' column. Entries that are not
      strings (e.g. NaN from empty CSV cells) are skipped.

  Returns:
    A new DataFrame with the single column 'full_text', holding the original
    (not lower-cased) text of every matching tweet in input order, indexed
    0..n-1.
  """
  cases = ['case', 'cases', 'infect', 'infected']
  deaths = ['dead', 'deads', 'death', 'deaths', 'die', 'died']
  keywords = frozenset(cases + deaths)

  matches = []
  # Iterate over the values, not index labels, so any index works.
  for full_text in feb_df['full_text']:
    if not isinstance(full_text, str):
      continue

    if not keywords.isdisjoint(full_text.lower().split(' ')):
      matches.append(full_text)

  # Build the frame once: DataFrame.append was removed in pandas 2.0, and
  # appending row by row copies the whole frame each time (quadratic).
  return pd.DataFrame({'full_text': matches}, columns=['full_text'])

In [None]:
# Narrow feb_df to tweets mentioning cases/deaths. This rebinds feb_df, so
# the unfiltered frame is only recoverable by re-running the load cell.
feb_df = get_only_cases_and_deaths(feb_df)

In [None]:
# Preview the filtered tweets.
feb_df

Unnamed: 0,full_text
0,RT @IsChinar: ⚠️normal⚠️\n\nMore infected peop...
1,Further Stupidity #COVID19 :\n#Japan goes ahe...
2,140 people died from the #CoronaVirus today.\n...
3,"RT @PDChina: A 67-day-old baby, infected with ..."
4,UPDATE: Coronavirus cases have almost reached ...
...,...
157618,RT @tanja819: We estimated 4000-19000 #COVID19...
157619,RT @QuickTake: BREAKING: The U.S. reported its...
157620,RT @CPHO_Canada: Travelling abroad with the fa...
157621,RT @DrEricDing: This first death means that it...


In [None]:
# prepare to list: drop duplicate tweets, keeping first-seen order.
# dict.fromkeys preserves insertion order, whereas set() iterates strings in
# hash order, which changes between interpreter runs and would make the row
# order of the labelling CSV irreproducible.
feb_list = list(dict.fromkeys(feb_df['full_text'].tolist()))

# tokenize (commas are stripped first so they never become tokens)
tokenizer = TweetTokenizer()
feb_list = [tokenizer.tokenize(str(x).replace(',', '')) for x in feb_list]

In [None]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Strip Twitter noise from a tokenised tweet and lower-case what is left.

    From each token this removes URLs, the stand-alone retweet marker "RT",
    @mentions, #hashtags, curly quotes, em dashes and emoji in the listed
    Unicode ranges. A token is then dropped if it is empty, is a substring of
    string.punctuation, or its lower-cased form is a stop word.

    Args:
        tweet_tokens: iterable of string tokens for one tweet.
        stop_words: iterable of lower-case words to discard (default: none).

    Returns:
        A new list with the surviving tokens, lower-cased, in original order.
    """
    cleaned_tokens = []

    # Set membership is O(1); callers typically pass a list of stop words.
    stop_words = frozenset(stop_words)

    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    for token in tweet_tokens:
        # Raw strings keep the regex backslashes without relying on invalid
        # string escape sequences such as "\(".
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        # Only remove "RT" as a whole word; a bare 'RT' pattern also mangled
        # words that merely contain it (e.g. "ALERT" -> "ALE").
        token = re.sub(r'\bRT\b', '', token)
        token = re.sub(r'(@[A-Za-z0-9_]+)', '', token)
        token = re.sub(r'(#[A-Za-z0-9_]+)', '', token)

        token = re.sub('“', '', token)
        token = re.sub('”', '', token)
        token = re.sub('’', '', token)
        token = re.sub('—', '', token)

        token = emoji_pattern.sub(r'', token)

        # The emptiness check comes first because '' is a substring of every
        # string and would otherwise be caught only by the punctuation test.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
def cleaning(month_list):
  """Clean every tokenised tweet in month_list in place.

  Each entry is replaced by remove_noise(entry, english stop words).
  Nothing is returned.
  """
  english_stop_words = stopwords.words('english')
  for position, tokens in enumerate(month_list):
    month_list[position] = remove_noise(tokens, english_stop_words)
    
def add_end_token(data_list):
  """Append the '<end>' marker to every token list in data_list, in place."""
  for tokens in data_list:
    tokens.append('<end>')

In [None]:
# cleaning each token (mutates feb_list in place)
cleaning(feb_list)

# add <end> to the end of sentence (also in place; re-running this cell
# appends a second '<end>' to every tweet)
add_end_token(feb_list)

In [None]:
# Number of unique tweets left after filtering and cleaning.
len(feb_list)

32827

In [None]:
# initial tag default value
def set_default_tag(string_list):
  """Return a list of zeros with one entry per element of string_list."""
  return [0 for _ in range(len(string_list))]

In [None]:
# One all-zero tag row per tweet, the same length as its token list.
tag_list = [set_default_tag(tokens) for tokens in feb_list]

In [None]:
# Token table: one row per tweet, one column per token position. Shorter
# tweets are padded with missing values on the right.
df = pd.DataFrame(feb_list)

In [None]:
# Preview the token table.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
0,1500,died,since,outbreak,covid,19,coronavirus,. .\n.,wonder,measures,people,taken,prevent,getting,infected,.\n.\n.,face,masks,really,work,think,…,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,already,wanted,twitter,current,cultural-political,diversities,bad,far,handling,concerned,religious,differences,hometown,case,measles,cancer,leukemia,bad,every,group,hit,successively,hardly,learned,f,one,another,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,confirmed,case,supporting,drc,effective,preparedness,activities,prevent,outbreak,country,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,breaking,norway,reports,first,case,ー19,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,end,big,drop,new,cases,today,400,new,confirmed,ones,rate,peaked,4-5,feb,deaths,peaked,10,days,later,14-15,feb,see,graph,china,well,way,fully,recover,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32822,preprint,indicates,caucasians,prone,asians,infected,coronavirus,significant,disparities,ace,2,gene,expression,found,racial,groups,asian,vs,caucasian,however,smokers,may,risk,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32823,drawing,conclusions,know,binds,ace,2,leading,infection,know,certain,populations,gene,variants,greater,ace,2,expression,conclusive,cases,studies,proving,infection,rates,greater,populations,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32824,wonder,united,states,cases,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32825,cdc,director,dr,robert,redfield,aggressively,evaluating,cases,potential,community,transmission,ca,wa,<end>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Tag table built from tag_list, laid out like the token table: one row per
# tweet, one column per token position.
df_2 = pd.DataFrame(tag_list)

In [None]:
# Preview the default (all-zero) tag table.
df_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32822,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32823,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32824,0,0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32825,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Save to csv and do manual labelling

In [None]:
# Export tokens and default tags as two CSVs without the index column,
# ready to be labelled by hand.
df.to_csv('feb_text_labelling.csv', index=False)
df_2.to_csv('tag_labelling.csv', index=False)

# After finishing manual labelling

## Clean the data

In [None]:
# Tokens are read as plain strings, and only empty cells count as missing.
# With the default parsing, tokens such as "nan", "null" or "n/a" would be
# turned into NaN (and later dropped, misaligning tokens and tags), and
# all-numeric columns would be read as floats ("19" -> "19.0").
train_df = pd.read_csv('feb_text_labelled.csv', low_memory=False,
                       dtype=str, keep_default_na=False, na_values=[''])
# Tags are numeric labels, so the default parsing is kept.
tag_df = pd.read_csv('tag_labelled.csv', low_memory=False)

In [None]:
# Preview the labelled token table as loaded from CSV.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32698,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32699,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32700,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32701,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Preview the labelled tag table as loaded from CSV.
tag_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32698,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32699,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32700,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32701,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Remove NaN from the data

In [None]:
def remove_nan_train_list(df_list):
  """Drop NaN padding from rows of tokens and stringify what remains.

  Args:
    df_list: list of rows (lists); each cell is a token or a NaN placeholder.

  Returns:
    A new list of rows in which every NaN cell has been removed and every
    other cell converted with str(). Rows that end up empty are omitted.
  """
  new_list = []
  for sent in df_list:
    new_sent = []
    for word in sent:
      try:
        is_missing = math.isnan(word)
      except (TypeError, ValueError, OverflowError):
        # math.isnan rejects non-numeric cells (e.g. str tokens); those are
        # real tokens, not padding.
        is_missing = False

      # if word is NaN do nothing; numbers and strings are both kept as str
      if not is_missing:
        new_sent.append(str(word))

    # if new list has member (not empty)
    if new_sent:
      new_list.append(new_sent)

  return new_list

def remove_nan_tag_list(df_list):
  """Drop NaN padding from rows of tags and coerce what remains to int.

  Args:
    df_list: list of rows (lists); each cell is a tag or a NaN placeholder.

  Returns:
    A new list of rows in which every NaN cell has been removed and every
    other cell converted with int(). Cells that cannot be converted become
    0. Rows that end up empty are omitted.
  """
  new_list = []
  for sent in df_list:
    new_sent = []
    for word in sent:
      try:
        # if word is NaN do nothing
        if math.isnan(word):
          continue
      except (TypeError, ValueError, OverflowError):
        # Non-numeric cell (e.g. a str): not padding, fall through to int().
        pass

      try:
        new_sent.append(int(word))
      except (TypeError, ValueError, OverflowError):
        # Unparseable label: fall back to the default tag.
        new_sent.append(0)

    # if new list has member (not empty)
    if new_sent:
      new_list.append(new_sent)

  return new_list

In [None]:
# Convert both frames to row lists and strip the NaN padding.
# Note: this rebinds tag_list, replacing the all-zero default tags built
# earlier with the hand-labelled ones.
train_list = remove_nan_train_list(train_df.values.tolist())
tag_list = remove_nan_tag_list(tag_df.values.tolist())

In [None]:
# Sanity check: the number of labelled sentences and of tag rows
# (expected to be equal).
print(len(train_list))
print(len(tag_list))

700
700


## Save the dataset

In [None]:
# Save both cleaned lists as pickles using the highest available protocol.
pickle_targets = (
    ('train_list_(cleaned).pickle', train_list),
    ('tag_list_(cleaned).pickle', tag_list),
)

for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)