## Testing notebook playground for extracting data

In [1]:
import pandas as pd
import mysql.connector as msc
import json
import tweepy
import csv
import re
import string
import requests
import pprint
import datetime
from dateutil.rrule import rrule, DAILY
from bs4 import BeautifulSoup

In [2]:
# read config file into JSON object
configFile = open('config.json')
config = json.load(configFile)

# make connection to the GCloud database
db_config = config['database']
connection = msc.connect(
    host=db_config['host'], 
    port=db_config['port'],
    user=db_config['user'],
    password=db_config['password'],
    database=db_config['database']
)

# authorize Twitter app with Tweepy OAuthHandler
twit_config = config['twitter']
auth = tweepy.OAuthHandler(twit_config['api_key'], twit_config['api_key_secret'])
auth.set_access_token(twit_config['access_token'], twit_config['access_token_secret'])
twitapi = tweepy.API(auth, wait_on_rate_limit=True)

In [3]:
# extract Tweets from Malaysian MOH about Malaysian COVID-19 case data
# write to pandas database
tweet_collection = twitapi.search("KKMPutrajaya")

In [4]:
# Get user id of @KKMPutrajaya
url = "https://api.twitter.com"
headers = {"Authorization": "Bearer {}".format(twit_config['bearer_token'])}
response = requests.request("GET", url + "/2/users/by/username/KKMPutrajaya", headers=headers)
user_id = response.json()['data']['id']

In [5]:
# initialize pandas data set
tweets = []
next_tokens = []

In [6]:
# get posts from Malaysian MOH (@KKMPutrajaya)
search_url = (url + "/2/users/{}/tweets".format(user_id))
response = requests.request("GET", search_url, headers=headers)
data = response.json()['data']

In [14]:
# fetch tweet data
# this fetches thousands of MOH's tweets, which are factored into the quota for my 
# Twitter account, so beware!
def fetch_data():

    # get posts from Malaysian MOH (@KKMPutrajaya)
    search_url = (url + "/2/users/{}/tweets".format(user_id))
    response = requests.request("GET", search_url, headers=headers)
    data = response.json()['data']

    # cycle and add to data
    for tweet in data:
        if "Status Terkini #COVID19" in tweet['text']:
            tweets.append(tweet)
    # get next pagination token
    next_token = response.json()['meta'].get('next_token')
    next_tokens.append(next_token)
    # while next token is not None (ie there is still a next page)
    while next_token is not None:
        # get next page
        search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, next_token))
        response = requests.request("GET", search_url, headers=headers)
        data = response.json()['data']
        next_token = response.json()['meta'].get('next_token')
        # cycle through the next page
        for tweet in data:
            if "Status Terkini #COVID19" in tweet['text']:
                tweets.append(tweet)
                print(tweet)
                next_tokens.append(next_token)

In [23]:
# fetch_data()

In [24]:
tweetsdf = pd.DataFrame(tweets)
tweetsdf = tweetsdf[0:101]

In [25]:
next_tokensdf = pd.DataFrame(next_tokens)
next_tokensdf = next_tokensdf[0:102]

In [27]:
tweetsdf['text']

0      Status Terkini #COVID19, 22 Jun 2021 \nKes sem...
1      Status Terkini #COVID19, 21 Jun 2021 \nKes sem...
2      Status Terkini #COVID19, 20 Jun 2021 \nKes sem...
3      Status Terkini #COVID19, 19 Jun 2021 \nKes sem...
4      Status Terkini #COVID19, 18 Jun 2021 \nKes sem...
                             ...                        
96     Status Terkini #COVID19, 17 Mac 2021 \n\nKes s...
97     Status Terkini #COVID19, 16 Mac 2021 \n\nKes s...
98     Status Terkini #COVID19, 15 Mac 2021 \n\nKes s...
99     Status Terkini #COVID19, 14 Mac 2021 \n\nKes s...
100    Status Terkini #COVID19, 13 Mac 2021 \n\nKes s...
Name: text, Length: 101, dtype: object

In [46]:
tweetsdf['text'][0]

'Status Terkini #COVID19, 22 Jun 2021 \nKes sembuh= 5,557\nJumlah kes sembuh= 639,181\nKes baharu positif= 4,743 (2 import) \nJumlah positif= 705,762\nKes kematian= 77\nJumlah kes kematian= 4,554\nKes dirawat di ICU= 875\nBantuan alat pernafasan= 445 https://t.co/MZHf51NU0C'

In [47]:
tweetsdf['text'][100]

'Status Terkini #COVID19, 13 Mac 2021 \n\nKes sembuh=1,830\nJumlah kes sembuh=304,492 kes \nKes baharu positif=1,470 kes (1,458 tempatan, 12 import) \nJumlah positif=322,409\nKes kematian=3\nJumlah kes kematian=1,206 kes \nKes dirawat di ICU=162 kes \nBantuan Alat Pernafasan=70 kes https://t.co/S6kwwkmLZw'

In [20]:
search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, "7140dibdnow9c7btw3w3xuo59eonz5ff1wirbd9w1h7ao"))
response = requests.request("GET", search_url, headers=headers)
response.json()

{'meta': {'result_count': 0,
  'previous_token': '77qpymm88g5h9vqklulfea6i0etntp4c28y39657vvi9l'}}

In [14]:
# function to cycle through pages automatically
print(next_token)
def goForward():
    global next_token
    if next_token is None:
        raise Exception("no next page")
    else:
        search_url = (url + "/2/users/{}/tweets?end_time={}&pagination_token={}".format(user_id, "2021-03-11T12:00:00Z", next_token))
        response = requests.request("GET", search_url, headers=headers)
        next_token = response.json()['meta'].get('next_token')
        print(response.json()['data'])

NameError: name 'next_token' is not defined

In [28]:
example_tweet = tweetsdf.iloc[0, :]['text']

In [79]:
# function to convert python date to DD-MMM-YYYY, where "MMM" is the month in malay
month_dict = [
    'januari',
    'februari',
    'mac',
    'april',
    'mei',
    'jun',
    'julai',
    'ogos',
    'september',
    'oktober',
    'november',
    'disember'
]

# month is in Malay (like the above)
def parseDate(day, month, year):
    return datetime.date(int(year), int(list(map(lambda m: m.title(), month_dict)).index(month))+1, int(day))

def convertDate(date):
    return date.strftime("%d-{}-%Y".format(month_dict[int(date.strftime("%m"))-1]).title())

In [80]:
parseDate("15", "April", "2021")

datetime.date(2021, 4, 15)

In [191]:
# parser for tweet content
def parseTweetContent(tweet):
    # print(tweet)
    # get tweet content without \n and 
    # explode it via spaces
    exploded_tweet = [x for x in re.split('\n| |=', tweet.replace('\n\n', '\n')) if x]
    # print(exploded_tweet)
    
    # get date as "YYYY MMM DD", where the month is in Malay
    date = parseDate(exploded_tweet[3], exploded_tweet[4], exploded_tweet[5])
    # print(date)

    try:

        # get other data
        # the numbers represent the indexes at which the relevant values are in
        # the exploded tweet
        data_dict = {
            "cured_cases": 8,
            "total_cured_cases": 12,
            "new_cases": 16,
            "total_new_cases": 23 if date <= datetime.date(2021, 6, 9) else 21,
            "deaths": 26 if date <= datetime.date(2021, 6, 9) else 24,
            "total_deaths": 30 if date <= datetime.date(2021, 6, 9) else 28,
            "icu_cases": 35 if date <= datetime.date(2021, 6, 9) else 33,
            "resp_cases": 39 if date <= datetime.date(2021, 6, 9) else 37
        }
        
        # parse data
        result = {name: int(exploded_tweet[value].replace(',', '')) for name, value in data_dict.items()} 

        # add date
        result['date'] = date.strftime('%d %m %Y')
        # return result
        return result
    except BaseException as e:
        print(e)
        print('cannot parse tweet at date {}'.format(date))
        return {}

In [192]:
# example tweet content
parseTweetContent(tweetsdf['text'][100])

invalid literal for int() with base 10: 'positif'
cannot parse tweet at date 2021-03-13


{}

In [211]:
# store data into csv
parsed_data = list(map(parseTweetContent, list(tweetsdf['text'])))
# parsed_data += ({
#     "cured_cases": 1830,
#     "total_cured_cases": 304492, 
#     "new_cases": 1470,
#     "total_new_cases": 322409, 
#     "deaths": 3,
#     "total_deaths": 1206, 
#     "icu_cases": 162,
#     "resp_cases": 70,
#     "date": "13 03 2021"
# })

invalid literal for int() with base 10: 'positif'
cannot parse tweet at date 2021-03-13


In [213]:
df = pd.DataFrame(parsed_data)
df.to_csv("covid_data.csv")

In [32]:
# initialize dataset
dataset = []
# initialize variable to text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    'number_of_clusters': 'Jumlah kluster',
    'number_of_new_clusters': 'Jumlah kluster baharu',
    'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    'number_of_active_clusters': 'Jumlah kluster aktif'
}

### Testing out web scraper for kpkesihatan.com instead to fetch data

In [20]:
[convertDate(datetime.datetime(2021, i, 21)) for i in range(1, 13)]

['21-januari-2021',
 '21-februari-2021',
 '21-mac-2021',
 '21-april-2021',
 '21-mei-2021',
 '21-jun-2021',
 '21-julai-2021',
 '21-ogos-2021',
 '21-september-2021',
 '21-oktober-2021',
 '21-november-2021',
 '21-disember-2021']

In [21]:
# can do formatting on this later
def makeURL(date):
    return "https://kpkesihatan.com/{}/kenyataan-akhbar-kpk-{}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/".format(
        date.strftime("%d/%m/%Y"), 
        convertDate(date)
    )

In [22]:
makeURL(datetime.datetime(2021, 6, 21))

'https://kpkesihatan.com/21/06/2021/kenyataan-akhbar-kpk-21-jun-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/'

In [31]:
# Create soup specifically for the response content
soup_cases = BeautifulSoup(str(result[1]), 'html.parser')
# Create variable to (Malay) text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    'number_of_clusters': 'Jumlah kluster',
    'number_of_new_clusters': 'Jumlah kluster baharu',
    'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    'number_of_active_clusters': 'Jumlah kluster aktif'
}

IndexError: list index out of range

In [192]:
# Fetch data into dict object
day_data = {}
for var in vtt_map:
    # Get the string containing the key words
    keyword_string = str(soup_cases.find(lambda tag: tag.name == "li" and vtt_map[var] in tag.text))
    # Get the soup for the specific string
    soup_var = BeautifulSoup(keyword_string, 'html.parser')
    # use the custom soup to extract the case data, which is stored into the day_data object
    content = soup_var.find(lambda tag: tag.name == "strong").contents[0]
    # parse the content to only extract the integer value for the corresponding variable
    for s in ["\xa0", "kes", "kluster", " ", ","]:
        content = content.replace(s, "")
    # convert value to int
    day_data[var] = int(content)

In [193]:
day_data

{'cured_cases': 5941,
 'new_cases': 5293,
 'import_cases': 7,
 'local_cases': 5286,
 'active_cases': 63815,
 'resp_asst_cases': 880,
 'death_cases': 60,
 'number_of_clusters': 2623,
 'number_of_new_clusters': 19,
 'number_of_expired_clusters': 1789,
 'number_of_active_clusters': 834}

In [50]:
# function to get data from website by date
def get_data(date):
    # make url to website for date
    url = makeURL(date)

    # get content of website
    response = requests.get(url)

    # filter for the case data
    # create soup object
    soup = BeautifulSoup(response.content, 'html.parser')
    # get all bulleted lists
    result = soup.find_all('ul')
    # the case data will be the second element
    cases_content = str(result[1])

    # get soup for cases
    soup_cases = BeautifulSoup(cases_content, 'html.parser')

    # store the data into the result object
    day_data = {}
    for var in vtt_map:
        # Get the string containing the key words
        keyword_string = str(soup_cases.find(lambda tag: tag.name == "li" and vtt_map[var] in tag.text))
        # Get the soup for the specific string
        # soup_var = BeautifulSoup(keyword_string, 'html.parser')
        # use the custom soup to extract the case data, which is stored into the day_data object
        # content = soup_var.find(lambda tag: tag.name == "strong")
        # parse the content to only extract the integer value for the corresponding variable
        # for s in ["\xa0", "kes", "kluster", " ", ",", ";"]:
        #     content = content.replace(s, "")
        # convert value to int
        day_data[var] = keyword_string

    # put date into data
    day_data['date'] = date.strftime("%d/%m/%Y")

    # return result when done
    return day_data

In [54]:
get_data(datetime.date(2021, 4, 21))

{'cured_cases': '<li>Kes sembuh : <strong>1,910 kes </strong>(358,726 kes kumulatif, 94.0%);</li>',
 'new_cases': '<li>Kes baharu :<strong> 2,340 kes</strong> (381,813 kes kumulatif);</li>',
 'import_cases': '<li>Kes import : <strong>12 kes </strong>(1 warganegara, 11 bukan warganegara);<strong></strong></li>',
 'active_cases': '<li>Kes aktif :<strong> 21,687 kes;</strong></li>',
 'resp_asst_cases': '<li>Kes yang memerlukan rawatan di Unit Rawatan Rapi (ICU) :<strong> 248 kes</strong>;</li>',
 'death_cases': '<li>Kes kematian :<strong> 11 kes </strong>(1,400 kes kumulatif, 0.37%; 9 warganegara; 2 bukan warganegara).</li>',
 'number_of_clusters': 'None',
 'number_of_new_clusters': 'None',
 'number_of_expired_clusters': 'None',
 'number_of_active_clusters': 'None',
 'date': '21/04/2021'}

In [60]:
print("hello")
# iterate over Jan 2021 (for now)
# start_date = datetime.date(2021, 1, 1)
# end_date = datetime.date(2021, 6, 21)
# for dt in rrule(DAILY, dtstart=start_date, until=end_date)
#     try {
#         dataset.append(get_data(dt))
#         print("Data retrieved for the date {}".format(dt.strftime("%d/%m/%Y")))
#     } catch (e) {
#         print("Unable to retrieve data for the date {}".format(dt.strftime("%d/%m/%Y"))
#     }

{'cured_cases': '<li>Kes sembuh : <strong>1,346 kes </strong>(310,958 kes kumulatif, 95.0%);</li>',
 'new_cases': '<li>Kes baharu : <strong>1,219 kes</strong> (327,253 kes kumulatif);</li>',
 'import_cases': '<li>Kes import : <strong>7</strong><strong> </strong><strong>kes </strong>(1 warganegara; 6 bukan warganegara)<strong></strong></li>',
 'active_cases': '<li>Kes aktif :<strong> 15,075 kes;</strong></li>',
 'resp_asst_cases': '<li>Kes yang memerlukan rawatan di Unit Rawatan Rapi (ICU) :<strong> 154 kes</strong>;</li>',
 'death_cases': '<li>Kes kematian :<strong> 2 kes </strong>(1,220 kes kumulatif, 0.37%; 2 warganegara).</li>',
 'number_of_clusters': 'None',
 'number_of_new_clusters': 'None',
 'number_of_expired_clusters': 'None',
 'number_of_active_clusters': 'None',
 'date': '17/03/2021'}

In [66]:
get_data(datetime.date(2021, 3, 17))

Data retrieved for the date 01/03/2021
Data retrieved for the date 02/03/2021
Data retrieved for the date 03/03/2021
Data retrieved for the date 04/03/2021
Data retrieved for the date 05/03/2021
Data retrieved for the date 06/03/2021
Data retrieved for the date 07/03/2021
Data retrieved for the date 08/03/2021
Data retrieved for the date 09/03/2021
Data retrieved for the date 10/03/2021
Data retrieved for the date 11/03/2021
Data retrieved for the date 12/03/2021
Data retrieved for the date 13/03/2021
Data retrieved for the date 14/03/2021
Data retrieved for the date 15/03/2021
Data retrieved for the date 16/03/2021
Data retrieved for the date 17/03/2021
Data retrieved for the date 18/03/2021
Data retrieved for the date 19/03/2021
Data retrieved for the date 20/03/2021
Data retrieved for the date 21/03/2021
Data retrieved for the date 22/03/2021
Data retrieved for the date 23/03/2021
Data retrieved for the date 24/03/2021
Data retrieved for the date 25/03/2021
Data retrieved for the da

In [67]:
# insert data from march onwards into dataset
dataset = []
start_date = datetime.date(2021, 3, 1)
end_date = datetime.date.today()

for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    try:
        dataset.append(get_data(dt))
        print("Data retrieved for the date {}".format(dt.strftime("%d/%m/%Y")))
    except:
        print("Unable to retrieve data for the date {}".format(dt.strftime("%d/%m/%Y")))

In [None]:
# export data to csv
df = pd.DataFrame(dataset)
df.to_csv(r'covid_data.csv')