## Testing notebook playground for extracting data

In [3]:
import pandas as pd
import mysql.connector as msc
import json
import tweepy
import csv
import re
import string
import requests
import pprint
import datetime
from dateutil.rrule import rrule, DAILY
from bs4 import BeautifulSoup

In [4]:
# read config file into JSON object
configFile = open('config.json')
config = json.load(configFile)

# make connection to the GCloud database
db_config = config['database']
connection = msc.connect(
    host=db_config['host'], 
    port=db_config['port'],
    user=db_config['user'],
    password=db_config['password'],
    database=db_config['database']
)

# authorize Twitter app with Tweepy OAuthHandler
twit_config = config['twitter']
auth = tweepy.OAuthHandler(twit_config['api_key'], twit_config['api_key_secret'])
auth.set_access_token(twit_config['access_token'], twit_config['access_token_secret'])
twitapi = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
# extract Tweets from Malaysian MOH about Malaysian COVID-19 case data
# write to pandas database
tweet_collection = twitapi.search("KKMPutrajaya")

In [6]:
# Get user id of @KKMPutrajaya
url = "https://api.twitter.com"
headers = {"Authorization": "Bearer {}".format(twit_config['bearer_token'])}
response = requests.request("GET", url + "/2/users/by/username/KKMPutrajaya", headers=headers)
user_id = response.json()['data']['id']

In [7]:
# initialize pandas data set
tweets = []
next_tokens = []

In [8]:
# get posts from Malaysian MOH (@KKMPutrajaya)
search_url = (url + "/2/users/{}/tweets".format(user_id))
response = requests.request("GET", search_url, headers=headers)
data = response.json()['data']

In [9]:
# fetch tweet data
# this fetches thousands of MOH's tweets, which are factored into the quota for my 
# Twitter account, so beware!
def fetch_data():
    # get posts from Malaysian MOH (@KKMPutrajaya)
    search_url = (url + "/2/users/{}/tweets".format(user_id))
    response = requests.request("GET", search_url, headers=headers)
    data = response.json()['data']

    # cycle and add to data
    for tweet in data:
        if "Status Terkini #COVID19" in tweet['text']:
            tweets.append(tweet)
    # get next pagination token
    next_token = response.json()['meta'].get('next_token')
    next_tokens.append(next_token)
    # while next token is not None (ie there is still a next page)
    while next_token is not None:
        # get next page
        search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, next_token))
        response = requests.request("GET", search_url, headers=headers)
        data = response.json()['data']
        next_token = response.json()['meta'].get('next_token')
        # cycle through the next page
        for tweet in data:
            if "Status Terkini #COVID19" in tweet['text']:
                tweets.append(tweet)
                next_tokens.append(next_token)

In [10]:
tweetsdf = pd.DataFrame(tweets)

In [11]:
next_tokensdf = pd.DataFrame(next_tokens)

In [12]:
search_url = (url + "/2/users/{}/tweets?end_time={}".format(user_id, "2021-03-11T12:00:00Z"))
response = requests.request("GET", search_url, headers=headers)

In [14]:
# function to cycle through pages automatically
print(next_token)
def goForward():
    global next_token
    if next_token is None:
        raise Exception("no next page")
    else:
        search_url = (url + "/2/users/{}/tweets?end_time={}&pagination_token={}".format(user_id, "2021-03-11T12:00:00Z", next_token))
        response = requests.request("GET", search_url, headers=headers)
        next_token = response.json()['meta'].get('next_token')
        print(response.json()['data'])

NameError: name 'next_token' is not defined

In [18]:
example_tweet = tweetsdf.iloc[0, :]['text']

IndexError: single positional indexer is out-of-bounds

In [110]:
# parser for tweet content
def parseTweetContent(tweet):
    # get tweet content without \n and 
    # explode it via spaces
    exploded_tweet = re.split('\n| ', tweet)
    
    # get date as "DD MMM YYYY", where the month is in Malay
    date = "{} {} {}".format(exploded_tweet[3], exploded_tweet[4], exploded_tweet[5])

    

In [108]:
# example tweet content
parseTweetContent(example_tweet)

['Status',
 'Terkini',
 '#COVID19,',
 '21',
 'Jun',
 '2021',
 '',
 'Kes',
 'sembuh=',
 '5,439',
 'Jumlah',
 'kes',
 'sembuh=',
 '633,624',
 'Kes',
 'baharu',
 'positif=',
 '4,611',
 '(8',
 'import)',
 '',
 'Jumlah',
 'positif=',
 '701,019',
 'Kes',
 'kematian=',
 '69',
 'Jumlah',
 'kes',
 'kematian=',
 '4,477',
 'Kes',
 'dirawat',
 'di',
 'ICU=',
 '880',
 'Bantuan',
 'alat',
 'pernafasan=',
 '452',
 'https://t.co/zUQch1MbEs']

### Testing out web scraper for kpkesihatan.com instead to fetch data

In [19]:
# function to convert python date to DD-MMM-YYYY, where "MMM" is the month in malay
month_dict = [
    'januari',
    'februari',
    'mac',
    'april',
    'mei',
    'jun',
    'julai',
    'ogos',
    'september',
    'oktober',
    'november',
    'disember'
]
def convertDate(date):
    return date.strftime("%d-{}-%Y".format(month_dict[int(date.strftime("%m"))-1]))

In [20]:
[convertDate(datetime.datetime(2021, i, 21)) for i in range(1, 13)]

['21-januari-2021',
 '21-februari-2021',
 '21-mac-2021',
 '21-april-2021',
 '21-mei-2021',
 '21-jun-2021',
 '21-julai-2021',
 '21-ogos-2021',
 '21-september-2021',
 '21-oktober-2021',
 '21-november-2021',
 '21-disember-2021']

In [21]:
# can do formatting on this later
def makeURL(date):
    return "https://kpkesihatan.com/{}/kenyataan-akhbar-kpk-{}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/".format(
        date.strftime("%d/%m/%Y"), 
        convertDate(date)
    )

In [22]:
makeURL(datetime.datetime(2021, 6, 21))

'https://kpkesihatan.com/21/06/2021/kenyataan-akhbar-kpk-21-jun-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/'

In [31]:
# Create soup specifically for the response content
soup_cases = BeautifulSoup(str(result[1]), 'html.parser')
# Create variable to (Malay) text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    'number_of_clusters': 'Jumlah kluster',
    'number_of_new_clusters': 'Jumlah kluster baharu',
    'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    'number_of_active_clusters': 'Jumlah kluster aktif'
}

IndexError: list index out of range

In [192]:
# Fetch data into dict object
day_data = {}
for var in vtt_map:
    # Get the string containing the key words
    keyword_string = str(soup_cases.find(lambda tag: tag.name == "li" and vtt_map[var] in tag.text))
    # Get the soup for the specific string
    soup_var = BeautifulSoup(keyword_string, 'html.parser')
    # use the custom soup to extract the case data, which is stored into the day_data object
    content = soup_var.find(lambda tag: tag.name == "strong").contents[0]
    # parse the content to only extract the integer value for the corresponding variable
    for s in ["\xa0", "kes", "kluster", " ", ","]:
        content = content.replace(s, "")
    # convert value to int
    day_data[var] = int(content)

In [193]:
day_data

{'cured_cases': 5941,
 'new_cases': 5293,
 'import_cases': 7,
 'local_cases': 5286,
 'active_cases': 63815,
 'resp_asst_cases': 880,
 'death_cases': 60,
 'number_of_clusters': 2623,
 'number_of_new_clusters': 19,
 'number_of_expired_clusters': 1789,
 'number_of_active_clusters': 834}

In [32]:
# initialize dataset
dataset = []
# initialize variable to text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    'number_of_clusters': 'Jumlah kluster',
    'number_of_new_clusters': 'Jumlah kluster baharu',
    'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    'number_of_active_clusters': 'Jumlah kluster aktif'
}

In [50]:
# function to get data from website by date
def get_data(date):
    # make url to website for date
    url = makeURL(date)

    # get content of website
    response = requests.get(url)

    # filter for the case data
    # create soup object
    soup = BeautifulSoup(response.content, 'html.parser')
    # get all bulleted lists
    result = soup.find_all('ul')
    # the case data will be the second element
    cases_content = str(result[1])

    # get soup for cases
    soup_cases = BeautifulSoup(cases_content, 'html.parser')

    # store the data into the result object
    day_data = {}
    for var in vtt_map:
        # Get the string containing the key words
        keyword_string = str(soup_cases.find(lambda tag: tag.name == "li" and vtt_map[var] in tag.text))
        # Get the soup for the specific string
        # soup_var = BeautifulSoup(keyword_string, 'html.parser')
        # use the custom soup to extract the case data, which is stored into the day_data object
        # content = soup_var.find(lambda tag: tag.name == "strong")
        # parse the content to only extract the integer value for the corresponding variable
        # for s in ["\xa0", "kes", "kluster", " ", ",", ";"]:
        #     content = content.replace(s, "")
        # convert value to int
        day_data[var] = keyword_string

    # put date into data
    day_data['date'] = date.strftime("%d/%m/%Y")

    # return result when done
    return day_data

In [54]:
get_data(datetime.date(2021, 4, 21))

{'cured_cases': '<li>Kes sembuh : <strong>1,910 kes </strong>(358,726 kes kumulatif, 94.0%);</li>',
 'new_cases': '<li>Kes baharu :<strong> 2,340 kes</strong> (381,813 kes kumulatif);</li>',
 'import_cases': '<li>Kes import : <strong>12 kes </strong>(1 warganegara, 11 bukan warganegara);<strong></strong></li>',
 'active_cases': '<li>Kes aktif :<strong> 21,687 kes;</strong></li>',
 'resp_asst_cases': '<li>Kes yang memerlukan rawatan di Unit Rawatan Rapi (ICU) :<strong> 248 kes</strong>;</li>',
 'death_cases': '<li>Kes kematian :<strong> 11 kes </strong>(1,400 kes kumulatif, 0.37%; 9 warganegara; 2 bukan warganegara).</li>',
 'number_of_clusters': 'None',
 'number_of_new_clusters': 'None',
 'number_of_expired_clusters': 'None',
 'number_of_active_clusters': 'None',
 'date': '21/04/2021'}

In [59]:
print("hello")
# iterate over Jan 2021 (for now)
# start_date = datetime.date(2021, 1, 1)
# end_date = datetime.date(2021, 6, 21)
# for dt in rrule(DAILY, dtstart=start_date, until=end_date)
#     try {
#         dataset.append(get_data(dt))
#         print("Data retrieved for the date {}".format(dt.strftime("%d/%m/%Y")))
#     } catch (e) {
#         print("Unable to retrieve data for the date {}".format(dt.strftime("%d/%m/%Y"))
#     }

{'cured_cases': 'None',
 'new_cases': 'None',
 'import_cases': 'None',
 'local_cases': 'None',
 'active_cases': 'None',
 'resp_asst_cases': 'None',
 'death_cases': 'None',
 'number_of_clusters': 'None',
 'number_of_new_clusters': 'None',
 'number_of_expired_clusters': 'None',
 'number_of_active_clusters': 'None',
 'date': '07/02/2021'}

In [56]:
get_data(datetime.date(2021, 2, 7))

Unnamed: 0,cured_cases,new_cases,import_cases,local_cases,active_cases,resp_asst_cases,death_cases,number_of_clusters,number_of_new_clusters,number_of_expired_clusters,number_of_active_clusters,date
0,,,,,,,,,,,,01/05/2021
1,,,,,,,,,,,,02/05/2021
2,,,,,,,,,,,,03/05/2021
3,,,,,,,,,,,,04/05/2021
4,,,,,,,,,,,,05/05/2021
5,,,,,,,,,,,,06/05/2021
6,,,,,,,,,,,,07/05/2021
7,,,,,,,,,,,,08/05/2021
8,,,,,,,,,,,,09/05/2021
9,"<li>Kes sembuh : <strong>3,454 kes </strong>(4...","<li>Kes baharu :<strong> 3,807 kes</strong> (4...",<li>Kes import : <strong>5 kes </strong>(2 war...,"<li>Kes tempatan : <strong>3,</strong><strong>...","<li>Kes aktif :<strong> 37,396 kes</strong>;</li>",<li>Kes yang memerlukan rawatan di Unit Rawata...,"<li>Kes kematian :<strong> 17 kes </strong>(1,...","<li>Jumlah kluster : <strong>1,807</strong><st...",<li>Jumlah kluster baharu :<strong> 19 kluster...,<li>Jumlah kluster yang telah tamat :<strong>1...,<li>Jumlah kluster aktif : <strong>448 kluster...,10/05/2021


In [None]:
pd.DataFrame(dataset)