## Testing notebook playground for extracting data

In [5]:
import pandas as pd
import mysql.connector as msc
import json
import tweepy
import csv
import re
import string
import requests
import pprint
import datetime
from dateutil.rrule import rrule, DAILY
from bs4 import BeautifulSoup

In [3]:
# read config file into JSON object
configFile = open('config.json')
config = json.load(configFile)

# make connection to the GCloud database
db_config = config['database']
connection = msc.connect(
    host=db_config['host'], 
    port=db_config['port'],
    user=db_config['user'],
    password=db_config['password'],
    database=db_config['database']
)

# authorize Twitter app with Tweepy OAuthHandler
twit_config = config['twitter']
auth = tweepy.OAuthHandler(twit_config['api_key'], twit_config['api_key_secret'])
auth.set_access_token(twit_config['access_token'], twit_config['access_token_secret'])
twitapi = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# extract Tweets from Malaysian MOH about Malaysian COVID-19 case data
# write to pandas database
tweet_collection = twitapi.search("KKMPutrajaya")

In [5]:
# Get user id of @KKMPutrajaya
url = "https://api.twitter.com"
headers = {"Authorization": "Bearer {}".format(twit_config['bearer_token'])}
response = requests.request("GET", url + "/2/users/by/username/KKMPutrajaya", headers=headers)
user_id = response.json()['data']['id']

In [6]:
# initialize pandas data set
tweets = []
next_tokens = []

In [7]:
# get posts from Malaysian MOH (@KKMPutrajaya)
search_url = (url + "/2/users/{}/tweets".format(user_id))
response = requests.request("GET", search_url, headers=headers)
data = response.json()['data']

In [8]:
# fetch tweet data
# this fetches thousands of MOH's tweets, which are factored into the quota for my 
# Twitter account, so beware!
def fetch_data():

    # get posts from Malaysian MOH (@KKMPutrajaya)
    search_url = (url + "/2/users/{}/tweets".format(user_id))
    response = requests.request("GET", search_url, headers=headers)
    data = response.json()['data']

    # cycle and add to data
    for tweet in data:
        if "Status Terkini #COVID19" in tweet['text']:
            tweets.append(tweet)
    # get next pagination token
    next_token = response.json()['meta'].get('next_token')
    next_tokens.append(next_token)
    # while next token is not None (ie there is still a next page)
    while next_token is not None:
        # get next page
        search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, next_token))
        response = requests.request("GET", search_url, headers=headers)
        data = response.json()['data']
        next_token = response.json()['meta'].get('next_token')
        # cycle through the next page
        for tweet in data:
            if "Status Terkini #COVID19" in tweet['text']:
                tweets.append(tweet)
                print(tweet)
                next_tokens.append(next_token)

In [9]:
# fetch_data()

In [10]:
tweetsdf = pd.DataFrame(tweets)
tweetsdf = tweetsdf[0:101]

In [11]:
next_tokensdf = pd.DataFrame(next_tokens)
next_tokensdf = next_tokensdf[0:102]

In [12]:
tweetsdf['text']

KeyError: 'text'

In [None]:
tweetsdf['text'][0]

In [None]:
tweetsdf['text'][100]

In [None]:
search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, "7140dibdnow9c7btw3w3xuo59eonz5ff1wirbd9w1h7ao"))
response = requests.request("GET", search_url, headers=headers)
response.json()

In [None]:
# function to cycle through pages automatically
print(next_token)
def goForward():
    global next_token
    if next_token is None:
        raise Exception("no next page")
    else:
        search_url = (url + "/2/users/{}/tweets?end_time={}&pagination_token={}".format(user_id, "2021-03-11T12:00:00Z", next_token))
        response = requests.request("GET", search_url, headers=headers)
        next_token = response.json()['meta'].get('next_token')
        print(response.json()['data'])

In [None]:
example_tweet = tweetsdf.iloc[0, :]['text']

In [46]:
# function to convert python date to DD-MMM-YYYY, where "MMM" is the month in malay
month_dict = [
    'januari',
    'februari',
    'mac',
    'april',
    'mei',
    'jun',
    'julai',
    'ogos',
    'september',
    'oktober',
    'november',
    'disember'
]

# month is in Malay (like the above)
def parseDate(day, month, year):
    return datetime.date(int(year), int(list(map(lambda m: m.title(), month_dict)).index(month))+1, int(day))

# def convertDate(date):
#     return date.strftime("%d-{}-%Y".format(month_dict[int(date.strftime("%m"))-1]).title())

def convertDate(date):
    return date.strftime("%#d-{}-%Y".format(month_dict[int(date.strftime("%m"))-1]))

In [48]:
convertDate(parseDate("6", "April", "2021"))

'6-april-2021'

In [49]:
# parser for tweet content
def parseTweetContent(tweet):
    # print(tweet)
    # get tweet content without \n and 
    # explode it via spaces
    exploded_tweet = [x for x in re.split('\n| |=', tweet.replace('\n\n', '\n')) if x]
    # print(exploded_tweet)
    
    # get date as "YYYY MMM DD", where the month is in Malay
    date = parseDate(exploded_tweet[3], exploded_tweet[4], exploded_tweet[5])
    # print(date)

    try:

        # get other data
        # the numbers represent the indexes at which the relevant values are in
        # the exploded tweet
        data_dict = {
            "cured_cases": 8,
            "total_cured_cases": 12,
            "new_cases": 16,
            "total_new_cases": 23 if date <= datetime.date(2021, 6, 9) else 21,
            "deaths": 26 if date <= datetime.date(2021, 6, 9) else 24,
            "total_deaths": 30 if date <= datetime.date(2021, 6, 9) else 28,
            "icu_cases": 35 if date <= datetime.date(2021, 6, 9) else 33,
            "resp_cases": 39 if date <= datetime.date(2021, 6, 9) else 37
        }
        
        # parse data
        result = {name: int(exploded_tweet[value].replace(',', '')) for name, value in data_dict.items()} 

        # add date
        result['date'] = date.strftime('%d %m %Y')
        # return result
        return result
    except BaseException as e:
        print(e)
        print('cannot parse tweet at date {}'.format(date))
        return {}

In [31]:
# example tweet content
parseTweetContent(tweetsdf['text'][100])

KeyError: 'text'

In [32]:
# store data into csv
parsed_data = list(map(parseTweetContent, list(tweetsdf['text'])))
# parsed_data += ({
#     "cured_cases": 1830,
#     "total_cured_cases": 304492, 
#     "new_cases": 1470,
#     "total_new_cases": 322409, 
#     "deaths": 3,
#     "total_deaths": 1206, 
#     "icu_cases": 162,
#     "resp_cases": 70,
#     "date": "13 03 2021"
# })

KeyError: 'text'

In [33]:
df = pd.DataFrame(parsed_data)
df.to_csv("covid_data.csv")

NameError: name 'parsed_data' is not defined

In [34]:
# initialize dataset
dataset = []
# initialize variable to text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    'number_of_clusters': 'Jumlah kluster',
    'number_of_new_clusters': 'Jumlah kluster baharu',
    'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    'number_of_active_clusters': 'Jumlah kluster aktif'
}

### Testing out web scraper for kpkesihatan.com instead to fetch data

In [35]:
[convertDate(datetime.datetime(2021, i, 21)) for i in range(1, 13)]

['21-januari-2021',
 '21-februari-2021',
 '21-mac-2021',
 '21-april-2021',
 '21-mei-2021',
 '21-jun-2021',
 '21-julai-2021',
 '21-ogos-2021',
 '21-september-2021',
 '21-oktober-2021',
 '21-november-2021',
 '21-disember-2021']

In [41]:
# can do formatting on this later
def makeURL(date):
    return "https://kpkesihatan.com/{}/kenyataan-akhbar-kpk-{}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/".format(
        date.strftime("%#d/%#m/%Y"), 
        convertDate(date)
    )

In [42]:
makeURL(datetime.datetime(2021, 6, 21))

'https://kpkesihatan.com/21/6/2021/kenyataan-akhbar-kpk-21-jun-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/'

In [43]:
# Create soup specifically for the response content
# soup_cases = BeautifulSoup(str(result[1]), 'html.parser')
# Create variable to (Malay) text mapping
vtt_map = {
    'cured_cases': 'Kes sembuh',
    'new_cases': 'Kes baharu',
    'import_cases': 'Kes import',
    'local_cases': 'Kes tempatan',
    'active_cases': 'Kes aktif',
    'resp_asst_cases': 'Kes yang memerlukan rawatan',
    'death_cases': 'Kes kematian',
    # 'number_of_clusters': 'Jumlah kluster',
    # 'number_of_new_clusters': 'Jumlah kluster baharu',
    # 'number_of_expired_clusters': 'Jumlah kluster yang telah tamat',
    # 'number_of_active_clusters': 'Jumlah kluster aktif'
}

In [52]:
# function to get data from website by date
def get_data(date):
    # make url to website for date
    url = makeURL(date)
    result = None
    response = None
    # print(url)

    # get content of website
    response = requests.get(url)
    # print(response)

    if (response.status_code == 200):
        # filter for the case data
        # create soup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # get all bulleted lists
        result = soup.find_all('ul')
        # print(result)
        cases_content = str(result[1])
        # print(cases_content)

        # get soup for cases
        soup_cases = BeautifulSoup(cases_content, 'html.parser')

        # store the data into the result object
        day_data = {}
        for var in vtt_map:
            # Get the string containing the key words
            keyword_string = str(soup_cases.find(lambda tag: tag.name == "li" and vtt_map[var] in tag.text))
            # Get the soup for the specific string
            soup_var = BeautifulSoup(keyword_string, 'html.parser')
            # use the custom soup to extract the case data, which is stored into the day_data object
            content = soup_var.find(lambda tag: tag.name == "strong").contents[0]
            # parse the content to only extract the integer value for the corresponding variable
            for s in ["\xa0", "kes", "kluster", " ", ",", ";"]:
                content = content.replace(s, "")
            # convert value to int
            day_data[var] = int(content)

        # put date into data
        day_data['date'] = date.strftime("%d/%m/%Y")

        # return result when done
        return day_data
    else:
        print("{} returned {} error".format(url, response.status_code))
        return {}

In [53]:
get_data(datetime.date(2021, 6, 1))

{'cured_cases': 6083,
 'new_cases': 7105,
 'import_cases': 2,
 'local_cases': 7103,
 'active_cases': 80474,
 'resp_asst_cases': 872,
 'death_cases': 71,
 'date': '01/06/2021'}

In [54]:
get_data(datetime.date(2021, 3, 1))

{'cured_cases': 2486,
 'new_cases': 1828,
 'import_cases': 7,
 'local_cases': 1821,
 'active_cases': 25542,
 'resp_asst_cases': 198,
 'death_cases': 5,
 'date': '01/03/2021'}

In [60]:
# insert data from march onwards into dataset
dataset = []
start_date = datetime.date(2021, 1, 1)
end_date = datetime.date.today()
failed_dates = []

for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    try:
        dataset.append(get_data(dt))
        print("Data retrieved for the date {}".format(dt.strftime("%d/%m/%Y")))
    except:
        print("Unable to retrieve data for the date {}".format(dt.strftime("%d/%m/%Y")))
        failed_dates.append(dt)

Unable to retrieve data for the date 01/01/2021
Unable to retrieve data for the date 02/01/2021
Unable to retrieve data for the date 03/01/2021
Unable to retrieve data for the date 04/01/2021
Unable to retrieve data for the date 05/01/2021
Unable to retrieve data for the date 06/01/2021
Unable to retrieve data for the date 07/01/2021
Unable to retrieve data for the date 08/01/2021
Unable to retrieve data for the date 09/01/2021
Unable to retrieve data for the date 10/01/2021
Unable to retrieve data for the date 11/01/2021
Unable to retrieve data for the date 12/01/2021
Unable to retrieve data for the date 13/01/2021
Unable to retrieve data for the date 14/01/2021
Unable to retrieve data for the date 15/01/2021
Unable to retrieve data for the date 16/01/2021
Unable to retrieve data for the date 17/01/2021
Unable to retrieve data for the date 18/01/2021
Unable to retrieve data for the date 19/01/2021
Data retrieved for the date 20/01/2021
Data retrieved for the date 21/01/2021
Data retri

In [61]:
# export data to csv
df = pd.DataFrame(dataset)
df

Unnamed: 0,cured_cases,new_cases,import_cases,local_cases,active_cases,resp_asst_cases,death_cases,date
0,2,4008,5,4003,41087,246,11,20/01/2021
1,2,3170,8,3162,4,260,12,21/01/2021
2,2554,3631,6,3625,42,251,18,22/01/2021
3,4313,4275,11,4264,42,260,7,23/01/2021
4,4427,3346,7,3339,41,265,11,24/01/2021
...,...,...,...,...,...,...,...,...
144,6918,5911,1,5910,64523,886,72,19/06/2021
145,5941,5293,7,5286,63815,880,60,20/06/2021
146,5439,4611,8,4603,62918,880,69,21/06/2021
147,5557,4743,2,4741,62027,875,77,22/06/2021


In [62]:
df.to_csv("covid_data.csv")