## Testing notebook playground for extracting data

In [122]:
import pandas as pd
import mysql.connector as msc
import json
import tweepy
import csv
import re
import string
import requests
import pprint
from bs4 import BeautifulSoup

In [3]:
# read config file into JSON object
configFile = open('config.json')
config = json.load(configFile)

# make connection to the GCloud database
db_config = config['database']
connection = msc.connect(
    host=db_config['host'], 
    port=db_config['port'],
    user=db_config['user'],
    password=db_config['password'],
    database=db_config['database']
)

# authorize Twitter app with Tweepy OAuthHandler
twit_config = config['twitter']
auth = tweepy.OAuthHandler(twit_config['api_key'], twit_config['api_key_secret'])
auth.set_access_token(twit_config['access_token'], twit_config['access_token_secret'])
twitapi = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# extract Tweets from Malaysian MOH about Malaysian COVID-19 case data
# write to pandas database
tweet_collection = twitapi.search("KKMPutrajaya")

15


In [82]:
# Get user id of @KKMPutrajaya
url = "https://api.twitter.com"
headers = {"Authorization": "Bearer {}".format(twit_config['bearer_token'])}
response = requests.request("GET", url + "/2/users/by/username/KKMPutrajaya", headers=headers)
user_id = response.json()['data']['id']

In [44]:
# initialize pandas data set
tweets = []
next_tokens = []

In [81]:
# get posts from Malaysian MOH (@KKMPutrajaya)
search_url = (url + "/2/users/{}/tweets".format(user_id))
response = requests.request("GET", search_url, headers=headers)
data = response.json()['data']

In [47]:
# fetch tweet data
# this fetches thousands of MOH's tweets, which are factored into the quota for my 
# Twitter account, so beware!
def fetch_data():
    # get posts from Malaysian MOH (@KKMPutrajaya)
    search_url = (url + "/2/users/{}/tweets".format(user_id))
    response = requests.request("GET", search_url, headers=headers)
    data = response.json()['data']

    # cycle and add to data
    for tweet in data:
        if "Status Terkini #COVID19" in tweet['text']:
            tweets.append(tweet)
    # get next pagination token
    next_token = response.json()['meta'].get('next_token')
    next_tokens.append(next_token)
    # while next token is not None (ie there is still a next page)
    while next_token is not None:
        # get next page
        search_url = (url + "/2/users/{}/tweets?pagination_token={}".format(user_id, next_token))
        response = requests.request("GET", search_url, headers=headers)
        data = response.json()['data']
        next_token = response.json()['meta'].get('next_token')
        # cycle through the next page
        for tweet in data:
            if "Status Terkini #COVID19" in tweet['text']:
                tweets.append(tweet)
                next_tokens.append(next_token)

In [80]:
tweetsdf = pd.DataFrame(tweets)

In [83]:
next_tokensdf = pd.DataFrame(next_tokens)

In [79]:
search_url = (url + "/2/users/{}/tweets?end_time={}".format(user_id, "2021-03-11T12:00:00Z"))
response = requests.request("GET", search_url, headers=headers)

In [77]:
# function to cycle through pages automatically
print(next_token)
def goForward():
    global next_token
    if next_token is None:
        raise Exception("no next page")
    else:
        search_url = (url + "/2/users/{}/tweets?end_time={}&pagination_token={}".format(user_id, "2021-03-11T12:00:00Z", next_token))
        response = requests.request("GET", search_url, headers=headers)
        next_token = response.json()['meta'].get('next_token')
        print(response.json()['data'])

None


In [98]:
example_tweet = tweetsdf.iloc[0, :]['text']

In [110]:
# parser for tweet content
def parseTweetContent(tweet):
    # get tweet content without \n and 
    # explode it via spaces
    exploded_tweet = re.split('\n| ', tweet)
    
    # get date as "DD MMM YYYY", where the month is in Malay
    date = "{} {} {}".format(exploded_tweet[3], exploded_tweet[4], exploded_tweet[5])

    

In [108]:
# example tweet content
parseTweetContent(example_tweet)

['Status',
 'Terkini',
 '#COVID19,',
 '21',
 'Jun',
 '2021',
 '',
 'Kes',
 'sembuh=',
 '5,439',
 'Jumlah',
 'kes',
 'sembuh=',
 '633,624',
 'Kes',
 'baharu',
 'positif=',
 '4,611',
 '(8',
 'import)',
 '',
 'Jumlah',
 'positif=',
 '701,019',
 'Kes',
 'kematian=',
 '69',
 'Jumlah',
 'kes',
 'kematian=',
 '4,477',
 'Kes',
 'dirawat',
 'di',
 'ICU=',
 '880',
 'Bantuan',
 'alat',
 'pernafasan=',
 '452',
 'https://t.co/zUQch1MbEs']

### Testing out web scraper for kpkesihatan.com instead to fetch data

In [113]:
# can do formatting on this later
date1 = "2021/06/20"
date2 = "20-jun-2021"
myurl = "https://kpkesihatan.com/{}/kenyataan-akhbar-kpk-{}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/".format(date1, date2)
myurl

'https://kpkesihatan.com/2021/06/20/kenyataan-akhbar-kpk-20-jun-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/'

In [119]:
# get content of website
response = requests.get(myurl)
# initialize printer
pp = pprint.PrettyPrinter(indent=4)

In [124]:
# create soup object
soup = BeautifulSoup(response.content, 'html.parser')

In [128]:
# get content of post
result = soup.find(class_="post")
result.prettify()

td>\n       464\n      </td>\n      <td>\n       129\n      </td>\n      <td>\n       –\n      </td>\n      <td>\n       34\n      </td>\n     </tr>\n     <tr>\n      <td>\n       <strong>\n        WP KUALA LUMPUR\n       </strong>\n      </td>\n      <td>\n       379\n      </td>\n      <td>\n       35\n      </td>\n      <td>\n       224\n      </td>\n      <td>\n       –\n      </td>\n      <td>\n       120\n      </td>\n     </tr>\n     <tr>\n      <td>\n       <strong>\n        PERAK\n       </strong>\n       <strong>\n       </strong>\n      </td>\n      <td>\n       205\n      </td>\n      <td>\n       171\n      </td>\n      <td>\n       18\n      </td>\n      <td>\n       –\n      </td>\n      <td>\n       16\n      </td>\n     </tr>\n     <tr>\n      <td>\n       <strong>\n        MELAKA\n       </strong>\n      </td>\n      <td>\n       183\n      </td>\n      <td>\n       110\n      </td>\n      <td>\n       38\n      </td>\n      <td>\n       –\n      </td>\n      <td>\n  