In [3]:
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup

import gc
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [4]:
headers = {
    'sec-ch-ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
    'Referer': 'https://www.vikatan.com/topics/malaysia',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'sec-ch-ua-platform': '"Windows"',
}

#### **Testing**

In [10]:
# we can get the number of pages by toggling the offset parameter
response = requests.get(
    'https://www.vikatan.com/api/v1/stories?tag-slugs=malaysia&offset=54&limit=9&fields=id%2Curl',
    headers=headers,
)

print(response)

<Response [200]>


In [11]:
# Parse the JSON data directly from the response text
data = response.json()

In [12]:
data

{'stories': [{'id': '7766d197-94c1-4cea-9fd4-60ed1dc19611',
   'url': 'https://www.vikatan.com/government-and-politics/election/155337-our-duty-is-to-vote-pudukkottai-youth-group-traveled-from-abroad-for-elections'},
  {'id': '5099b7e8-2e64-4110-a581-aede8e263302',
   'url': 'https://www.vikatan.com/government-and-politics/151669-explanation-about-the-49-tamilians-who-were-rescued-from-the-malaysian-jail'},
  {'id': 'da97d906-7f83-4b08-8880-3796f25d376d',
   'url': 'https://www.vikatan.com/government-and-politics/150136-49-tamilians-who-were-arrested-in-malaysia-were-rescued-with-the-help-of-kanimozhi'},
  {'id': '7239410d-b439-4af4-b0bb-11b9ed229531',
   'url': 'https://www.vikatan.com/government-and-politics/150090-kanimozhi-take-action-to-release-49-people-who-trapped-in-malaysia'},
  {'id': 'a7a253be-456f-4ea0-b3f5-224e8ef366bc',
   'url': 'https://www.vikatan.com/literature/arts/148587-2-elderly-women-killed-in-freemeal-stampede-in-malaysia'},
  {'id': '5f2c935e-b798-4425-9461-111

In [15]:
urls = [story['url'] for story in data['stories']]
urls

['https://www.vikatan.com/government-and-politics/election/155337-our-duty-is-to-vote-pudukkottai-youth-group-traveled-from-abroad-for-elections',
 'https://www.vikatan.com/government-and-politics/151669-explanation-about-the-49-tamilians-who-were-rescued-from-the-malaysian-jail',
 'https://www.vikatan.com/government-and-politics/150136-49-tamilians-who-were-arrested-in-malaysia-were-rescued-with-the-help-of-kanimozhi',
 'https://www.vikatan.com/government-and-politics/150090-kanimozhi-take-action-to-release-49-people-who-trapped-in-malaysia',
 'https://www.vikatan.com/literature/arts/148587-2-elderly-women-killed-in-freemeal-stampede-in-malaysia',
 'https://www.vikatan.com/literature/146208-thiruvalluvar-statue-going-to-malaysia',
 'https://www.vikatan.com/agriculture/146582-agricultural-export-success-formulas',
 'https://www.vikatan.com/government-and-politics/141013-mother-demanding-to-rescue-his-son-in-pattukottai',
 'https://www.vikatan.com/health/137859-actor-munishkanths-fitnes

**Note:** There's a lot of redundant articles! Upon inspection on the site, there's **not many articles with Malaysia tags** on the website. Sadge.

In [79]:
response = requests.get(
    'https://www.vikatan.com/api/v1/stories?tag-slugs=malaysia&offset=53&limit=9&fields=id%2Curl',
    headers=headers,
)

print(response)
data = response.json()
urls = [story['url'] for story in data['stories']]
urls

<Response [200]>


['https://www.vikatan.com/crime/157573-16-year-girl-died-after-instagram-polling',
 'https://www.vikatan.com/government-and-politics/election/155337-our-duty-is-to-vote-pudukkottai-youth-group-traveled-from-abroad-for-elections',
 'https://www.vikatan.com/government-and-politics/151669-explanation-about-the-49-tamilians-who-were-rescued-from-the-malaysian-jail',
 'https://www.vikatan.com/government-and-politics/150136-49-tamilians-who-were-arrested-in-malaysia-were-rescued-with-the-help-of-kanimozhi',
 'https://www.vikatan.com/government-and-politics/150090-kanimozhi-take-action-to-release-49-people-who-trapped-in-malaysia',
 'https://www.vikatan.com/literature/arts/148587-2-elderly-women-killed-in-freemeal-stampede-in-malaysia',
 'https://www.vikatan.com/literature/146208-thiruvalluvar-statue-going-to-malaysia',
 'https://www.vikatan.com/agriculture/146582-agricultural-export-success-formulas',
 'https://www.vikatan.com/government-and-politics/141013-mother-demanding-to-rescue-his-son

#### **Get page nums**

In [19]:
page_nums = []
i = 0
while True:
    url = f"https://www.vikatan.com/api/v1/stories?tag-slugs=malaysia&offset={i}&limit=9&fields=id"
    i += 1
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"There are only has {i-1} pages.")
        break
    else:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Parse the JSON data directly from the response text
        data = response.json()

        # Access the 'stories' dictionary
        stories = data['stories']
        
        # offset = 2000 still returns a 200 status code.
        # Hence, we'll break the while loop if it returns an empty list
        if stories != []:
            page_nums.append(url)
        else:
            print(f"There are only has {i-1} pages.")
            break

page_nums_unique = list(set(page_nums))
print(f'Num. of unique pages: {len(page_nums_unique)}')
with open(f'vikatan_numpages.json', 'a') as f:
    json.dump(page_nums_unique, f)

There are only has 71 pages.
Num. of unique pages: 71


#### **Get urls from pagenums**

In [42]:
page_nums = []
with open(f'vikatan_numpages.json') as fopen:
    links = json.load(fopen)
page_nums.extend(links)

print(f"Num. of pages to get article links from: {len(page_nums)}")

Num. of pages to get article links from: 71


In [44]:
def crawl_pagenums(url):
    while True:
        try:
            response = requests.get(url + '%2Curl', headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    data = response.json()
    urls = [story['url'] for story in data['stories']]
    
    articles_to_scrape.extend(urls)

In [45]:
articles_to_scrape = []
max_worker = 10

for i in tqdm(range(0, len(page_nums), max_worker)):
    gc.collect()
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl_pagenums, t): t for t in page_nums[i: i + max_worker]}

articles_unique = list(set(articles_to_scrape))
print(f'Num. of unique articles: {len(articles_unique)}')
with open(f'vikatan_articles.json', 'a') as f:
    json.dump(articles_unique, f)

  0%|          | 0/8 [00:00<?, ?it/s]

Num. of unique articles: 71


In [41]:
#bapak lah dari 603 dapat 71 je. Ada error ke. Hm.
len(articles_to_scrape)

603

#### **Get content from articles** (to edit)

In [14]:
articles = []
with open(f'vikatan_articles.json') as fopen:
    links = json.load(fopen)
articles.extend(links)

print(f"Num. of pages to get article links from: {len(articles)}")

Num. of pages to get article links from: 71


In [15]:
# found some articles we missed. This is because it doesn't have the *Malaysia* tag
articles.append('https://www.vikatan.com/literature/arts/125344-malaysia-scrapping-gst-from-june')
print(f"Num. of pages to get article links from: {len(articles)}")

Num. of pages to get article links from: 72


In [16]:
url='https://www.vikatan.com/government-and-politics/130008-we-will-not-send-zakir-naik-to-india-malaysian-pm'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

# headline = soup.find('h1', class_="styles-m__headline__1uXyt").text
# subheadline = soup.find('p', class_= "subheadline-main styles-m__subheadline__2_IOU").text

paragraphs = soup.find_all('div', attrs = {"class":"story-element story-element-text"})
words = [p.text for p in paragraphs]
combined_paragraph = "\n".join(words)
print(combined_paragraph)

தீவிரவாத நடவடிக்கைகளைத் தூண்டும் வகையில் பேசியதால் இந்தியாவில் பிடிவாரன்ட் பிறப்பிக்கப்பட்டுள்ள இஸ்லாமிய மதபோதகர் ஜாகிர் நாயக்கை மலேசியாவில் இருந்து நாடு கடத்த முடியாது என்று மலேசிய பிரதமர் மஹாதிர் முகம்மது கூறியிருக்கிறார்.
தீவிரவாத நடவடிக்கைகளில் தொடர்பு, வன்முறையைத் தூண்டும் வகையிலான பேச்சு போன்ற குற்றச்சாட்டுகள் காரணமாக, ஜாகிர் நாயக்குக்கு எதிராக இந்தியாவில் பிடிவாரன்ட் பிறப்பிக்கப்பட்டுள்ளது. தற்போது அவர் மலேசியாவில் நிரந்தரக் குடியுரிமை பெற்று அங்கு தங்கியுள்ளார். அவரை இந்தியாவுக்கு அனுப்பி வைக்க வேண்டும் என்று ஏற்கெனவே மத்திய அரசின் சார்பில் கோரிக்கை விடப்பட்டுள்ளது. 
தொலைக்காட்சிகளில் மதப் பிரசார நிகழ்ச்சிகளில் பங்கேற்கும் ஜாகிர் நாயக், கடந்த 2016-ம் ஆண்டு இந்தியாவிலிருந்து வெளியேறி மலேசியாவில் தஞ்சம் புகுந்தார், அங்கு அவருக்கு நிரந்தரக் குடியுரிமையை அந்நாட்டு அரசு வழங்கியுள்ளது. 
இந்நிலையில், அவரை இந்தியாவுக்கு அனுப்பி வைக்குமாறு மத்திய அரசு, கடந்த ஜனவரி மாதம் மலேசியாவிடம் கோரிக்கை விடுத்தது. இந்தியா - மலேசியா இடையே குற்றம்சாட்டப்பட்டவர்களை பரஸ்பரம் ஒப்படைக்கக்கூடிய வகையிலான ஒ

In [17]:
def crawl_article(url):
    while True:
        try:
            response = requests.get(url, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(response.text, "lxml")

    try:
        headline = soup.find('h1', class_="styles-m__headline__1uXyt").text
        subheadline = soup.find('p', class_= "subheadline-main styles-m__subheadline__2_IOU").text
        paragraphs = soup.find_all('div', attrs = {"class":"story-element story-element-text"})
        words = [p.text for p in paragraphs]
        combined_paragraph = "\n".join(words)

    except Exception as e:
        print('error in link:'+ url)
        print(e)
        return None

    data = {
        'url': url,
        'headline': headline,
        'subheadline': subheadline,
        'text': combined_paragraph
    }
    return data

In [18]:
max_worker = 10

for i in tqdm(range(0, len(articles), max_worker)):
    gc.collect()
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl_article, t): t for t in articles[i: i + max_worker]}

    for future in as_completed(futures):
        result = future.result()
        if result:
            with open(f'vikatan-my-scraped-data.jsonl', 'a') as final:
                json.dump(result, final)
                final.write('\n')

  0%|          | 0/8 [00:00<?, ?it/s]

error in link:https://cinema.vikatan.com/malaysian-singer-dato-sri-siti-nurhaliza-s-exclusive-interview
'NoneType' object has no attribute 'text'
error in link:https://www.vikatan.com/health/this-makeup-artist-stuns-everyone-with-his-skills
'NoneType' object has no attribute 'text'
error in link:https://cinema.vikatan.com/music/malaysia-singer-siti-nurhaliza-interview
'NoneType' object has no attribute 'text'
error in link:https://cinema.vikatan.com/kollywood/actor-karunas-interview-4
'NoneType' object has no attribute 'text'
error in link:https://www.vikatan.com/spiritual/gods/ooty-lord-murugan-temple-like-malayia-murugan
'NoneType' object has no attribute 'text'
error in link:https://www.vikatan.com/agriculture/146582-agricultural-export-success-formulas
'NoneType' object has no attribute 'text'
error in link:https://www.vikatan.com/government-and-politics/policy/144204-malaysia-sand-import-issue-junior-vikatan
'NoneType' object has no attribute 'text'


#### **Final check**

7 articles are blocked by premium subscriptions. Sadge.

In [19]:
final_test = pd.read_json('vikatan-my-scraped-data.jsonl', lines=True)
print(f"Num. of rows in dataframe: {len(final_test)}")

# check if there's any articles with no content
no_content = final_test[final_test['text'].isnull()]
print(f"Num. of webpages with no content: {no_content}")
no_content.head(3)

Num. of rows in dataframe: 65
Num. of webpages with no content: Empty DataFrame
Columns: [url, headline, subheadline, text]
Index: []


Unnamed: 0,url,headline,subheadline,text
