# Data Collection

## Web Scraping URLs

In [1]:
%pip install requests beautifulsoup4 fake_useragent pandas openpyxl

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Collecting fake_useragent
  Using cached fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl (118 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.7-py3-none-any.whl (66 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.2.2-py3-none-any.whl (121 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2024.7.4-py3-none-any.whl (162 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.5-py3-none-any.whl (36 kB)
Collecting numpy>=1.23.2 (from pandas)
  Using c

In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import urllib.parse

def google_search(query, num_results, time_filter = None, timeout = 10):
    """Scrape the result links from one Google search results page.

    Parameters
    ----------
    query : str
        Free-text search terms; URL-encoded before being placed in the URL.
    num_results : int
        Value sent as the ``num`` query parameter.
    time_filter : str, optional
        Value sent as the ``tbs`` query parameter (for example a
        ``cdr:1,cd_min:...,cd_max:...`` date range). Omitted when falsy.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    list of str
        The ``href`` of the first link (that has one) inside each
        ``<div class="g">`` container. An empty list is returned, and a
        message printed, when the request fails or the status is not 200.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.random}

    query = urllib.parse.quote_plus(query)

    google_url = f"https://www.google.com/search?q={query}&num={num_results}"

    if time_filter:
        google_url += f"&tbs={time_filter}"

    try:
        response = requests.get(google_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report transport failures the same way as a bad status code:
        # print and return no results instead of aborting the whole crawl.
        print(f"failed to retrieve search results: {exc}")
        return []

    if response.status_code != 200:
        print(f"failed to retrieve search results: status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    search_results = []

    for g in soup.find_all('div', class_='g'):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on subscripting.
        anchor = g.find('a', href=True)
        if anchor is not None:
            search_results.append(anchor['href'])

    return search_results


In [14]:
def generate_quarters(start_year, end_year):
    """Map quarter labels to Google ``tbs`` custom-date-range filter strings.

    Every year from ``start_year`` to ``end_year`` (inclusive) contributes
    four entries keyed ``"<year> Q1"`` .. ``"<year> Q4"``. When ``end_year``
    is exactly 2024, that year contributes only Q1 and Q2, and those two
    entries are inserted before all other years.

    Returns a dict of ``label -> "cdr:1,cd_min:M/D/YYYY,cd_max:M/D/YYYY"``.
    """

    def _full_year(year):
        # All four quarters of one calendar year, in Q1..Q4 order.
        return {
            f"{year} Q1": f"cdr:1,cd_min:1/1/{year},cd_max:3/31/{year}",
            f"{year} Q2": f"cdr:1,cd_min:4/1/{year},cd_max:6/30/{year}",
            f"{year} Q3": f"cdr:1,cd_min:7/1/{year},cd_max:9/30/{year}",
            f"{year} Q4": f"cdr:1,cd_min:10/1/{year},cd_max:12/31/{year}",
        }

    quarters = {}
    last_full_year = end_year

    if end_year == 2024:
        # 2024 only gets its first two quarters; full years stop at 2023.
        quarters.update({
            "2024 Q1": "cdr:1,cd_min:1/1/2024,cd_max:3/31/2024",
            "2024 Q2": "cdr:1,cd_min:4/1/2024,cd_max:6/30/2024",
        })
        last_full_year = end_year - 1

    for year in range(start_year, last_full_year + 1):
        quarters.update(_full_year(year))

    return quarters

def generate_query(source_list):
    """Build one search query per distinct source name.

    Returns a dict mapping each source in ``source_list`` (first occurrence
    wins, original order kept) to the query string
    ``"singapore industrial property market <source>"``.
    """
    queries = {}
    for name in source_list:
        # setdefault leaves an already-registered source untouched.
        queries.setdefault(name, f"singapore industrial property market {name}")
    return queries

In [17]:
import pandas as pd

# Each batch is (source names, number of results requested per search).
search_batches = [
    (["cna", "singapore business review"], 20),
    (["straits times", "business times", "edgeprop"], 30),
]

quarter_dictionary = generate_quarters(2020, 2024)

headers = ["URLs", "Source", "Quarter"]

# Collect plain row dicts and build the frame once per batch; calling
# pd.concat inside the loops re-copies every earlier row on each iteration.
records = []

for source_names, results_per_search in search_batches:
    query_dictionary = generate_query(source_names)

    for source, query in query_dictionary.items():
        for quarter, time_filter in quarter_dictionary.items():
            results = google_search(query, num_results=results_per_search, time_filter=time_filter)
            records.extend(
                {"URLs": url, "Source": source, "Quarter": quarter} for url in results
            )

    # Cumulative frame after each batch; columns= keeps the header row
    # even when no search returned anything.
    df = pd.DataFrame(records, columns=headers)
    print(df)

df.to_excel("urls.xlsx", index=False)

                                                  URLs  \
0    https://www.channelnewsasia.com/singapore/mark...   
1          https://www.youtube.com/watch?v=JsMbvmY9FWA   
2    https://www.channelnewsasia.com/singapore/joho...   
3          https://www.youtube.com/watch?v=iJ2Xfc0xnZk   
4    https://m.facebook.com/ChannelNewsAsia/posts/8...   
..                                                 ...   
432  https://plbinsights.com/the-rise-of-self-stora...   
433  https://www.propertyguru.com.sg/property-guide...   
434  https://sbr.com.sg/commercial-property/in-focu...   
435  https://realestateasia.com/videos/proptech-kuc...   
436  https://cosysingapore.com/commercial-property-...   

                        Source  Quarter  
0                          cna  2024 Q1  
1                          cna  2024 Q1  
2                          cna  2024 Q1  
3                          cna  2024 Q1  
4                          cna  2024 Q1  
..                         ...      ...  
432  sing

## Cleaning the URLs

In [20]:
# Reload the collected URLs from the first sheet of the workbook.
df = pd.read_excel("urls.xlsx")

# Flag every row whose URL occurs more than once.
# NOTE(review): keep=False marks ALL copies, so a repeated URL is removed
# entirely rather than reduced to one row — confirm this is intended
# (keep="first" would retain one copy).
duplicates = df.duplicated(subset="URLs", keep=False)
df = df.loc[~duplicates]

# Append mode keeps the workbook's other sheets and replaces only the
# "2020 - 2024" sheet if it already exists.
with pd.ExcelWriter("urls.xlsx", engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    df.to_excel(writer, sheet_name="2020 - 2024", index=False)

## Scraping Publication Dates from the URLs

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_article_date(url, timeout=10):
    """Fetch a web page and return the publication date found in its meta tags.

    Parameters
    ----------
    url : str
        Address of the article to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so one stalled
        site cannot hang the whole scraping loop.

    Returns
    -------
    str
        The ``content`` attribute of the first matching ``<meta>`` tag, tried
        in this order: ``property="article:published_time"``,
        ``name="cXenseParse:recs:publishtime"``,
        ``name="article:published_time"``.
        Returns ``"failed to retrieve the webpage: <error>"`` when the request
        fails or the response has an error status, and ``"date not found"``
        when no tag yields a non-empty value.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"failed to retrieve the webpage: {e}"

    soup = BeautifulSoup(response.content, 'html.parser')

    # Attribute filters for the meta tags that may carry the date, in priority order.
    meta_selectors = (
        {'property': 'article:published_time'},
        {'name': 'cXenseParse:recs:publishtime'},
        {'name': 'article:published_time'},
    )

    for attrs in meta_selectors:
        tag = soup.find('meta', attrs=attrs)
        # .get avoids a KeyError when the tag exists but has no content
        # attribute; an empty value falls through to the next selector.
        content = tag.get('content') if tag is not None else None
        if content:
            return content

    # if not found
    return "date not found"

In [25]:
excel_file = 'urls.xlsx'
sheet_name = '2020 - 2024'
df_existing = pd.read_excel(excel_file, sheet_name=sheet_name)

urls = df_existing['URLs'].tolist()

# Look up one date per row, in row order.
date_data = []
for url in urls:
    date = get_article_date(url)
    print(date)
    date_data.append({'URLs': url, 'Date': date})

# columns= keeps both columns present even when the sheet has no rows.
df_date = pd.DataFrame(date_data, columns=['URLs', 'Date'])

# Attach the dates by position instead of merging on 'URLs': a merge
# multiplies rows when a URL repeats, and yields Date_x/Date_y when the sheet
# already has a 'Date' column from an earlier run. assign() adds or overwrites
# 'Date' in place, so this cell can be re-run safely.
df_updated = df_existing.assign(Date=df_date['Date'].tolist())

with pd.ExcelWriter(excel_file, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    df_updated.to_excel(writer, sheet_name=sheet_name, index=False)

2022-02-01T05:00:00+08:00
2021-07-20T05:00:00+08:00
2023-08-18T21:20:00+08:00
2021-04-16T22:30:39+08:00
2023-10-03T14:29:33+08:00
2022-01-24T17:06:40+08:00
2023-08-03T22:21:43+08:00
2023-05-10T21:45:00+08:00
2020-05-12T16:00:00+08:00
2021-06-06T05:00:00+08:00
2020-10-09T18:00:00+08:00
2023-03-09T05:00:00+08:00
2023-02-14T19:20:55+08:00
2023-12-13T05:00:00+08:00
2023-09-24T05:00:00+08:00
2023-10-16T10:24:53+08:00
2023-10-26T16:00:00+08:00
2023-10-16T08:00:00+08:00
2024-05-10T07:24:00+08:00
2024-04-12T07:37:00+08:00
2024-05-23T08:00:00+08:00
2023-02-15T13:01:20+08:00
2023-01-27T11:36:55+08:00
2023-12-11T07:14:11+08:00
2023-09-26T13:53:26+08:00
2024-04-26T13:00:00+08:00
2023-11-22T08:00:00+08:00
2024-06-25T19:40:00+08:00
2024-04-17T13:34:20+08:00
2020-06-30T11:27:00+08:00
2020-01-07T13:48:02+08:00
2021-01-27T04:00:00+08:00
2021-08-30T20:14:50+08:00
2022-03-28T18:04:40+08:00
2022-02-23T14:40:22+08:00
2020-10-16T12:54:21+08:00
2020-10-26T15:16:12+08:00
2023-08-01T14:45:00+08:00
2022-01-24T1

In [28]:
# Re-read the sheet that now carries the scraped dates and list the rows for
# which no publication-date meta tag was found. Rows whose download failed
# hold a "failed to retrieve the webpage: ..." message and are not selected.
excel_file = 'urls.xlsx'
date_df = pd.read_excel(excel_file, sheet_name='2020 - 2024')

missing_date_mask = date_df['Date'].eq("date not found")
filtered_df = date_df.loc[missing_date_mask]
filtered_df

Unnamed: 0,URLs,Source,Quarter,Date
89,https://www.edgeprop.sg/property-news/weaker-i...,edgeprop,2023 Q2,date not found
90,https://www.edgeprop.sg/property-news/value-bu...,edgeprop,2021 Q3,date not found
91,https://www.edgeprop.sg/property-news/two-indu...,edgeprop,2022 Q4,date not found
92,https://www.edgeprop.sg/property-news/two-indu...,edgeprop,2020 Q2,date not found
93,https://www.edgeprop.sg/property-news/top-mark...,edgeprop,2022 Q4,date not found
94,https://www.edgeprop.sg/property-news/tj-choo-...,edgeprop,2023 Q4,date not found
95,https://www.edgeprop.sg/property-news/tech-lay...,edgeprop,2022 Q4,date not found
96,https://www.edgeprop.sg/property-news/stellar-...,edgeprop,2021 Q4,date not found
97,https://www.edgeprop.sg/property-news/singapor...,edgeprop,2023 Q3,date not found
98,https://www.edgeprop.sg/property-news/singapor...,edgeprop,2020 Q1,date not found


Unnamed: 0,URLs,Source,Quarter,Date
0,https://www.straitstimes.com/tech/tech-news/si...,straits times,2022 Q1,2022-02-01T05:00:00+08:00
1,https://www.straitstimes.com/tech/how-ai-and-d...,straits times,2021 Q3,2021-07-20T05:00:00+08:00
2,https://www.straitstimes.com/singapore/using-p...,straits times,2023 Q3,2023-08-18T21:20:00+08:00
3,https://www.straitstimes.com/singapore/study-t...,straits times,2021 Q2,2021-04-16T22:30:39+08:00
4,https://www.straitstimes.com/singapore/politic...,straits times,2023 Q4,2023-10-03T14:29:33+08:00
...,...,...,...,...
304,https://sbr.com.sg/commercial-property-residen...,singapore business review,2021 Q3,2021-09-14T08:54:37+0800
305,https://sbr.com.sg/co-written-partner/more-new...,singapore business review,2020 Q3,2020-08-14T09:50:50+0800
306,https://sbr.com.sg/co-written-partner/more-new...,singapore business review,2020 Q4,2020-11-03T10:52:43+0800
307,https://sbr.com.sg/co-written-partner/event-ne...,singapore business review,2022 Q2,2022-05-12T10:00:00+0800
