# Data Cleaning

In [44]:
%pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


# Text Cleaning

In [45]:
import os

import pandas as pd

# Source workbook and the worksheet that this run cleans.
excel_file = 'texts.xlsx'
sheet_name = '2010 - 2014'

# Load the worksheet; the cells below read its 'URLs' and 'Text' columns.
df = pd.read_excel(io=excel_file, sheet_name=sheet_name)

from urllib.parse import urlparse

def extract_domain(url):
    """Return the network-location (host) part of ``url``.

    Parameters
    ----------
    url : str
        An absolute URL such as ``'https://sbr.com.sg/some/article'``.

    Returns
    -------
    str
        ``urlparse(url).netloc``, or ``''`` when ``url`` cannot be parsed
        (for example a float NaN from an empty cell, or a malformed URL).
    """
    try:
        return urlparse(url).netloc
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. an unterminated IPv6 literal).
        # TypeError / AttributeError: non-string input such as a float NaN.
        # Only these are handled so that KeyboardInterrupt, SystemExit and
        # genuine programming errors propagate instead of being swallowed.
        return ''

# Tag every row with the host of its URL so each news site can be filtered out.
df['Domain'] = df['URLs'].map(extract_domain)

## Singapore Business Review

In [46]:
# Tag rows by domain and take an independent copy of the Singapore Business
# Review rows, so the edits below change that copy rather than a slice of
# ``df`` (assigning into a slice triggers pandas' SettingWithCopyWarning).
df['Domain'] = df['URLs'].apply(extract_domain)
filtered_df = df[df['Domain'] == 'sbr.com.sg'].copy()

# remove heading
# Matched as a literal prefix rather than a '^'-anchored regex: the banner
# contains '.' characters, which a regex would treat as "any character".
remove_text = "Singapore Business Review website works best with Javascript enabled. Please enable your javascript and reload the page."
filtered_df['Text'] = filtered_df['Text'].str.removeprefix(remove_text)

# remove ending
# The fragments are joined without separators; presumably this mirrors the
# scraped page text exactly (spelling included) -- verify before "correcting" it.
remove_text = ("..there are many ways you can work with us to advertise your company and connect to your customers."
               "Our team can help you dight and create an advertising campaign, in print and digital, on this website and in print magazine."
               "We can also organize a real life or digital event for you and find thought leader speakers as well as industry leaders, who could be your potential partners, to join the event."
               "We also run some awards programmes which give you an opportunity to be recognized for your achievements during the year and you can join this as a participant or a sponsor."
               "Let us help you drive your business forward with a good partnership!"
               "Copyright 2024 Charlton Media Group."
               "Web Design by: Halcyon Web DesignCopyright 2024 Charlton Media Group.Web Design by: Halcyon Web Design")

# regex=False makes the literal match explicit; pandas < 2.0 defaulted to
# regex=True, which would interpret the punctuation as regex syntax.
filtered_df['Text'] = filtered_df['Text'].str.replace(remove_text, '', regex=False)

# Dump the cleaned subset to a scratch workbook (overwritten by each section).
filtered_df.to_excel('data_cleaning.xlsx', index=False)

In [47]:
# Merge the cleaned texts back into ``df`` and save the sheet to
# texts_cleaned.xlsx (the source workbook texts.xlsx is not modified).
cleaned_dict = filtered_df.set_index('URLs')['Text'].to_dict()

# Rows whose URL was cleaned get the new text; every other row keeps its own.
# A plain comprehension does the same lookup as a row-wise ``df.apply`` without
# building a Series per row.
df['Text'] = [
    cleaned_dict.get(url, text) for url, text in zip(df['URLs'], df['Text'])
]

# 'Domain' is only a helper column; errors='ignore' keeps this cell re-runnable
# after the column has already been dropped.
df = df.drop(columns=['Domain'], errors='ignore')

# Append mode (replacing just this sheet) requires an existing workbook, and
# pandas rejects ``if_sheet_exists`` outside append mode, so fall back to a
# plain write when texts_cleaned.xlsx has not been created yet.
output_file = 'texts_cleaned.xlsx'
writer_options = (
    {'mode': 'a', 'if_sheet_exists': 'replace'}
    if os.path.exists(output_file)
    else {'mode': 'w'}
)
with pd.ExcelWriter(output_file, engine="openpyxl", **writer_options) as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False)

## Business Times

In [48]:
# Tag rows by domain and take an independent copy of the Business Times rows,
# so the edits below change that copy rather than a slice of ``df``
# (assigning into a slice triggers pandas' SettingWithCopyWarning).
df['Domain'] = df['URLs'].apply(extract_domain)
filtered_df = df[df['Domain'] == 'www.businesstimes.com.sg'].copy()

# remove heading
# A literal prefix match; no regex is needed for a fixed string.
remove_text = "Login"
filtered_df['Text'] = filtered_df['Text'].str.removeprefix(remove_text)

# remove the last 301 characters
# NOTE(review): presumably 301 is the length of the site's fixed footer --
# confirm. The trim is unconditional, so texts of 301 characters or fewer
# become empty strings.
BUSINESS_TIMES_FOOTER_CHARS = 301
filtered_df['Text'] = filtered_df['Text'].str[:-BUSINESS_TIMES_FOOTER_CHARS]

# Dump the cleaned subset to a scratch workbook (overwritten by each section).
filtered_df.to_excel('data_cleaning.xlsx', index=False)

In [49]:
# Merge the cleaned texts back into ``df`` and save the sheet to
# texts_cleaned.xlsx (the source workbook texts.xlsx is not modified).
cleaned_dict = filtered_df.set_index('URLs')['Text'].to_dict()

# Rows whose URL was cleaned get the new text; every other row keeps its own.
# A plain comprehension does the same lookup as a row-wise ``df.apply`` without
# building a Series per row.
df['Text'] = [
    cleaned_dict.get(url, text) for url, text in zip(df['URLs'], df['Text'])
]

# 'Domain' is only a helper column; errors='ignore' keeps this cell re-runnable
# after the column has already been dropped.
df = df.drop(columns=['Domain'], errors='ignore')

# Append mode (replacing just this sheet) requires an existing workbook, and
# pandas rejects ``if_sheet_exists`` outside append mode, so fall back to a
# plain write when texts_cleaned.xlsx has not been created yet.
output_file = 'texts_cleaned.xlsx'
writer_options = (
    {'mode': 'a', 'if_sheet_exists': 'replace'}
    if os.path.exists(output_file)
    else {'mode': 'w'}
)
with pd.ExcelWriter(output_file, engine="openpyxl", **writer_options) as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False)

## Straits Times

In [50]:
# Tag rows by domain and take an independent copy of the Straits Times rows,
# so the edit below changes that copy rather than a slice of ``df``
# (assigning into a slice triggers pandas' SettingWithCopyWarning).
df['Domain'] = df['URLs'].apply(extract_domain)
filtered_df = df[df['Domain'] == 'www.straitstimes.com'].copy()

# Footer boilerplate to strip. The fragments are joined without separators;
# presumably this mirrors the scraped page text exactly -- verify before editing.
remove_text = ("Join ST's Telegram channel and get the latest breaking news delivered to you."
               "Read 3 articles and stand to win rewards"
               "Spin the wheel now"
               "MCI (P) 066/10/2023. Published by SPH Media Limited, Co. Regn. No. 202120748H. "
               "Copyright 2024 SPH Media Limited. All rights reserved.")

# regex=False is required for a literal match: under pandas < 2.0 the default
# was regex=True, where "(P)" is a capture group and the footer never matches.
filtered_df['Text'] = filtered_df['Text'].str.replace(remove_text, '', regex=False)

# Dump the cleaned subset to a scratch workbook (overwritten by each section).
filtered_df.to_excel('data_cleaning.xlsx', index=False)

In [51]:
# Merge the cleaned texts back into ``df`` and save the sheet to
# texts_cleaned.xlsx (the source workbook texts.xlsx is not modified).
cleaned_dict = filtered_df.set_index('URLs')['Text'].to_dict()

# Rows whose URL was cleaned get the new text; every other row keeps its own.
# A plain comprehension does the same lookup as a row-wise ``df.apply`` without
# building a Series per row.
df['Text'] = [
    cleaned_dict.get(url, text) for url, text in zip(df['URLs'], df['Text'])
]

# 'Domain' is only a helper column; errors='ignore' keeps this cell re-runnable
# after the column has already been dropped.
df = df.drop(columns=['Domain'], errors='ignore')

# Append mode (replacing just this sheet) requires an existing workbook, and
# pandas rejects ``if_sheet_exists`` outside append mode, so fall back to a
# plain write when texts_cleaned.xlsx has not been created yet.
output_file = 'texts_cleaned.xlsx'
writer_options = (
    {'mode': 'a', 'if_sheet_exists': 'replace'}
    if os.path.exists(output_file)
    else {'mode': 'w'}
)
with pd.ExcelWriter(output_file, engine="openpyxl", **writer_options) as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False)

## Channel News Asia

In [52]:
# Tag rows by domain and select the Channel News Asia articles.
# No cleaning rules are applied to this subset in this cell.
df['Domain'] = df['URLs'].apply(extract_domain)
cna_rows = df['Domain'] == 'www.channelnewsasia.com'
filtered_df = df[cna_rows]

## EdgeProp

In [53]:
import requests
from bs4 import BeautifulSoup

def scrape_text(urls, timeout=30):
    """Download each URL and extract the article text from the page.

    Parameters
    ----------
    urls : iterable of str
        Page URLs to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 30). Without a timeout
        a single unresponsive server would block the loop indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``'URLs'`` and ``'Text'``, present even when nothing was
        collected. Per URL:

        * HTTP 200 with matching text blocks: the joined block text.
        * HTTP 200 without matching blocks: no row is emitted.
        * Any other status: ``'failed to retrieve content'``.
        * An exception while fetching or parsing: the exception message.
    """
    results = []
    for index, url in enumerate(urls):
        print(f"processing URL {index}: {url}")
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # NOTE(review): 'jsx-213751841' looks like a build-generated
                # class name; the selector may stop matching if the site is
                # redeployed -- confirm against the live markup.
                text_blocks = soup.find_all('div', class_='jsx-213751841 truncated_textview_box')

                if text_blocks:
                    text = ' '.join(block.get_text(strip=True) for block in text_blocks)
                    results.append({'URLs': url, 'Text': text})
            else:
                results.append({'URLs': url, 'Text': 'failed to retrieve content'})
        except Exception as e:
            # Best effort: record the error for this URL and continue with the rest.
            results.append({'URLs': url, 'Text': str(e)})
    # Explicit columns keep the schema stable when ``results`` is empty, so
    # callers can always rely on the 'URLs' and 'Text' columns.
    return pd.DataFrame(results, columns=['URLs', 'Text'])

In [54]:
# Tag rows by domain and select the EdgeProp articles.
df['Domain'] = df['URLs'].apply(extract_domain)
edgeprop_rows = df['Domain'] == 'www.edgeprop.sg'
filtered_df = df[edgeprop_rows]

# Fetch the text for every EdgeProp URL, then dump the scraped result to the
# scratch workbook.
urls = filtered_df['URLs'].tolist()
scraped_df = scrape_text(urls)
scraped_df.to_excel('data_cleaning.xlsx', index=False)

processing URL 0: https://www.edgeprop.sg/property-news/sitting-goldmine-%E2%80%93-landed-housing-segment
processing URL 1: https://www.edgeprop.sg/property-news/competition-heats-farrer-road-leedon-road
processing URL 2: https://www.edgeprop.sg/property-news/ho-bee%E2%80%99s-metropolis
processing URL 3: https://www.edgeprop.sg/property-news/new-lease-%E2%80%98awesomeness%E2%80%99
processing URL 4: https://www.edgeprop.sg/property-news/good-class-bungalow-35-mil
processing URL 5: https://www.edgeprop.sg/property-news/prudential-tower-strata-unit-sale-2800-psf
processing URL 6: https://www.edgeprop.sg/property-news/jewel-rangoon-road
processing URL 7: https://www.edgeprop.sg/property-news/shadow-d%E2%80%99leedon
processing URL 8: https://www.edgeprop.sg/property-news/pricing-shock-iskandar%E2%80%99s-puteri-harbour


In [55]:
# Merge the scraped texts back into ``df`` and save the sheet to
# texts_cleaned.xlsx (the source workbook texts.xlsx is not modified).
# NOTE(review): for URLs that could not be scraped, ``scraped_df`` holds an
# error message in 'Text', and that message replaces the original text here --
# confirm this is intended.
cleaned_dict = scraped_df.set_index('URLs')['Text'].to_dict()

# Rows whose URL was scraped get the new text; every other row keeps its own.
# A plain comprehension does the same lookup as a row-wise ``df.apply`` without
# building a Series per row.
df['Text'] = [
    cleaned_dict.get(url, text) for url, text in zip(df['URLs'], df['Text'])
]

# 'Domain' is only a helper column; errors='ignore' keeps this cell re-runnable
# after the column has already been dropped.
df = df.drop(columns=['Domain'], errors='ignore')

# Append mode (replacing just this sheet) requires an existing workbook, and
# pandas rejects ``if_sheet_exists`` outside append mode, so fall back to a
# plain write when texts_cleaned.xlsx has not been created yet.
output_file = 'texts_cleaned.xlsx'
writer_options = (
    {'mode': 'a', 'if_sheet_exists': 'replace'}
    if os.path.exists(output_file)
    else {'mode': 'w'}
)
with pd.ExcelWriter(output_file, engine="openpyxl", **writer_options) as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False)

## Remove Whitespaces

In [56]:
# Re-load the sheet from the cleaned workbook.
excel_file = 'texts_cleaned.xlsx'
sheet_name = '2010 - 2014'
df = pd.read_excel(io=excel_file, sheet_name=sheet_name)

# remove leading and trailing whitespaces
stripped_text = df['Text'].str.strip()
df['Text'] = stripped_text
print(df)

# collapse every run of whitespace inside the text into a single space
df.loc[:, 'Text'] = df['Text'].str.replace(pat=r'\s+', repl=' ', regex=True)

# Write the sheet back into the same workbook, replacing the existing sheet.
append_options = dict(engine='openpyxl', mode='a', if_sheet_exists='replace')
with pd.ExcelWriter(excel_file, **append_options) as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False)

                                                 URLs  \
0   https://sbr.com.sg/information-technology/more...   
1   https://www.businesstimes.com.sg/incoming/ura-...   
2   https://www.businesstimes.com.sg/incoming/hdb-...   
3   https://www.edgeprop.sg/property-news/sitting-...   
4   https://www.businesstimes.com.sg/incoming/lawm...   
..                                                ...   
73  https://www.edgeprop.sg/property-news/good-cla...   
74  https://www.edgeprop.sg/property-news/prudenti...   
75  https://www.edgeprop.sg/property-news/jewel-ra...   
76  https://www.edgeprop.sg/property-news/shadow-d...   
77  https://www.edgeprop.sg/property-news/pricing-...   

                         Date  \
0    2010-05-21T20:30:40+0800   
1   2013-11-25T22:00:00+08:00   
2   2013-10-14T22:00:00+08:00   
3   2014-03-01T00:00:00+08:00   
4   2014-03-04T22:00:00+08:00   
..                        ...   
73  2014-10-13T00:00:00+08:00   
74  2014-10-20T00:00:00+08:00   
75  2014-10-27T10: