In [22]:
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import pickle

In [3]:
# Search terms used to detect Sweden-related articles.
# Several terms per line; the elements and their order are the same as before.
keywords_sweden = [
    "Uppsala", "Sweden", "Swedish", "Stockholm", "Scandinavia", "Nordics",
    "Nordic", "Riksdag", "IKEA", "Volvo", "Saab", "Ericsson",
    "H&M", "Spotify", "Skype", "Swede", "Swedes", "Gotland",
    "Sverige", "Svenska", "SVT", "Lund", "Malmö", "Gothenburg",
    "Linköping", "Kiruna", "Örebro", "Västerås", "Skåne", "Lapland",
    "Norrland", "Sami", "Öresund", "Allemansrätten", "ABBA", "Nobel",
    "Vasa Museum", "Smörgåsbord", "Fika", "Köttbullar", "Midsommar", "Göta Canal",
    "Liseberg", "Stieg Larsson", "Wallander", "Ingmar Bergman", "Swedish krona", "Kungliga",
    "Stockholm Archipelago", "Arlanda", "Swedish Armed Forces", "Folkhälsomyndigheten",
    "Karolinska Institute", "Stockholm School of Economics", "SAS Group", "Vattenfall",
    "Electrolux",
]



In [25]:
# Search terms used to detect Pakistan-related articles.
# Each term is listed exactly once: a repeated entry would be reported and
# stored twice for every article that mentions it.
keywords_pakistan = [
    "Pakistan",
    "Pakistani",
    "Gwadar",
    "Karachi",
    "Islamabad",
    "Lahore",
    "Quetta",
    "Peshawar",
    "Gilgit-Baltistan",
    "Azad Jammu and Kashmir",
    "Sindh",
    "Balochistan",
    "Punjab",
    "CPEC",
    "Gwadar Port",
    "Thar Coal",
    "Karakoram Highway",
    "ML-1 Railway",
    "Port Qasim",
    "Karot Hydropower",
    "Fauji Fertilizer",
    "Engro",
    "Habib Bank",
    "K-Electric",
    "WAPDA",
    "PIA",
    "OGDCL",
    "PSO",
    "Quaid-e-Azam University",
    "COMSATS",
    "Pak-China",
    "Pakistan-China",
]


In [26]:
keywords = keywords_pakistan

# Root directory that is searched recursively for downloaded .html articles.
# The default is the original local checkout; set MINING_OF_SOCIAL_DATA_ROOT
# to run the notebook from a different location.
root_dir = Path(os.environ.get(
    "MINING_OF_SOCIAL_DATA_ROOT",
    r"C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data",
))

# NOTE: this cell used to open the file below in write mode without ever
# writing to it, which only truncated the file on every run. The unused handle
# was removed; the path is kept in case another cell refers to it.
output_file_path = Path("china_content.txt")


def clean_text(input_text):
    """Return `input_text` without any character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', input_text)


def _as_token_string(raw_text):
    """Lower-case `raw_text`, strip punctuation and return its tokens joined by
    single spaces, with one leading and one trailing space.

    The padding lets a plain substring test match whole tokens (or whole token
    sequences) only.
    """
    return ' ' + ' '.join(clean_text(raw_text.lower()).split()) + ' '


def _joined_text(elements):
    """Concatenate the visible text of the given BeautifulSoup elements, separated by spaces."""
    return ' '.join(element.get_text(separator=' ', strip=True) for element in elements)


def _find_keywords(text, needles):
    """Return the keywords whose normalised form occurs in `text` as whole words.

    `needles` is a list of (keyword, normalised keyword) pairs. Keywords are
    normalised exactly like the text, so multi-word terms ("Gwadar Port") and
    terms containing punctuation ("Gilgit-Baltistan") can match; comparing the
    raw keyword against single punctuation-free tokens could never match them.
    """
    haystack = _as_token_string(text)
    return [keyword for keyword, needle in needles if needle in haystack]


def _extract_record(html_file, needles):
    """Parse one article page and return its record dict, or None if no keyword matches.

    Pages without an element with id 'Content' have no article text and can
    therefore never match a keyword, so they are skipped early.
    """
    soup = BeautifulSoup(html_file.read_text(encoding='utf-8'), 'html.parser')

    content_element = soup.find(id='Content')
    if content_element is None:
        return None

    # Article text: all paragraphs inside the content element
    text = _joined_text(content_element.find_all('p'))

    matched_keywords = _find_keywords(text, needles)
    if not matched_keywords:
        return None
    for keyword in matched_keywords:
        print(f"Keyword '{keyword}' found in file: {html_file}")

    # URL; .get() yields None instead of raising when the tag has no 'content' attribute
    meta_tag = soup.find('meta', property='og:url')
    url = meta_tag.get('content') if meta_tag else None

    # Image links and captions: the first <img> / <figcaption> of every figure.
    # The two lists are built independently and may differ in length.
    figures = content_element.find_all('figure')
    img_links = []
    fig_captions = []
    for fig in figures:
        image = fig.find('img')
        if image and 'src' in image.attrs:
            img_links.append(image['src'])
        caption = fig.find('figcaption')
        if caption:
            fig_captions.append(caption.get_text(separator=' ', strip=True))

    # NOTE(review): this pattern removes a literal backslash followed by 'x' and
    # two hex digits. If the goal was to remove characters such as U+00A0, it
    # does not do that - confirm the intent.
    stored_text = re.sub(r'\\x[0-9A-Fa-f]{2}', '', text)

    return {
        "keyword": matched_keywords,
        "file": html_file,
        "title": _joined_text(soup.find_all('h1')),
        "date": _joined_text(soup.find_all(class_='info_l')),  # raw date/time (and byline) line
        "url": url,
        "text": stored_text,
        "img_links": img_links,
        "fig_captions": fig_captions,
    }


# Normalise every keyword once instead of once per file; dict.fromkeys drops
# repeated keywords while keeping their order, and keywords that consist only
# of punctuation are ignored.
keyword_needles = []
for keyword in dict.fromkeys(keywords):
    if clean_text(keyword.lower()).split():
        keyword_needles.append((keyword, _as_token_string(keyword)))

data = []

# Loop through all .html files recursively. Sorting makes the record order
# reproducible, because the order produced by rglob depends on the file system.
for html_file in sorted(root_dir.rglob("*.html")):
    record = _extract_record(html_file, keyword_needles)
    if record is not None:
        data.append(record)
                


Keyword 'Pakistan' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data\chinadaily\201404\03\WS5a169521a31040ac000dd16d.html
Keyword 'Pakistani' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data\chinadaily\201404\03\WS5a169521a31040ac000dd16d.html
Keyword 'Karachi' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data\chinadaily\201404\03\WS5a169521a31040ac000dd16d.html
Keyword 'Punjab' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data\chinadaily\201404\03\WS5a169521a31040ac000dd16d.html
Keyword 'Pakistan' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\Mining_of_Social_Data\chinadaily\201908\02\WS5d43daaca310cf3e35563916_2.html
Keyword 'Pakistan' found in file: C:\Users\ondre\Desktop\semester3\Mining of Social Data\project\github\M

In [27]:
# Build the DataFrame from the collected list of record dicts (one row per record)
df_pakistan = pd.DataFrame(data=data)

In [149]:
# General repair of the scraped metadata


def _extract_updated(info_line):
    """Return everything after the first 'Updated: ' in `info_line`.

    Returns None when the marker is absent, so a single unusual row no longer
    aborts the whole cell with a ValueError.
    """
    marker = 'Updated: '
    start = info_line.find(marker)
    if start == -1:
        return None
    return info_line[start + len(marker):]


def _extract_author(info_line):
    """Return the author taken from the part of `info_line` before the first '|'.

    A leading 'By ' marker is removed when it is present in that part. Returns
    None when the line contains no '|'. Only the text before the separator is
    searched for 'By ', so a 'By ' that occurs after the '|' no longer yields an
    empty author.
    """
    byline, separator, _ = info_line.partition('|')
    if not separator:
        return None
    marker = 'By '
    start = byline.find(marker)
    if start != -1:
        byline = byline[start + len(marker):]
    return byline.strip()


# Store the matched keywords of each row as a tuple instead of a list
df_pakistan['keyword'] = df_pakistan['keyword'].apply(tuple)

dates = [_extract_updated(row) for row in df_pakistan['date']]
df_pakistan['date_cleaned'] = dates

authors = [_extract_author(row) for row in df_pakistan['date']]
df_pakistan['author'] = authors

In [129]:
# Link repairs for Sweden

# Values in the 'url' column that are only a file name, mapped to the full URL
# that replaces them
sweden_url_repairs = {
    'WS5d0836eda3103dbf14328c36.html': 'https://www.chinadaily.com.cn/a/201906/18/WS5d0836eda3103dbf14328c36_1.html',
    'WS673f28c1a310f1265a1cee83.html': 'https://www.chinadaily.com.cn/a/202411/21/WS673f28c1a310f1265a1cee83.html',
    'WS6746dc95a310f1265a1cfead.html': 'https://www.chinadaily.com.cn/a/202411/27/WS6746dc95a310f1265a1cfead.html',
    'WS674a727ca310f1265a1d05ce.html': 'https://www.chinadaily.com.cn/a/202411/30/WS674a727ca310f1265a1d05ce.html',
    'WS674d5da3a310f1265a1d0984.html': 'https://www.chinadaily.com.cn/a/202412/02/WS674d5da3a310f1265a1d0984.html',
}

# Update the 'url' column, one exact match at a time
for bare_name, full_url in sweden_url_repairs.items():
    df.loc[df['url'] == bare_name, 'url'] = full_url


In [None]:
# Link repairs for Pakistan


def _url_from_local_path(local_path):
    """Rebuild an article URL from the part of `local_path` after 'chinadaily/'.

    Backslashes are converted to forward slashes first, so both Windows-style
    and POSIX-style paths are handled. Returns None when the path has no
    'chinadaily' directory component.
    """
    normalised = str(local_path).replace('\\', '/')
    marker = 'chinadaily/'
    start = normalised.find(marker)
    if start == -1:
        return None
    return 'https://www.chinadaily.com.cn/a/' + normalised[start + len(marker):]


# Rows whose url is missing (None/NaN) or does not contain 'http'. The
# isinstance check avoids the TypeError that "'http' not in None" would raise.
rows_to_repair = [
    row_label
    for row_label, url in df_pakistan['url'].items()
    if not (isinstance(url, str) and 'http' in url)
]

# Use the real index labels with .loc rather than enumerate() positions, which
# are only valid labels for a default RangeIndex.
for row_label in rows_to_repair:
    rebuilt_url = _url_from_local_path(df_pakistan.loc[row_label, 'file'])
    if rebuilt_url is not None:
        df_pakistan.loc[row_label, 'url'] = rebuilt_url


In [None]:

# Remove the raw 'date' column from the Pakistan frame
df_pakistan = df_pakistan.drop(columns='date')

In [155]:
# Remove the raw 'date' column from df
df = df.drop(columns='date')

In [176]:
# Some articles are carousels spread over several pages: collapse all rows that
# share a title into a single row.
def _join_pages(page_texts):
    """Concatenate the text values of one title group, separated by single spaces."""
    return ' '.join(page_texts)


# Every column keeps the value of the first row in its group ...
carried_columns = [
    'keyword', 'file', 'url', 'text',
    'img_links', 'fig_captions', 'date_cleaned', 'author',
]
aggregations = {column: 'first' for column in carried_columns}
# ... except 'text', whose values are concatenated (re-assigning the existing
# key keeps its position, so the output column order is unchanged).
aggregations['text'] = _join_pages

merged_df = df.groupby('title', as_index=False).agg(aggregations)

In [178]:
# Destination of the pickled, merged DataFrame
file_path = "chinadaily_sweden_cleaned.pkl"

# Serialise merged_df into that file with pickle.dump
with Path(file_path).open("wb") as pickle_sink:
    pickle.dump(merged_df, pickle_sink)

In [None]:
# Improvement suggestions

# Only article pages that contain at least one keyword are kept; carousel pages without any keyword are dropped, so the merged text of a multi-page article can be incomplete.
# "Nobel" produces false positives when it is mentioned in contexts such as the Nobel Prize in Literature, the Nobel Prize in Physics, etc.