Methods for data cleaning

In [8]:
from bs4 import BeautifulSoup

def remove_html_and_script(text):
    """Return the text content of an HTML fragment.

    The input is parsed with BeautifulSoup's "html.parser". Every <script>
    and <style> element is removed together with its contents, and the text
    of what remains is returned without any whitespace stripping.
    """
    parsed = BeautifulSoup(text, "html.parser")

    # Drop embedded code/CSS nodes so their source never ends up in the text.
    for node in parsed.find_all(["script", "style"]):
        node.decompose()

    return parsed.get_text()



In [9]:
import re

# Four-hex-digit code of a literal "\uXXXX" escape (lower case) -> replacement
# text. An empty string means the symbol is removed from the text.
_UNICODE_REPLACEMENTS = {
    '2018': '‘',  # Left single quote
    '2019': '’',  # Right single quote
    '201c': '“',  # Left double quote
    '201d': '”',  # Right double quote
    '201e': '',   # Double low-9 quotation mark (removed)
    '2013': '–',  # En dash
    '2014': '—',  # Em dash
    '2022': '•',  # Bullet
    '2026': '…',  # Ellipsis
    '00a0': ' ',  # Non-breaking space
    '00b7': '·',  # Middle dot
    '00e9': 'é',  # e acute
    '00e2': 'â',  # a circumflex
    '00e0': 'à',  # a grave
    '00e8': 'è',  # e grave
    '00e7': 'ç',  # c cedilla
    '00f4': 'ô',  # o circumflex
    '00fb': 'û',  # u circumflex
    '00ee': 'î',  # i circumflex
    '00ef': 'ï',  # i diaeresis
    '00e4': 'ä',  # a umlaut
    '00f6': 'ö',  # o umlaut
    '00fc': 'ü',  # u umlaut
    '00df': 'ß',  # sharp s
    '2082': '₂',  # subscript 2
    '2083': '₃',  # subscript 3
    '2122': '™',  # Trade mark sign
    '20ac': '€',  # Euro sign
    '0130': '',   # Latin capital I with dot above (removed)
    '267b': '',   # Recycling symbol (removed)
    'fe0f': '',   # Variation selector (removed)
    '2744': '',   # Snowflake (removed)
    '27a1': '',   # Rightwards arrow (removed)
    '23f0': '',   # Alarm clock (removed)
    '26a1': '',   # High voltage sign (removed)
    '25b6': '',   # Right-pointing triangle, used for video/play buttons (removed)
    '2b05': '',   # Leftwards arrow (removed)
}

# First alternative: a complete escape (backslash, "u", four hex digits).
# Second alternative: a dangling fragment "\u2", "\u2X" or "\u2XX" that is NOT
# followed by another hex digit, i.e. it can never be part of a complete escape.
_UNICODE_ESCAPE_RE = re.compile(
    r'\\u(?:([0-9a-fA-F]{4})|2[0-9a-fA-F]{0,2}(?![0-9a-fA-F]))'
)


def unicode_handling(text):
    r"""Replace literal ``\uXXXX`` escape text with readable characters.

    The text is scanned once, left to right:

    * A complete escape listed in ``_UNICODE_REPLACEMENTS`` is replaced by its
      mapped character (or removed when mapped to ``''``). Hex digits are
      matched case-insensitively.
    * A complete escape that is not listed is returned unchanged, so a later
      step (e.g. ``remove_matches``) can strip it as a whole.
    * A dangling fragment starting with ``\u2`` (fewer than four hex digits)
      is removed. The previous implementation dropped ``\u2`` and ``\u201``
      unconditionally -- presumably to clean up escapes cut off by truncated
      text -- but because replacements ran one key after another this also
      chopped complete escapes such as ``\u201e`` or ``\u26a1`` into stray
      hex digits. Matching fragments only when no hex digit follows keeps the
      clean-up without that corruption.

    Parameters:
        text (str): Text that may contain literal backslash-u escapes.

    Returns:
        str: The text with known escapes translated.
    """
    def _substitute(match):
        hex_digits = match.group(1)
        if hex_digits is None:
            # Second alternative matched: incomplete "\u2..." fragment.
            return ''
        # Unknown complete escapes are kept verbatim.
        return _UNICODE_REPLACEMENTS.get(hex_digits.lower(), match.group(0))

    return _UNICODE_ESCAPE_RE.sub(_substitute, text)

In [10]:
import re


def remove_matches(text):
    """Strip every literal Unicode escape sequence from ``text``.

    Both the short form (backslash, lower-case ``u``, four hex digits) and
    the long form (backslash, upper-case ``U``, eight hex digits) are deleted;
    all other characters are returned untouched.
    """
    escape_sequences = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
    return escape_sequences.sub('', text)

In [11]:
from datetime import datetime

def extract_date_ddmmyyyy(iso_datetime: str) -> str:
    """
    Reduce an ISO 8601 datetime string to its calendar date.

    NOTE: despite the function name, the result is formatted as 'yyyy-mm-dd'
    (strftime "%Y-%m-%d"), not 'dd-mm-yyyy'.

    Parameters:
        iso_datetime (str): An ISO 8601 datetime string (e.g., '2025-04-16T04:31:13Z').

    Returns:
        str: The date in 'yyyy-mm-dd' format (e.g., '2025-04-16').

    Raises:
        ValueError: If the string cannot be parsed by datetime.fromisoformat.
    """
    # Rewrite "Z" as an explicit UTC offset before parsing;
    # datetime.fromisoformat only accepts the "Z" suffix from Python 3.11 on.
    normalized = iso_datetime.replace("Z", "+00:00")
    parsed = datetime.fromisoformat(normalized)
    return parsed.strftime("%Y-%m-%d")


API request


In [4]:
import requests
import json

In [5]:
# API query to test functionality (single request, no pagination)

import os

# API base URL and static parameters.
# Credentials are read from environment variables so they never live in the
# notebook; a missing variable raises KeyError before any request is sent.
# Required: I2D_API_ID, I2D_AUTH_KEY, I2D_CUSTOMER_GUID.
api_url = "https://my.intelligence2day.com/components/api/search.cfc"
params = {
    "method": "query",
    "APIid": os.environ["I2D_API_ID"],
    "authKey": os.environ["I2D_AUTH_KEY"],
    "customerGUID": os.environ["I2D_CUSTOMER_GUID"],
    "accessGroups": "8329",
    "returnFields": "*",
    # Records within the time range; use "*:*" instead to query all records.
    "queryString": "dateline:[NOW-2MONTHS TO NOW] AND topicId:135576",
    "maxRows": 10,  # Limit to x results
    "sort": "dateline desc",  # Newest first
}

# TLS verification stays enabled because the auth key travels with the request.
# Set I2D_CA_BUNDLE to a CA bundle path (e.g. a corporate proxy certificate) if
# the default trust store is not sufficient.
tls_verify = os.environ.get("I2D_CA_BUNDLE") or True

# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

total_articles = 0
all_articles = []  # To store all article data

# Make the request
response = requests.get(
    api_url,
    params=params,
    verify=tls_verify,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Print the status code
print(f"Status Code: {response.status_code}")

if response.status_code == 200:
    try:
        data = response.json()  # Parse the response as JSON
    except ValueError:
        print("Error: Response is not valid JSON.")
    else:
        # NOTE: the raw dump echoes account identifiers such as customerGUID;
        # clear this cell's output before sharing the notebook.
        print("Returned Data:")
        formatted_json = json.dumps(data, indent=4)
        print(formatted_json)

        articles = data.get("docs", [])

        if not articles:
            print("No more articles returned.")

        print(f"Retrieved {len(articles)} articles")

        # Collect and print each article's title, summary, URL and date.
        # Missing fields fall back to a "No ..." placeholder string.
        for i, article in enumerate(articles, 1):
            title = article.get("headline", "No title")
            summary = article.get("summary", "No summary")
            url = article.get("attachmenturl", "No URL")
            date = article.get("dateline", "No date")

            all_articles.append({"Title": title, "Summary": summary, "URL": url, "Date": date})

            print(f"\nArticle {total_articles + i}")
            print(f"Title   : {title}")
            print(f"Summary : {summary}")
            print(f"URL     : {url}")
            print(f"Date    : {date}")
else:
    print(f"Request failed with status code {response.status_code}")

total_articles = len(all_articles)
print(f"\n✅ Total articles fetched: {total_articles}")





Status Code: 200
Returned Data:
{
    "numFound": 295,
    "start": 0,
    "docs": [
        {
            "uid": "34598287",
            "uid_int": 34598287,
            "customerGUID": "b6150206-d9b1-4963-8907-22b7695c0477",
            "url": "https://my.intelligence2day.com/cc/view/article/?a=4e6fbbd312ac17dce6544db8cec83abf",
            "modified": "{ts '2025-05-20 03:42:1747712529'}",
            "docHash": "4e6fbbd312ac17dce6544db8cec83abf",
            "AI_category": [
                "/science and technology",
                "/science and technology/social sciences/economics",
                "/economy&#44; business and finance/business information/human resources/executive officer",
                "/science and technology/technology and engineering",
                "/economy&#44; business and finance/economy",
                "/economy&#44; business and finance",
                "/science and technology/social sciences",
                "/economy&#44; business and finance

In [6]:
import pandas as pd
# Collect the fetched article dicts into a DataFrame (one row per article).
test_df = pd.DataFrame(all_articles)
# Bare last expression: the notebook renders the first rows as the cell output.
test_df.head()

Unnamed: 0,Title,Summary,URL,Date
0,Danfoss Live Online-Event: GEG ­effektiv um­se...,| Druckvorschau Im Danfoss-Event werden zentr...,https://www.sbz-online.de/meldungen/03062025-1...,2025-05-20T03:00:34Z
1,Danfoss welcomes Danish Foreign Minister Lars ...,"May 19, 2025 As part of a high-level visit to...",https://www.ejarn.com/article/detail/88767,2025-05-19T16:23:15Z
2,Danfoss welcomes Danish Foreign Minister Lars ...,"As part of a high-level visit to China, Danish...",https://www.danfoss.com/en/about-danfoss/news/...,2025-05-19T13:26:24Z
3,"Last Saturday, May 17th, we had the pleasure o...","Last Saturday, May 17th, we had the pleasure o...",https://www.linkedin.com/feed/update/urn:li:ac...,2025-05-19T10:07:41Z
4,Supermarkets play a crucial role in shaping a ...,Supermarkets play a crucial role in shaping a ...,https://www.linkedin.com/feed/update/urn:li:ac...,2025-05-19T07:39:02Z


In [12]:
# Clean both text columns with the same steps in the same order: strip HTML
# first, then translate known unicode escapes, then drop any escapes left over.
text_cleaners = (remove_html_and_script, unicode_handling, remove_matches)
for column in ("Title", "Summary"):
    for cleaner in text_cleaners:
        test_df[column] = test_df[column].apply(cleaner)

# Articles without a dateline carry the "No date" placeholder assigned when the
# articles were collected. It is not ISO 8601, so it is kept as is instead of
# letting datetime.fromisoformat raise ValueError for the whole column.
test_df['Date'] = test_df['Date'].apply(
    lambda value: value if value == "No date" else extract_date_ddmmyyyy(value)
)


In [77]:
# Export test_df to a workbook named after today's date and register the
# written cell range as an Excel table called "Table1".

current_date = datetime.now().strftime("%Y-%m-%d")

# File name carries the current date, e.g. ../updated_2025-05-20.xlsx
filename = f"../updated_{current_date}.xlsx"

with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
    # Header goes in row 0, data below it, no index column.
    test_df.to_excel(writer, sheet_name="Sheet1", index=False, startrow=0)

    workbook = writer.book
    worksheet = writer.sheets["Sheet1"]

    # One table column per DataFrame column, keeping the same header text.
    max_row, max_col = test_df.shape
    column_settings = [dict(header=col) for col in test_df.columns]

    # The table covers the header row (0) plus max_row data rows, and columns
    # 0 .. max_col - 1; add_table takes zero-based, inclusive bounds.
    table_options = {
        "columns": column_settings,
        "name": "Table1",
    }
    worksheet.add_table(0, 0, max_row, max_col - 1, table_options)

In [20]:
import pandas as pd
from datetime import datetime
import os
from openpyxl import load_workbook

def export_data(df, file_path="../records.xlsx", sheet_name="Sheet1"):
    """Append ``df`` to a sheet of an Excel workbook, creating it on first use.

    When the workbook does not exist it is created and ``df`` is written with
    its header row. When it exists, the rows are appended below the last used
    row of ``sheet_name`` without repeating the header. If the workbook exists
    but the sheet is missing or blank, the data starts at the top with a header.

    Parameters:
        df (pandas.DataFrame): Rows to write; the index is not written.
        file_path (str): Workbook location. Defaults to "../records.xlsx".
        sheet_name (str): Target sheet. Defaults to "Sheet1".

    Returns:
        None. Prints a confirmation line after writing.
    """
    start_row = 0
    write_header = True
    writer_options = {"mode": "w"}

    if os.path.exists(file_path):
        # Append mode; "overlay" writes into the existing sheet in place
        # instead of raising or replacing it.
        writer_options = {"mode": "a", "if_sheet_exists": "overlay"}

        workbook = load_workbook(file_path)
        try:
            # Measure the sheet that is actually written to. The active sheet
            # may be a different one, which would give a wrong row offset.
            if sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                # openpyxl reports max_row == 1 for a blank sheet as well, so
                # check the first cell before treating row 1 as occupied.
                sheet_is_blank = (
                    sheet.max_row == 1
                    and sheet.max_column == 1
                    and sheet.cell(row=1, column=1).value is None
                )
                if not sheet_is_blank:
                    # max_row is the 1-based index of the last used row, which
                    # equals the 0-based index of the first free row.
                    start_row = sheet.max_row
                    write_header = False
        finally:
            workbook.close()

    with pd.ExcelWriter(file_path, engine="openpyxl", **writer_options) as writer:
        df.to_excel(
            writer,
            sheet_name=sheet_name,
            index=False,
            startrow=start_row,
            header=write_header,
        )

    print(f"Data written to {file_path}")


In [22]:
# Write the cleaned articles to the records workbook (appends if it already exists).
export_data(test_df)

Data written to ../records.xlsx
