Methods for data cleaning

In [27]:
from bs4 import BeautifulSoup

def remove_html_and_script(text):
    soup = BeautifulSoup(text, "html.parser")

    # Remove script and style tags completely
    for tag in soup(["script", "style"]):
        tag.decompose()
        

    return soup.get_text(strip=False)



In [28]:
def unicode_handling(text):
    # Dictionary of unicode escape sequences mapped to their actual characters
    unicode_map = {
        r'\u2018': '‚Äò',  # Left single quote
        r'\u2019': '‚Äô',  # Right single quote
        r'\u201c': '‚Äú',  # Left double quote
        r'\u201d': '‚Äù',  # Right double quote
        r'\u2013': '‚Äì',  # En dash
        r'\u2014': '‚Äî',  # Em dash
        r'\u2022': '‚Ä¢',  # Bullet
        r'\u2026': '‚Ä¶',  # Ellipsis
        r'\u00a0': ' ',  # Non-breaking space
        r'\u00b7': '¬∑',  # Middle dot
        r'\u00e9': '√©',  # e acute
        r'\u00e2': '√¢',  # a circumflex
        r'\u00e0': '√†',  # a grave
        r'\u00e8': '√®',  # e grave
        r'\u00e7': '√ß',  # c cedilla
        r'\u00f4': '√¥',  # o circumflex
        r'\u00fb': '√ª',  # u circumflex
        r'\u00ee': '√Æ',  # i circumflex
        r'\u00ef': '√Ø',  # i diaeresis
        r'\u00e4': '√§',  # a umlaut
        r'\u00f6': '√∂',  # o umlaut
        r'\u00fc': '√º',  # u umlaut
        r'\u00df': '√ü',  # sharp s
        r'\u2082': '‚ÇÇ',  # subscript 2
        r'\u2083': '‚ÇÉ',  # subscript 3
        r'\u267b': '',         # Recycling symbol
        r'\ufe0f': '',         # Variation selector
        # r'\ud83d\udd25': '',   # Fire emoji
        # r'\ud83c\udf1f': '', 
        # r'\u2744\ufe0f': '',
        r'\u2744': '',
        r'\u2122': '‚Ñ¢',
        r'\u27a1': '',
        r'\u20ac': '‚Ç¨',
        r'\u201': '',
        r'\u2013': '‚Äì',
        r'\u2014': '‚Äî',
        #r'\ud83d\udccd': '',
        #r'\ud83c\udf89': '',
        #r'\ud83d\udd17': '',
        #r'\ud83d\udd0e': '',
        #r'\ud83d\udcf8': '',
        #r'\ud83d\udc49': '',
        #r'\ud83c\udfa7': '',
        #r'\ud83e\udd1d': '',
        #r'\u2714': '',
        #r'\ud83d\udca1': '',
        r'\u23f0': '',
        # r'\ud83c\udf88': '',
        r'\u2': '',
        r'\u201e': '',
        r'\u26a1': '',
        # r'\ud83d\udd12': '',
        # r'\ud83d\ude80': '',  # Unicode for "ROCKET" emoji (üöÄ).
        # r'\ud83c\u': '',  # Represents other emojis or special characters.
        r'\u25b6': '',  # Unicode for "BLACK RIGHT-POINTING TRIANGLE" (‚ñ∂), used for video/play buttons.
        r'\u2b05': '',
        r'\u0130': '',
        # r'\ud83c\udf2c': '',  # Unicode for "TROPICAL STORM" emoji (üåÄ)
        # r'\ud83c\uud83c': '',  # Represents other emojis or special characters.
        
    }

    for code, char in unicode_map.items():
        text = text.replace(code, char)

    return text

In [29]:
def remove_matches(text):
    # Regular expression to match Unicode escape sequences
    unicode_pattern = r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}'

    # Replace all matches with an empty string
    updated_string = re.sub(unicode_pattern, '', text)

    return updated_string

api request


In [5]:
import requests

In [17]:
# API Query to test functionality (without pagination)

#last_run_date = "2025-05-03T06:04:11Z"

# API base URL and static parameters
api_url = "https://my.intelligence2day.com/components/api/search.cfc"
params = {
    "method": "query",
    "APIid": "I2DE_4880557FFC6ABA165C916880849F9CAC",
    "authKey": "c51e7492-ab7f-46d8-9d10-edd4e434d2c1",
    "customerGUID": "b6150206-d9b1-4963-8907-22b7695c0477",
    "accessGroups": "8329",
    "returnFields": "*",
    "queryString": "*:*",
    #"queryString": "dateline:[NOW-2DAYS TO NOW]",  # Query for all records within time range
    #"queryString": "dateline:[NOW-1MONTH TO NOW]",  # Query for all records within time range
    #"queryString": f"dateline:[{last_run_date} TO NOW] AND topicId:135576", # Query for all records between the last run date (max run date in excel) and now & on topic ID
    #"queryString": "topicID:135576",  # Query for all records within time range
    "maxRows": 100,  # Limit to x results
    "sort": "dateline desc",  # Sort by 

}

total_articles = 0
all_articles = []  # To store all article data


# Make the request
response = requests.get(api_url, params=params, verify=False)

# Print the status code
print(f"Status Code: {response.status_code}")
    
if response.status_code == 200:
    try:
        data = response.json()  # Parse the response as JSON
        print("Returned Data:")
        print(data)  # Print the raw JSON response
            
        articles = data.get("docs", [])


        if not articles:
            print("No more articles returned.")
            

        print(f"Retrieved {len(articles)} articles")

        # Print the articles' title, summary, and URL
        for i, article in enumerate(articles, 1):
            title = article.get("headline", "No title")
            summary = article.get("summary", "No summary")
            url = article.get("url", "No URL")
            date = article.get("dateline", "No date")

            all_articles.append({"title": title, "summary": summary, "url": url})

            print(f"\nArticle {total_articles + i}")
            print(f"Title   : {title}")
            print(f"Summary : {summary}")
            print(f"URL     : {url}")
            print(f"Date    : {date}")


    except ValueError:
        print("Error: Response is not valid JSON.")
        
else:
    print(f"Request failed with status code {response.status_code}")
    
total_articles = len(all_articles)
print(f"\n‚úÖ Total articles fetched: {total_articles}")





Status Code: 200
Returned Data:
{'numFound': 34288, 'start': 0, 'docs': [{'uid': '34367421', 'uid_int': 34367421, 'customerGUID': 'b6150206-d9b1-4963-8907-22b7695c0477', 'url': 'https://my.intelligence2day.com/cc/view/article/?a=fe428b2cf4cca7369e99e3a54d0d13ea', 'modified': "{ts '2025-05-13 16:55:1747155347'}", 'docHash': 'fe428b2cf4cca7369e99e3a54d0d13ea', 'AI_category': ['/science and technology', '/economy&#44; business and finance/economy', '/economy&#44; business and finance/products and services/energy and resource', '/labour', '/science and technology/social sciences/economics', '/science and technology/social sciences', '/environment', '/labour/employment', '/politics/government policy/environmental policy', '/economy&#44; business and finance/products and services/energy and resource/renewable energy'], 'AI_entity_LOCATION': ['United States', 'Canada', 'Knoxville, Tennessee', 'Americas', 'Tennessee'], 'AI_entity_ORGANIZATION': ['United States', 'Knoxville, Tennessee', 'United

In [30]:
import pandas as pd
test_df = pd.DataFrame(all_articles)
test_df['title'].iloc[2]

'Carrier Announces Additional $1 Billion Investment in U.S. Manufacturing Footprint, Advanced Cutting-Edge R&amp;D and Workforce Expa'

In [31]:
test_df['title'] = test_df['title'].apply(remove_html_and_script)
test_df['title'] = test_df['title'].apply(unicode_handling)
test_df['title'] = test_df['title'].apply(remove_matches)
test_df['summary'] = test_df['summary'].apply(unicode_handling)
test_df['summary'] = test_df['summary'].apply(remove_matches)
test_df['summary'] = test_df['summary'].apply(remove_html_and_script)

print(test_df['title'].iloc[0])

Kelvion Knoxville hosts U.S. Senator Bill Hagerty for exclusive facility tour ‚Äì spotlight on innovation, growth, and commun


In [32]:
test_df.to_excel("../updated.xlsx", index=False)