Methods for data cleaning

In [1]:
from bs4 import BeautifulSoup

def remove_html_and_script(text):
    soup = BeautifulSoup(text, "html.parser")

    # Remove script and style tags completely
    for tag in soup(["script", "style"]):
        tag.decompose()
        

    return soup.get_text(strip=False)



In [80]:
def unicode_handling(text):
    # Dictionary of unicode escape sequences mapped to their actual characters
    unicode_map = {
        r'\u2018': '‘',  # Left single quote
        r'\u2019': '’',  # Right single quote
        r'\u201c': '“',  # Left double quote
        r'\u201d': '”',  # Right double quote
        r'\u2013': '–',  # En dash
        r'\u2014': '—',  # Em dash
        r'\u2022': '•',  # Bullet
        r'\u2026': '…',  # Ellipsis
        r'\u00a0': ' ',  # Non-breaking space
        r'\u00b7': '·',  # Middle dot
        r'\u00e9': 'é',  # e acute
        r'\u00e2': 'â',  # a circumflex
        r'\u00e0': 'à',  # a grave
        r'\u00e8': 'è',  # e grave
        r'\u00e7': 'ç',  # c cedilla
        r'\u00f4': 'ô',  # o circumflex
        r'\u00fb': 'û',  # u circumflex
        r'\u00ee': 'î',  # i circumflex
        r'\u00ef': 'ï',  # i diaeresis
        r'\u00e4': 'ä',  # a umlaut
        r'\u00f6': 'ö',  # o umlaut
        r'\u00fc': 'ü',  # u umlaut
        r'\u00df': 'ß',  # sharp s
        r'\u2082': '₂',  # subscript 2
        r'\u2083': '₃',  # subscript 3
        r'\u267b': '',         # Recycling symbol
        r'\ufe0f': '',         # Variation selector
        r'\ud83d\udd25': '',   # Fire emoji
        r'\ud83c\udf1f': '', 
        r'\u2744\ufe0f': '',
    }

    for code, char in unicode_map.items():
        text = text.replace(code, char)

    return text

api request


In [4]:
import requests

In [81]:
# API Query to test functionality (without pagination)

last_run_date = "2025-05-03T06:04:11Z"

# API base URL and static parameters
api_url = "https://my.intelligence2day.com/components/api/search.cfc"
params = {
    "method": "query",
    "APIid": "I2DE_4880557FFC6ABA165C916880849F9CAC",
    "authKey": "c51e7492-ab7f-46d8-9d10-edd4e434d2c1",
    "customerGUID": "b6150206-d9b1-4963-8907-22b7695c0477",
    "accessGroups": "8329",
    "returnFields": "*",
    #"queryString": "dateline:[NOW-2DAYS TO NOW]",  # Query for all records within time range
    #"queryString": "dateline:[NOW-1MONTH TO NOW]",  # Query for all records within time range
    "queryString": f"dateline:[{last_run_date} TO NOW] AND topicId:135576", # Query for all records between the last run date (max run date in excel) and now & on topic ID
    "maxRows": 100,  # Limit to x results
    "sort": "dateline desc",  # Sort by 

}

total_articles = 0
all_articles = []  # To store all article data


# Make the request
response = requests.get(api_url, params=params, verify=False)

# Print the status code
print(f"Status Code: {response.status_code}")
    
if response.status_code == 200:
    try:
        data = response.json()  # Parse the response as JSON
        print("Returned Data:")
        print(data)  # Print the raw JSON response
            
        articles = data.get("docs", [])


        if not articles:
            print("No more articles returned.")
            

        print(f"Retrieved {len(articles)} articles")

        # Print the articles' title, summary, and URL
        for i, article in enumerate(articles, 1):
            title = article.get("headline", "No title")
            summary = article.get("summary", "No summary")
            url = article.get("url", "No URL")
            date = article.get("dateline", "No date")

            all_articles.append({"title": title, "summary": summary, "url": url})

            print(f"\nArticle {total_articles + i}")
            print(f"Title   : {title}")
            print(f"Summary : {summary}")
            print(f"URL     : {url}")
            print(f"Date    : {date}")


    except ValueError:
        print("Error: Response is not valid JSON.")
        
else:
    print(f"Request failed with status code {response.status_code}")
    
total_articles = len(all_articles)
print(f"\n✅ Total articles fetched: {total_articles}")





Status Code: 200
Returned Data:
{'numFound': 46, 'start': 0, 'docs': [{'uid': '34359807', 'uid_int': 34359807, 'customerGUID': 'b6150206-d9b1-4963-8907-22b7695c0477', 'url': 'https://my.intelligence2day.com/cc/view/article/?a=298ee1c9a6cdff743102e65d127877fb', 'modified': "{ts '2025-05-13 14:54:1747148086'}", 'docHash': '298ee1c9a6cdff743102e65d127877fb', 'AI_category': ['/Business & Industrial/Agriculture & Forestry'], 'AI_entity_CONSUMER_GOOD': ['VQ95'], 'AI_entity_EVENT': ['cleaning', 'journey', 'cleaning'], 'AI_entity_LOCATION': ['Canadian', 'dairy facilities'], 'AI_entity_ORGANIZATION': ['dairy producer', 'Copeland'], 'AI_entity_OTHER': ['journey', 'energy', 'cows', 'milk cartons', 'lot', 'water', 'pasteurization', 'energy', 'water', 'pasteurization', 'milk cartons', 'cows', 'production processes', 'lot', 'fossil fuel use', 'performance', 'Vilter VQ95 #industrialheatpump', 'waste heat'], 'AI_sentiment': 0.20000000298, 'AI_meta': ['{"journey": {"link": "", "mid": ""}, "energy": {"l

In [82]:
import pandas as pd
test_df = pd.DataFrame(all_articles)
test_df

Unnamed: 0,title,summary,url
0,The journey from cows to milk cartons requires...,The journey from cows to milk cartons requires...,https://my.intelligence2day.com/cc/view/articl...
1,"In our latest podcast episode, we talk all abo...","In our latest podcast episode, we talk all abo...",https://my.intelligence2day.com/cc/view/articl...
2,Copeland expands US partnership with Mysa smar...,Copeland strengthens collaboration with Mysa t...,https://my.intelligence2day.com/cc/view/articl...
3,Copeland News,"Copeland , formerly known as Emerson Climate ...",https://my.intelligence2day.com/cc/view/articl...
4,Five rooftop units equipped with #Frascold sem...,Five rooftop units equipped with #Frascold sem...,https://my.intelligence2day.com/cc/view/articl...
5,Sustainability is more than a goal at Copeland...,Sustainability is more than a goal at Copeland...,https://my.intelligence2day.com/cc/view/articl...
6,Copeland expands U.S. smart thermostat offerin...,Copeland has announced the expansion of its p...,https://my.intelligence2day.com/cc/view/articl...
7,A2L refrigerants present unique considerations...,A2L refrigerants present unique considerations...,https://my.intelligence2day.com/cc/view/articl...
8,Copeland is proud to announce the expansion of...,Copeland is proud to announce the expansion of...,https://my.intelligence2day.com/cc/view/articl...
9,Copeland Expands Partnership with Canadian-bas...,"ST. LOUIS (May 12, 2025) \u2013 Copeland, a g...",https://my.intelligence2day.com/cc/view/articl...


In [83]:
test_df['title'] = test_df['title'].apply(unicode_handling)
test_df['summary'] = test_df['summary'].apply(unicode_handling)
print(test_df['title'].iloc[0])

The journey from cows to milk cartons requires a lot of energy and hot water, whether it’s for pasteurization, cleaning or 


In [84]:
test_df.to_excel("../updated.xlsx", index=False)

testing html tag cleaning method

In [8]:
test_df['title'] = test_df['title'].apply(remove_html_and_script)
test_df['summary'] = test_df['summary'].apply(remove_html_and_script)


In [9]:
test_df.to_excel("../updated.xlsx", index=False)

Testing unicode method

In [57]:
import pandas as pd
test_df = pd.DataFrame(all_articles)
test_df

Unnamed: 0,title,summary,url
0,"In our latest podcast episode, we talk all abo...","In our latest podcast episode, we talk all abo...",https://my.intelligence2day.com/cc/view/articl...
1,Copeland News,"Copeland , formerly known as Emerson Climate ...",https://my.intelligence2day.com/cc/view/articl...
2,Copeland expands US partnership with Mysa smar...,Copeland strengthens collaboration with Mysa t...,https://my.intelligence2day.com/cc/view/articl...
3,Five rooftop units equipped with #Frascold sem...,Five rooftop units equipped with #Frascold sem...,https://my.intelligence2day.com/cc/view/articl...
4,Sustainability is more than a goal at Copeland...,Sustainability is more than a goal at Copeland...,https://my.intelligence2day.com/cc/view/articl...


In [58]:
import pandas as pd
import re

# Dictionary to map Unicode escape sequences to their symbols
unicode_map = {
    "\u0020": " ",        # Space
    "\u00A9": "©",       # Copyright symbol
    "\u00AE": "®",       # Registered trademark symbol
    "\u00B0": "°",       # Degree symbol
    "\u00A5": "¥",       # Yen symbol
    "\u20AC": "€",       # Euro symbol
    "\u20B9": "₹",       # Indian Rupee symbol
    "\u2022": "•",       # Bullet symbol
    "\u2030": "‰",       # Per mille symbol
    "\u221E": "∞",       # Infinity symbol
    "\u00A0": "\u00A0",  # Non-breaking space (kept for future use)
    "\u2026": "…",       # Ellipsis
    "\u201C": "“",       # Left double quotation mark
    "\u201D": "”",       # Right double quotation mark
    "\u2018": "‘",       # Left single quotation mark
    "\u2019": "’",       # Right single quotation mark
    "\u2013": "–",       # En dash
    "\u2014": "—",       # Em dash
    "\u2122": "™",       # Trademark symbol
    "\u2212": "−",       # Minus sign
    "\u2202": "∂",       # Partial differential symbol
    "\u2211": "∑",       # Summation symbol
    "\u2219": "⋅",       # Bullet operator
    "\u221A": "√",       # Square root
    "\u222B": "∫",       # Integral symbol
}

# Method to decode and handle unicode escape sequences
def decode_and_replace_unicode_escapes(text):
    try:
        # Decode the unicode escape sequences, ignore malformed sequences
        decoded_text = bytes(text, 'utf-8').decode('unicode_escape', errors='ignore')
    except (UnicodeDecodeError, TypeError) as e:
        print(f"Error decoding text: {e}")
        decoded_text = text  # If decoding fails, return the original text
    
    # Replace valid unicode escape sequences with their corresponding symbols
    for escape_sequence, symbol in unicode_map.items():
        decoded_text = decoded_text.replace(escape_sequence, symbol)
    
    # Remove any remaining malformed unicode escape sequences
    decoded_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', decoded_text)
    
    return decoded_text

# Apply the function to the 'title' and 'summary' columns of the dataframe
def clean_dataframe(df):
    # Apply the decoding and cleaning function to both 'title' and 'summary' columns
    df['title'] = df['title'].apply(decode_and_replace_unicode_escapes)
    df['summary'] = df['summary'].apply(decode_and_replace_unicode_escapes)
    
    return df

# Clean the dataframe before writing to an Excel file
test_df = clean_dataframe(test_df)

# Display the updated DataFrame
print(test_df['title'].iloc[0])




In our latest podcast episode, we talk all about the processes involved in refrigeration systems ❄️ Learn how to manag


In [60]:
# Save the cleaned DataFrame to an Excel file without the 'encoding' argument
test_df.to_excel("../updated.xlsx", index=False, engine='openpyxl')

In [52]:
test_df.to_excel("../updated.xlsx", index=False, engine='openpyxl')

In [37]:
print("\u00A9")

©


general testing

In [12]:
# text to be cleaned
text1 = "ebm\u2011papst Invests \u20ac30 Million in New Site in Romania"
text2 = "Let\u2019s decarbonize district heating<br><br> together at this year\u2019s Euroheat &amp; Power Congress in Prague, Czech Republic, where"
text3 = 'Mark your calendars for May 27, 2025, at 11:00 CET, as we\\u2019re hosting an online session; Retrofitting Commercial Buildings: How E'
#clean_text2 method works with text3



In [13]:
text1

'ebm‑papst Invests €30 Million in New Site in Romania'

In [84]:
# Create a DataFrame with one column and one row
data = {'Title': ['More Displacement, Same Iconic Series \u2013 06.05.2025 Secop SCE Plus Video \u2013 Behind the Scenes']}
df = pd.DataFrame(data)

pd.set_option('display.max_colwidth', None)  # Set to None to display the full content of the column

df['Title'][0] = remove_html_and_script(df['Title'][0])
df['Title'][0] = normalize_unicode(df['Title'][0])

# Display the DataFrame
df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Title'][0] = remove_html_and_script(df['Title'][0])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the origi

Unnamed: 0,Title
0,"More Displacement, Same Iconic Series – 06.05.2025 Secop SCE Plus Video – Behind the Scenes"


In [79]:
output

'More Displacement, Same Iconic Series – 06.05.2025 Secop SCE Plus Video – Behind the Scenes'

Testing data cleaning on csv file records

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("../SP_Test.csv")

# Display the first few rows of the DataFrame
df

Unnamed: 0,Title,Summary,URL
0,Our skilled trades colleagues are an essential...,Our skilled trades colleagues are an essential...,https://www.linkedin.com/feed/update/urn:li:ac...
1,Gefahrgut automatisiert im KV-Terminal abwickeln,"Projektpartner LKZ Prienn, Concroo, Duss und K...",https://www.eurotransport.de/logistik/speditio...
2,"Mark your calendars for May 27, 2025, at 11:00...","Mark your calendars for May 27, 2025, at 11:00...",https://www.linkedin.com/feed/update/urn:li:ac...
3,Refrigerant Innovations Are The Focus at Oklah...,The school is partnering with the HVAC indust...,https://www.achrnews.com/articles/164551-refri...
4,ATMO Australia: R290 Commercial HVAC Case Stud...,"ATMOsphere COO and Head of APAC, Jan Dusek, s...",https://naturalrefrigerants.com/atmo-australia...
5,Hupac beklagt Bahnprobleme : Negativer Trend i...,Der Schweizer Kombi-Operateur Hupac beklagt di...,https://www.eurotransport.de/logistik/speditio...
6,"In the high-temperature #HeatPump sector, the ...","In the high-temperature #HeatPump sector, the ...",https://www.linkedin.com/feed/update/urn:li:ac...
7,"More Displacement, Same Iconic Series \u2013 0...",Secop\u2019s new SCE Plus compressor range ex...,https://www.secop.com/updates/news/secop-sce-p...
8,J &amp; E Hall are pleased to announce Grahame...,J &amp; E Hall are pleased to announce Grahame...,https://www.linkedin.com/feed/update/urn:li:ac...
9,Streit um den Baufortschritt bei Brücken,"""In wesentlichen Punkten irreführend und besch...",https://www.eurotransport.de/logistik/verkehrs...


In [70]:
test = df

test['Title'] = test['Title'].apply(remove_html_and_script)
test['Title'] = test['Title'].apply(normalize_unicode)

test['Summary'] = test['Summary'].apply(remove_html_and_script)
test['Summary'] = test['Summary'].apply(normalize_unicode)
row = test.iloc[19]
row['Title']

'ebm\\u2011papst Invests \\u20ac30 Million in New Site in Romania'

In [72]:
row['Title'] = remove_html_and_script(row['Title'])
row['Title'] = normalize_unicode(row['Title'])
row['Title']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Title'] = remove_html_and_script(row['Title'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Title'] = normalize_unicode(row['Title'])


'ebm\\u2011papst Invests \\u20ac30 Million in New Site in Romania'

In [73]:
# Iterate through each record in the DataFrame and apply functions in place
for index, row in test.iterrows():
    # Print original values
    original_title = row['Title']
    original_summary = row['Summary']
    print(f"Original Title (Row {index}): {original_title}")
    print(f"Original Summary (Row {index}): {original_summary}")

    # Apply the functions in place and update the DataFrame
    test.at[index, 'Title'] = normalize_unicode(remove_html_and_script(original_title))
    test.at[index, 'Summary'] = normalize_unicode(remove_html_and_script(original_summary))

    # Print cleaned values
    print(f"Cleaned Title (Row {index}): {test.at[index, 'Title']}")
    print(f"Cleaned Summary (Row {index}): {test.at[index, 'Summary']}")
    print("\n" + "-"*50)  # Separator for readability


Original Title (Row 0): Our skilled trades colleagues are an essential part of not only our company but also society. These professionals bring the buil
Original Summary (Row 0): Our skilled trades colleagues are an essential part of not only our company but also society. These professionals bring the buildings where we live, work and play to life. They show up, solve problems and keep buildings -and our business- moving forward. Thank you for your dedication to our customers!#SkilledTradesDay #NationalSkilledTradesDay
Cleaned Title (Row 0): Our skilled trades colleagues are an essential part of not only our company but also society. These professionals bring the buil
Cleaned Summary (Row 0): Our skilled trades colleagues are an essential part of not only our company but also society. These professionals bring the buildings where we live, work and play to life. They show up, solve problems and keep buildings -and our business- moving forward. Thank you for your dedication to our custom

In [63]:
# Apply cleaning functions in place on both 'Title' and 'Summary' columns
test['Title'] = test['Title'].apply(lambda x: normalize_unicode(remove_html_and_script(str(x))))
test['Summary'] = test['Summary'].apply(lambda x: normalize_unicode(remove_html_and_script(str(x))))

# Print the cleaned DataFrame
test


Unnamed: 0,Title,Summary,URL
0,Our skilled trades colleagues are an essential...,Our skilled trades colleagues are an essential...,https://www.linkedin.com/feed/update/urn:li:ac...
1,Gefahrgut automatisiert im KV-Terminal abwickeln,"Projektpartner LKZ Prienn, Concroo, Duss und K...",https://www.eurotransport.de/logistik/speditio...
2,"Mark your calendars for May 27, 2025, at 11:00...","Mark your calendars for May 27, 2025, at 11:00...",https://www.linkedin.com/feed/update/urn:li:ac...
3,Refrigerant Innovations Are The Focus at Oklah...,The school is partnering with the HVAC industr...,https://www.achrnews.com/articles/164551-refri...
4,ATMO Australia: R290 Commercial HVAC Case Stud...,"ATMOsphere COO and Head of APAC, Jan Dusek, sp...",https://naturalrefrigerants.com/atmo-australia...
5,Hupac beklagt Bahnprobleme : Negativer Trend i...,Der Schweizer Kombi-Operateur Hupac beklagt di...,https://www.eurotransport.de/logistik/speditio...
6,"In the high-temperature #HeatPump sector, the ...","In the high-temperature #HeatPump sector, the ...",https://www.linkedin.com/feed/update/urn:li:ac...
7,"More Displacement, Same Iconic Series \u2013 0...",Secop\u2019s new SCE Plus compressor range exp...,https://www.secop.com/updates/news/secop-sce-p...
8,J & E Hall are pleased to announce Grahame Kee...,J & E Hall are pleased to announce Grahame Kee...,https://www.linkedin.com/feed/update/urn:li:ac...
9,Streit um den Baufortschritt bei Brücken,"""In wesentlichen Punkten irreführend und besc...",https://www.eurotransport.de/logistik/verkehrs...


In [77]:
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata

# Method to remove HTML and script tags from text
def remove_html_and_script(text):
    soup = BeautifulSoup(text, "html.parser")

    # Remove script and style tags completely
    for tag in soup(["script", "style"]):
        tag.decompose()

    return soup.get_text(strip=True)

# Method to normalize unicode characters
def normalize_unicode(text):
    normalized_text = unicodedata.normalize('NFKD', text)
    return normalized_text

# Method to clean a DataFrame column by applying both functions
def clean_column(df, column_name):
    # Apply both remove_html_and_script and normalize_unicode in sequence
    df[column_name] = df[column_name].apply(lambda x: normalize_unicode(remove_html_and_script(x)))

    return df

# Example usage
# Assuming 'df' is your DataFrame and 'Title' is the column to clean
# df = pd.read_csv("your_file.csv")  # Uncomment this to load the CSV

# Clean the 'Title' column in place
test = clean_column(test, 'Title')

# Clean the 'Summary' column in place
test = clean_column(test, 'Summary')

# Show cleaned data
print(test[['Title', 'Summary']])

print(test['Title'].iloc[7])

                                                Title  \
0   Our skilled trades colleagues are an essential...   
1    Gefahrgut automatisiert im KV-Terminal abwickeln   
2   Mark your calendars for May 27, 2025, at 11:00...   
3   Refrigerant Innovations Are The Focus at Oklah...   
4   ATMO Australia: R290 Commercial HVAC Case Stud...   
5   Hupac beklagt Bahnprobleme : Negativer Trend i...   
6   In the high-temperature #HeatPump sector, the ...   
7   More Displacement, Same Iconic Series \u2013 0...   
8   J & E Hall are pleased to announce Grahame Kee...   
9           Streit um den Baufortschritt bei Brücken   
10  IIR Highlights Natural Refrigerants as Sustain...   
11         Nagel-Group baut Logistikzentrum in Danzig   
12  AHT Launches SPI CIRCUMPOLAR Modular Pump Stat...   
13  In the quest for sustainable alternatives to g...   
14  Meet our colleague Janusz Kieruzel. Janusz beg...   
15  Alfa Laval will unveil its large capacity oliv...   
16    2025 Eurovent Summit reve

In [2]:
# writing test to a new CSV file

test1 = df.copy()

#test1['Title'] = test1.applymap(lambda x: normalize_unicode(remove_html_and_script(x)) if isinstance(x, str) else (print(f"Non-string value: {x}") or x))
test1['Title'] = test1['Title'].map((lambda x: x.encode('unicode-escape')))
test1['Summary'] = test1['Summary'].map((lambda x: x.encode('unicode-escape')))

# Write to CSV
test1.to_csv("../writes_test.csv", index=False, encoding='utf-8')