In [1]:
import requests
import pandas as pd
import time
import json
import unicodedata
from bs4 import BeautifulSoup  # Needed for cleaning HTML descriptions

# Input table: one row per scraped event; the loop below reads its "Link" column
csv_file = "helmet_all_events.csv"  # Update with your actual file path
events_df = pd.read_csv(filepath_or_buffer=csv_file, encoding="utf-8")

# Browser-like headers sent with every API request
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
headers["X-Requested-With"] = "XMLHttpRequest"

# Function to clean and transliterate Finnish text
def clean_finnish_text(text):
    """Strip HTML markup from ``text`` and return it in composed Unicode form.

    Parameters
    ----------
    text : str or None
        Raw description, possibly containing HTML. Falsy values and the
        literal ``"N/A"`` are treated as "no description".

    Returns
    -------
    str
        ``"N/A"`` when there is no description; otherwise the tag-free text
        with surrounding whitespace removed, normalized with NFKC.
    """
    if not text or text == "N/A":
        return "N/A"

    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.text.strip()

    # NFKC rather than NFKD: both apply the same compatibility folding, but
    # NFKD leaves Finnish letters such as ä/ö/å split into a base letter plus
    # a combining mark, which renders inconsistently and breaks comparisons
    # against normally typed (composed) text. NFKC recomposes them.
    normalized_text = unicodedata.normalize("NFKC", clean_text)

    return normalized_text

# Function to extract description from JSON API
def get_event_description(event_url, timeout=10):
    """Fetch one event's description from the Finna AJAX JSON endpoint.

    The event id is taken as everything after the last ``=`` in
    ``event_url`` and substituted into the ``getLinkedEvents`` API URL.
    The request is sent with the module-level ``headers``.

    Parameters
    ----------
    event_url : str
        Event page URL. ``"N/A"`` or a missing value short-circuits.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the whole scrape. Defaults to 10.

    Returns
    -------
    str
        The cleaned description, or ``"N/A"`` when the URL is missing, the
        response status is not 200, or any exception is raised (the failure
        is printed, not re-raised).

    Notes
    -----
    pandas' ``read_csv`` treats the literal ``"N/A"`` as missing by default,
    so these sentinels come back as NaN once the saved CSV is reloaded.
    """
    if event_url == "N/A" or pd.isna(event_url):
        return "N/A"

    try:
        # Convert event URL to API JSON URL
        event_id = event_url.split("=")[-1]  # Extract event ID from URL
        api_url = f"https://helmet.finna.fi/AJAX/JSON?method=getLinkedEvents&params%5Bid%5D={event_id}"

        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(api_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"⚠️ Failed to fetch {event_url}, Status Code: {response.status_code}")
            return "N/A"

        data = response.json()

        # Extract description from JSON
        description_html = data.get("data", {}).get("events", {}).get("description", "N/A")

        # Strip HTML and normalize the text
        readable_description = clean_finnish_text(description_html)

        return readable_description
    except Exception as e:
        # Deliberate best-effort: one bad event must not abort the batch.
        print(f"❌ Error fetching {event_url}: {e}")
        return "N/A"

# Walk the "Link" column in row order and collect one description per event
event_descriptions = []
for event_url in events_df["Link"]:
    print(f"Fetching description for: {event_url}")
    description = get_event_description(event_url)
    event_descriptions.append(description)
    time.sleep(1)  # Prevent rate limiting

# Attach the collected descriptions as a new column (list order == row order)
events_df["Description"] = event_descriptions

# Write the augmented table to a new file, UTF-8 encoded, without the row index
output_csv_path = "helmet_all_events_with_descriptions.csv"
events_df.to_csv(output_csv_path, index=False, encoding="utf-8")

print(f"✅ Event descriptions saved to {output_csv_path} with readable text.")


Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=kulke:65067
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkkz3noge
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2xa7a
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2w4yu
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2w23u
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2wzoa
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2wx7i
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkmx2wvqy
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkpehrrhy
Fetching description for: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkpadwfbe
Fetching description for: ht

In [3]:
# Display the events table, now including the Description column.
events_df

Unnamed: 0,Title,Date,Location,Link,Image URL,Description
0,Koko perheen after-ski-keidas,Päivämäärä 2.3.2025,Sijainti Stoa,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,"Oli lunta tai ei, Stoan aukiolla juhlistetaan ..."
1,Muumi-päivä,Päivämäärä 22.2.2025,Sijainti Keskustakirjasto Oodi,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Tule viettämään mukavaa päivää muumien k...
2,Talviloma leikkipuisto Lorussa,Päivämäärä 17.2.2025 – 21.2.2025,Sijainti Keskustakirjasto Oodi,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Leikkipuisto Loru järjestää monipuolista ma...
3,Talviloma leikkipuisto Lorussa,Päivämäärä 21.2.2025,Sijainti Keskustakirjasto Oodi,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Leikkipuisto Loru järjestää monipuolista ma...
4,Talviloma leikkipuisto Lorussa,Päivämäärä 20.2.2025,Sijainti Keskustakirjasto Oodi,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Leikkipuisto Loru järjestää monipuolista ma...
...,...,...,...,...,...,...
11295,Ville Aalto: Superseder - An Environment,Päivämäärä 19.6.2024,Sijainti MUU Helsinki Nykytaidekeskus,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Superseder - An environment on immersiivinen a...
11296,Ville Aalto: Superseder - An Environment,Päivämäärä 18.6.2024,Sijainti MUU Helsinki Nykytaidekeskus,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Superseder - An environment on immersiivinen a...
11297,Ville Aalto: Superseder - An Environment,Päivämäärä 16.6.2024,Sijainti MUU Helsinki Nykytaidekeskus,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Superseder - An environment on immersiivinen a...
11298,Ville Aalto: Superseder - An Environment,Päivämäärä 15.6.2024,Sijainti MUU Helsinki Nykytaidekeskus,https://helmet.finna.fi/FeedContent/LinkedEven...,/FeedContent/EventImage?query%5Bpage_size%5D=3...,Superseder - An environment on immersiivinen a...


In [5]:
pip install pandas deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
   ---------------------------------------- 0.0/42.3 kB ? eta -:--:--
   ---------------------------------------- 42.3/42.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
from deep_translator import GoogleTranslator

# Read the scraped events (with descriptions) back from disk
df = pd.read_csv(filepath_or_buffer='helmet_all_events_with_descriptions.csv')

# One shared Finnish -> English translator, reused for every row
translator = GoogleTranslator(target='en', source='fi')

# Function to translate text
def translate_text(text, index):
    """Translate one description, falling back to the input on failure.

    Values flagged by ``pd.isna`` are passed through untouched. If the
    module-level ``translator`` raises, the error is printed together with
    ``index`` and the original ``text`` is returned instead.
    """
    if not pd.isna(text):
        try:
            translated = translator.translate(text)
        except Exception as e:
            print(f"Error translating row {index}: {e}")
        else:
            return translated
    # Either the value was missing or the translation attempt failed.
    return text

# Translate each description in row order; the row label is passed along
# only so that failures can be reported against it.
translated_descriptions = []
for idx, desc in df['Description'].items():
    translated_descriptions.append(translate_text(desc, idx))
df['Description_Translated'] = translated_descriptions

# Persist the table, translated column included, without the row index
df.to_csv('translated_file.csv', index=False)

print("Translation completed. The new file is saved as 'translated_file.csv'.")


Skipping row 1278 due to NaN value.
Skipping row 1279 due to NaN value.
Skipping row 1280 due to NaN value.
Skipping row 1281 due to NaN value.
Skipping row 1282 due to NaN value.
Skipping row 1283 due to NaN value.
Skipping row 1284 due to NaN value.
Skipping row 1285 due to NaN value.
Skipping row 1286 due to NaN value.
Skipping row 1287 due to NaN value.
Skipping row 1288 due to NaN value.
Skipping row 1468 due to NaN value.
Skipping row 1759 due to NaN value.
Skipping row 1934 due to NaN value.
Skipping row 2087 due to NaN value.
Skipping row 2092 due to NaN value.
Skipping row 3150 due to NaN value.
Skipping row 3874 due to NaN value.
Skipping row 3875 due to NaN value.
Skipping row 4243 due to NaN value.
Skipping row 4244 due to NaN value.
Skipping row 4245 due to NaN value.
Skipping row 4246 due to NaN value.
Skipping row 4247 due to NaN value.
Skipping row 4248 due to NaN value.
Skipping row 4249 due to NaN value.
Skipping row 4820 due to NaN value.
Skipping row 5784 due to NaN

In [None]:
# Row labels skipped by the translation step because Description was NaN:
# 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1468 1759 1934 2087 2092
# 3150 3874 3875 4243 4244 4245 4246 4247 4248 4249 4820 5784 5991 6003 6168 6176 6913
# 6925 7150 7202 7425 7426 7427 7428 7429 8425 9292 9440 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 10291

In [15]:
#### Re-scraping the rows that have NaN values for Description
import requests
import pandas as pd
import time
import json
import unicodedata
from bs4 import BeautifulSoup  # Needed for cleaning HTML descriptions

# Start from the output of the first scrape
csv_file = "helmet_all_events_with_descriptions.csv"  # Update with your actual file path
events_df = pd.read_csv(filepath_or_buffer=csv_file, encoding="utf-8")

# Column headers may carry stray whitespace; trim it
events_df.columns = [column_name.strip() for column_name in events_df.columns]

# Values of the "Index" column identifying the rows to fetch again
indexes_to_scrape = [
    1289, 1469, 1760, 1935, 2088,
    2093, 3151, 3876, 4250, 4821,
    5785, 5992, 6004, 6169, 6177,
    6914, 6926, 7151, 7203, 7430,
    8426, 9293, 9441, 9719, 10292,
]

# Browser-like headers sent with every API request
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
headers["X-Requested-With"] = "XMLHttpRequest"

# Function to clean and transliterate Finnish text
def clean_finnish_text(text):
    """Strip HTML markup from ``text`` and return it in composed Unicode form.

    Parameters
    ----------
    text : str or None
        Raw description, possibly containing HTML. Falsy values and the
        literal ``"N/A"`` are treated as "no description".

    Returns
    -------
    str
        ``"N/A"`` when there is no description; otherwise the tag-free text
        with surrounding whitespace removed, normalized with NFKC.
    """
    if not text or text == "N/A":
        return "N/A"

    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.text.strip()

    # NFKC rather than NFKD: both apply the same compatibility folding, but
    # NFKD leaves Finnish letters such as ä/ö/å split into a base letter plus
    # a combining mark, which renders inconsistently and breaks comparisons
    # against normally typed (composed) text. NFKC recomposes them.
    normalized_text = unicodedata.normalize("NFKC", clean_text)

    return normalized_text

# Function to extract description from JSON API
def get_event_description(event_url, timeout=10):
    """Fetch one event's description from the Finna AJAX JSON endpoint.

    The event id is taken as everything after the last ``=`` in
    ``event_url`` and substituted into the ``getLinkedEvents`` API URL.
    The request is sent with the module-level ``headers``.

    Parameters
    ----------
    event_url : str
        Event page URL. ``"N/A"`` or a missing value short-circuits.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the whole scrape. Defaults to 10.

    Returns
    -------
    str
        The cleaned description, or ``"N/A"`` when the URL is missing, the
        response status is not 200, or any exception is raised (the failure
        is printed, not re-raised).

    Notes
    -----
    pandas' ``read_csv`` treats the literal ``"N/A"`` as missing by default,
    so these sentinels come back as NaN once the saved CSV is reloaded.
    """
    if event_url == "N/A" or pd.isna(event_url):
        return "N/A"

    try:
        # Convert event URL to API JSON URL
        event_id = event_url.split("=")[-1]  # Extract event ID from URL
        api_url = f"https://helmet.finna.fi/AJAX/JSON?method=getLinkedEvents&params%5Bid%5D={event_id}"

        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(api_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"⚠️ Failed to fetch {event_url}, Status Code: {response.status_code}")
            return "N/A"

        data = response.json()

        # Extract description from JSON
        description_html = data.get("data", {}).get("events", {}).get("description", "N/A")

        # Strip HTML and normalize the text
        readable_description = clean_finnish_text(description_html)

        return readable_description
    except Exception as e:
        # Deliberate best-effort: one bad event must not abort the batch.
        print(f"❌ Error fetching {event_url}: {e}")
        return "N/A"

# Select the rows whose "Index" value is on the re-scrape list
needs_rescrape = events_df["Index"].isin(indexes_to_scrape)
rows_to_scrape = events_df.loc[needs_rescrape]

# Fetch each selected row again and write the result back by row label
for index, row in rows_to_scrape.iterrows():
    event_url = row["Link"]
    print(f"Fetching description for index {index}: {event_url}")
    description = get_event_description(event_url)
    events_df.loc[index, "Description"] = description  # overwrite the old value in the full table
    time.sleep(1)  # Prevent rate limiting

# Write the whole table, refreshed rows included, to a separate UTF-8 file
output_csv_path = "helmet_all_events_with_updated_descriptions.csv"
events_df.to_csv(output_csv_path, index=False, encoding="utf-8")

print(f"✅ Updated descriptions saved to {output_csv_path}.")


Fetching description for index 1288: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkhgwg5lq
Fetching description for index 1468: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkiiga2ea
Fetching description for index 1759: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agkg5xy7ma
Fetching description for index 1934: https://helmet.finna.fi/FeedContent/LinkedEvents?id=espoo_le:agkgumxg7e
Fetching description for index 2087: https://helmet.finna.fi/FeedContent/LinkedEvents?id=espoo_le:agkgtqz3gq
Fetching description for index 2092: https://helmet.finna.fi/FeedContent/LinkedEvents?id=espoo_le:agkgtqz37i
Fetching description for index 3150: https://helmet.finna.fi/FeedContent/LinkedEvents?id=kulke:65250
Fetching description for index 3875: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agj46kyvky
Fetching description for index 4249: https://helmet.finna.fi/FeedContent/LinkedEvents?id=helsinki:agj3txoowm
Fetching description for in

In [3]:
# Count missing (NaN) values in each column.
events_df.isna().sum()

Index            0
Title           31
Date             5
Location        26
Link             0
Image URL      500
Description     25
dtype: int64

In [13]:
# "Index" values of the rows whose Description is missing, as a plain list
events_df.loc[events_df['Description'].isna(), 'Index'].tolist()

[1289,
 1469,
 1760,
 1935,
 2088,
 2093,
 3151,
 3876,
 4250,
 4821,
 5785,
 5992,
 6004,
 6169,
 6177,
 6914,
 6926,
 7151,
 7203,
 7430,
 8426,
 9293,
 9441,
 9719,
 10292]