### Import libraries

In [1]:
import pandas as pd
import unicodedata
import json
import ast

from collections import Counter

### Read dataframe

In [2]:
df = pd.read_csv("../data/datasets/dataset_cleaned.csv")


def _parse_phenomena(raw_value):
    """
    Parse one cell of the 'phenomena' column into a Python object.

    The CSV stores each list as its string representation. Returns the parsed
    literal, or None when the cell is not a string (e.g. NaN for a missing
    value) or is not a valid Python literal, so that a single bad cell does
    not abort the whole notebook.
    """
    if not isinstance(raw_value, str):
        return None
    try:
        return ast.literal_eval(raw_value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval raises on malformed input.
        return None


# Convert the string representation to an actual list
df['phenomena'] = df['phenomena'].apply(_parse_phenomena)

# Make dropped data visible instead of discarding it silently: None entries are
# not lists, so the list-of-strings filter later in the notebook removes them.
unparsed_rows = int(df['phenomena'].isna().sum())
if unparsed_rows:
    print(f"{unparsed_rows} row(s) have a missing or unparseable 'phenomena' value")

### Remove rows where phenomena is not a list of strings

In [3]:
def _is_list_of_strings(value):
    """Return True only when ``value`` is a list whose items are all strings."""
    if not isinstance(value, list):
        return False
    return all(isinstance(item, str) for item in value)


# Keep only the rows whose 'phenomena' value is a list made up entirely of strings
is_valid_row = df['phenomena'].apply(_is_list_of_strings)
df = df[is_valid_row]

### Remove diacritics and lowercase all the elements

In [4]:
def remove_diacritics(input_str):
    """
    Return ``input_str`` with all combining marks stripped.

    The string is first decomposed with NFKD, which splits an accented
    character into its base character followed by combining marks; the
    combining marks are then dropped and the rest is joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() returns 0 for characters that are not combining marks
    base_characters = (char for char in decomposed if unicodedata.combining(char) == 0)
    return ''.join(base_characters)

def lowercase_diacritics_remove(text):
    """
    Normalise every entry of a list: lowercase it, then strip its diacritics.

    Parameters:
        text: list of strings to normalise.

    Returns:
        A new list with one normalised string per input entry, in the same order.

    Raises:
        ValueError: if ``text`` is not a list.
    """
    if isinstance(text, list):
        normalised = []
        for entry in text:
            # Lowercase first, then hand the result to the diacritics stripper
            normalised.append(remove_diacritics(entry.lower()))
        return normalised

    raise ValueError(f"Expected a list, but got {type(text)}")

# Build the normalised column: each row's list is lowercased and stripped of diacritics
df['clean_processed_phenomena'] = df['phenomena'].map(lowercase_diacritics_remove)

### Remove uncommon phenomena from our dataframe

In [5]:
# Count how often each phenomenon occurs across all rows
flattened_list = [
    phenomenon
    for row_phenomena in df['clean_processed_phenomena']
    for phenomenon in row_phenomena
]
word_counts = Counter(flattened_list)

# A phenomenon counts as common when it appears at least 10 times overall
valid_words = {phenomenon for phenomenon in word_counts if word_counts[phenomenon] >= 10}

# Keep a row only when every one of its phenomena is common
# (a row with an empty list passes, since it has nothing uncommon in it)
mask = df['clean_processed_phenomena'].apply(valid_words.issuperset)
df = df[mask]

### Count all the occurrences

In [6]:
# Collect every phenomenon of every remaining row into one flat list
flattened_list = [
    phenomenon
    for row_phenomena in df['clean_processed_phenomena'].tolist()
    for phenomenon in row_phenomena
]

# Tally the occurrences of each phenomenon after the rare ones were removed
word_counts = Counter()
word_counts.update(flattened_list)

### Map the related events together

In [7]:
# Path to the JSON file holding the phenomena mappings
json_filename = "mappings/phenomena_mappings.json"

# Load the JSON file as a dictionary. The encoding is given explicitly because
# JSON text is UTF-8, while open() would otherwise fall back to the platform's
# default encoding and could mis-decode non-ASCII characters on some systems.
with open(json_filename, "r", encoding="utf-8") as json_file:
    phenomena_mappings = json.load(json_file)

In [8]:
def consolidate_terms(phenomena_list):
    """
    Replace each phenomenon with its entry in ``phenomena_mappings``.

    Phenomena that have no entry in the mapping are kept unchanged. Returns a
    new list with the same length and order as ``phenomena_list``.
    """
    consolidated = []
    for term in phenomena_list:
        consolidated.append(phenomena_mappings.get(term, term))
    return consolidated

# Consolidate the phenomena of every row through the mapping
df['clean_processed_phenomena'] = list(map(consolidate_terms, df['clean_processed_phenomena']))

### Remove non-event related words

In [9]:
# Terms (already lowercased, without diacritics) that are removed from the
# phenomena lists because they are not treated as events.
non_meteorological_words = [
    'alunecare',
    'cer variabil',
    'cer senin',
    'cer acoperit',
    'cod portocaliu',
    'cod galben',
    'cutremur',
    'cer noros',
    'eroziune',
    'gheata',
    'ghetus',
    'instabilitate',
    'insorit',
    'mixt',
    'mixta',
    'posibil grindina',
    'posibil vijelii',
    'soare',
    'vreme frumoasa',
    'nori',
    'vreme schimbatoare',
    'vreme inchisa',
    'vreme insorita',
    'temperaturi in crestere',
    'incalzire',
    'racire',
    'racire accentuata',
]

def _drop_non_meteorological(terms):
    """Return ``terms`` without the entries listed in ``non_meteorological_words``."""
    return [term for term in terms if term not in non_meteorological_words]


# Strip the non-meteorological words from every row of 'clean_processed_phenomena'
df['clean_processed_phenomena'] = df['clean_processed_phenomena'].apply(_drop_non_meteorological)

### Translate events

In [10]:
# Translation dictionary: Romanian phenomenon (lowercase, no diacritics) -> English label.
# NOTE(review): 'inundatie' and 'inundatii' translate to two different labels
# ('flood' / 'floods'); confirm whether they should share a single label.
translation_dict = {
    'ninsori': 'snowfalls',
    'lapovita': 'sleet',
    'ploi': 'rain',
    'vant': 'wind',
    # "fulguieli" are light snow flurries, not lightning (lightning is "fulgere")
    'fulguieli': 'snow flurries',
    'inundatie': 'flood',
    'polei': 'glaze ice',
    'ger': 'frost',
    'ceata': 'fog',
    'canicula': 'heatwave',
    'negura': 'mist',
    'descarcari electrice': 'electrical discharges',
    'cald': 'warmth',
    'averse': 'showers',
    'frig': 'cold',
    'furtuna': 'storm',
    'grindina': 'hail',
    'vijelie': 'gust',
    'innorari': 'cloudiness',
    'avalansa': 'avalanche',
    'burnita': 'drizzle',
    'viscol': 'blizzard',
    # 'tunet' shares its label with 'descarcari electrice'
    'tunet': 'electrical discharges',
    'bruma': 'hoarfrost',
    'chiciura': 'rime',
    'seceta': 'drought',
    'instabilitate atmosferica': 'atmospheric instability',
    'ciclon': 'cyclone',
    'inundatii': 'floods',
    'incendii': 'fires',
    'disconfort termic': 'thermal discomfort',
    'uragan': 'hurricane',
    'tornada': 'tornado',
    'viitura': 'flash flood'
}

def _translate_phenomena(terms):
    """Translate each term through ``translation_dict``; untranslated terms stay as they are."""
    return [translation_dict[term] if term in translation_dict else term for term in terms]


# Translate the phenomena of every row
df['clean_processed_phenomena'] = df['clean_processed_phenomena'].apply(_translate_phenomena)

### Remove rows where list is empty

In [12]:
# A list is truthy only when it is non-empty, so the boolean cast flags the
# rows that still have at least one phenomenon left.
has_phenomena = df['clean_processed_phenomena'].astype(bool)

# Keep those rows and renumber the index from 0
df = df[has_phenomena].reset_index(drop=True)

### Export dataset

In [13]:
# Write the cleaned dataset to disk without the index column
output_path = '../data/datasets/dataset_phenomena_cleaned.csv'
df.to_csv(output_path, index=False)