In [None]:
# Imports of libraries and data 

import pandas as pd

# Load the cleaned dataset
cleaned_df = pd.read_csv("updated_cleaned_data.csv")

# Load the translated dataset and, in the same step, keep only the WONUM
# column plus the two English text columns
translated_df = pd.read_csv("translated_descriptors.csv")[
    ['WONUM', 'Description_EN', 'Observations_EN']
]

In [3]:
# Merge the English translations into cleaned_df on WONUM, keeping every row of
# cleaned_df exactly once.
translation_cols = ['Description_EN', 'Observations_EN']

# Drop any translation columns already present so that re-running this cell
# replaces them instead of producing suffixed duplicates.
cleaned_df = cleaned_df.drop(columns=translation_cols, errors='ignore')

# A left merge emits one output row per matching right-hand row, so a WONUM that
# occurs more than once in translated_df would duplicate records in cleaned_df.
# Keep only the first translation for each WONUM and report how many were dropped.
translations = translated_df[['WONUM'] + translation_cols]
duplicate_keys = translations['WONUM'].duplicated(keep='first')
if duplicate_keys.any():
    print(f"Warning: {int(duplicate_keys.sum())} duplicate WONUM rows in the translations were ignored (first kept).")
    translations = translations[~duplicate_keys]

# Now merge with the (de-duplicated) translated data
cleaned_df = cleaned_df.merge(translations, on='WONUM', how='left')

# Optional: Check for any failed merges (where translation was missing)
missing_translations = cleaned_df[cleaned_df['Description_EN'].isna() | cleaned_df['Observations_EN'].isna()]
if not missing_translations.empty:
    print(f"Warning: {len(missing_translations)} records are missing translations.")
    # Uncomment below to inspect
    # print(missing_translations[['WONUM', 'Description_EN', 'Observations_EN']])

In [4]:
# Parse the 'Event Date' column into pandas datetime values
event_dates = cleaned_df['Event Date']
cleaned_df['Event Date'] = pd.to_datetime(event_dates)

# Reorder columns: place Description_EN directly after Description and
# Observations_EN directly after Observations; every other column keeps its
# current relative position.
translation_after = {
    'Description': 'Description_EN',
    'Observations': 'Observations_EN',
}

cols = cleaned_df.columns.tolist()

# A translated column is only moved when both it and its source column exist;
# otherwise it stays where it is.
movable = {
    translated
    for source, translated in translation_after.items()
    if source in cols and translated in cols
}

new_cols = []
for col in cols:
    if col in movable:
        # Skipped here; it is emitted right after its source column instead,
        # regardless of where it currently sits in the frame.
        continue
    new_cols.append(col)
    translated = translation_after.get(col)
    if translated in movable:
        new_cols.append(translated)

# Remove duplicates while preserving order (dicts keep insertion order)
final_cols = list(dict.fromkeys(new_cols))

# Reassign columns
cleaned_df = cleaned_df[final_cols]

# Sort by Event Date (oldest first) and renumber the index.
# The default sort algorithm is not stable, so rows sharing the same Event Date
# could come out in any relative order; mergesort is stable and keeps such rows
# in their existing order, which makes the saved output reproducible.
cleaned_df = (
    cleaned_df
    .sort_values(by='Event Date', ascending=True, kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Write the updated cleaned DataFrame to a new CSV file, omitting the index column
cleaned_df.to_csv(path_or_buf="input_dataset.csv", index=False)