### Import libraries

In [1]:
import pandas as pd
import warnings
import ast

warnings.simplefilter(action = "ignore", category = FutureWarning)

### Read datasets

In [2]:
df_counties = pd.read_excel('../data/data_tabular/counties_preprocessed.xlsx')
df_phenomena = pd.read_excel('../data/data_crawl/meteo_history.xlsx')

### Drop rows where phenomena was not extract correctly

In [3]:
def is_convertible_to_list(s):
    try:
        ast.literal_eval(s)
        return True
    except (ValueError, SyntaxError):
        return False

# Filter out rows that cannot be converted to lists
df_phenomena = df_phenomena[df_phenomena['region_phenomena'].apply(is_convertible_to_list)].reset_index(drop=True)

### Get the column ready to be exploded

In [4]:
# Normalize the 'region_phenomena' column to include regions and phenomena
def normalize_region_and_phenomena(row):
    region_phenomena_pairs = []
    region_phenomena_list = ast.literal_eval(row['region_phenomena'])
    for region_dict in region_phenomena_list:
        for region, phenomena in region_dict.items():
            region_phenomena_pairs.append({
                'region': region,
                'phenomena': phenomena
            })
    return region_phenomena_pairs

df_phenomena['region_phenomena_pairs'] = df_phenomena.apply(normalize_region_and_phenomena, axis=1)

### Explode the column region_phenomena_pairs and drop the unnecessary ones

In [5]:
# Explode the 'region_phenomena_pairs' column
df_phenomena = df_phenomena.explode('region_phenomena_pairs')

# Create new columns for 'region' and 'phenomena'
df_phenomena['region'] = df_phenomena['region_phenomena_pairs'].apply(lambda x: x['region'] if pd.notna(x) else None)
df_phenomena['phenomena'] = df_phenomena['region_phenomena_pairs'].apply(lambda x: x['phenomena'] if pd.notna(x) else None)

# Drop the 'region_phenomena_pairs' column as it's no longer needed
df_phenomena.drop(['region_phenomena_pairs', 'region_phenomena'], axis=1, inplace=True)
# Drop rows where phenomena is empty list
df_phenomena = df_phenomena.loc[df_phenomena['phenomena'].apply(lambda x: x != [])]

df_phenomena['article_date'] = pd.to_datetime(df_phenomena['article_date'])
df_phenomena['article_date'] = df_phenomena['article_date'].dt.strftime('%d-%m-%Y')

In [6]:
df_phenomena = df_phenomena.groupby(['article_date', 'region'])['phenomena'].sum()
df_phenomena = df_phenomena.reset_index()
df_phenomena['article_date'] = pd.to_datetime(df_phenomena['article_date'])
df_phenomena.sort_values('article_date', inplace=True)
df_phenomena.reset_index(drop=True, inplace=True)

  df_phenomena['article_date'] = pd.to_datetime(df_phenomena['article_date'])


### Perform mapping

In [7]:
regions = df_phenomena.region.value_counts(dropna=False).head(9).index.to_list()
counties = df_counties.County.value_counts(dropna=False).index.to_list()

- Muntenia - B
- Transilvania - CJ, BV
- Dobrogea - CT
- Moldova - IS
- Oltenia - DJ
- Banat - TM
- Maramureș - NaN
- Bucovina - NaN
- Crișana - NaN

In [8]:
# Create a mapping dictionary
region_mapping = {
    'Muntenia': 'B',
    'Transilvania': 'CJ, BV',
    'Dobrogea': 'CT',
    'Moldova': 'IS',
    'Oltenia': 'DJ',
    'Banat': 'TM',
    'Maramureș': 'NaN',
    'Bucovina': 'NaN',
    'Crișana': 'NaN'
}

# Map the 'region' column to the new values
df_phenomena['County'] = df_phenomena['region'].map(region_mapping)

# If you want to replace NaN strings with actual NaN values
df_phenomena['County'].replace('NaN', pd.NA, inplace=True)

# Split the 'County' values by the comma and explode the DataFrame
df_phenomena['County'] = df_phenomena['County'].str.split(', ')
df_phenomena = df_phenomena.explode('County').reset_index(drop=True)

df_phenomena = df_phenomena.dropna().reset_index(drop=True)

### Merge the 2 dataframes

In [9]:
# Make sure the datetime columns are the same
df_counties['Luna/Zi'] = df_counties['Luna/Zi'].dt.strftime('%d-%m-%Y')
df_counties['Luna/Zi'] = pd.to_datetime(df_counties['Luna/Zi'])
df = pd.merge(df_counties, df_phenomena, how='left', left_on=['Luna/Zi', 'County'], right_on=['article_date', 'County'])

  df_counties['Luna/Zi'] = pd.to_datetime(df_counties['Luna/Zi'])


In [10]:
df = pd.merge(df_counties, df_phenomena, how='inner', left_on=['Luna/Zi', 'County'], right_on=['article_date', 'County'])

### Export dataframe

In [12]:
df.to_csv('../data/datasets/dataset_merged.csv', index=False)