# Preprocessing Data (without using a model)

## [1] Importing libraries

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent of the current working directory to the import path so that
# the `src` imports below can be resolved when the notebook is run from its
# own folder. NOTE(review): this relies on the kernel's working directory
# being the notebook's folder - confirm when running from elsewhere.
sys.path.append(os.path.abspath('..')) 
from src import config 
from src.loading_data import load_multiple_excel

## [2] Using the function to load files

In [2]:
# Workbooks to read: each keyword is the name the resulting DataFrame is
# stored under in `dataframes`, each value is the Excel file name that is
# looked up under config.RAW_DATA_PATH.
files_to_load = dict(
    df_ED_sentiment_1='earth_day_tweets_sentiment_50k_(1).xlsx',
    df_ED_sentiment_2='earth_day_tweets_sentiment_50k_(2).xlsx',
    df_fifa='fifa_world_cup_2022_tweets_sentiment_22k.xlsx',
    df_generic='generic_27k.xlsx',
)

# Read every listed workbook in one call; the result is indexed by the names above.
dataframes = load_multiple_excel(config.RAW_DATA_PATH, files_to_load)

Attempting to load files from base path: c:\Users\loren\OneDrive\Documenti\UniMiB\Data_Processing_and_Analysis\Lezione\Lorenzo_project\src\../data/raw/
Loading: c:\Users\loren\OneDrive\Documenti\UniMiB\Data_Processing_and_Analysis\Lezione\Lorenzo_project\src\../data/raw/earth_day_tweets_sentiment_50k_(1).xlsx as 'df_ED_sentiment_1'
Successfully loaded: earth_day_tweets_sentiment_50k_(1).xlsx
Loading: c:\Users\loren\OneDrive\Documenti\UniMiB\Data_Processing_and_Analysis\Lezione\Lorenzo_project\src\../data/raw/earth_day_tweets_sentiment_50k_(2).xlsx as 'df_ED_sentiment_2'
Successfully loaded: earth_day_tweets_sentiment_50k_(2).xlsx
Loading: c:\Users\loren\OneDrive\Documenti\UniMiB\Data_Processing_and_Analysis\Lezione\Lorenzo_project\src\../data/raw/fifa_world_cup_2022_tweets_sentiment_22k.xlsx as 'df_fifa'
Successfully loaded: fifa_world_cup_2022_tweets_sentiment_22k.xlsx
Loading: c:\Users\loren\OneDrive\Documenti\UniMiB\Data_Processing_and_Analysis\Lezione\Lorenzo_project\src\../data/ra

## [3] Showing categories distributions

Renaming the columns of `df_fifa` and `df_generic` to snake_case

In [3]:
# --- Renaming specific columns ---
# Normalises the column names of 'df_fifa' and 'df_generic' to snake_case.
# The DataFrames stored in `dataframes` are modified in place, so every
# existing reference to them sees the new column names.
print("Starting specific column renaming...")

# Map for df_fifa: {original_name: new_name}
fifa_rename_map = {
    'Date Created': 'date_created',
    'Number of Likes': 'number_of_likes',
    'Source of Tweet': 'source_of_tweet',
    'Tweet': 'tweet',
    'Sentiment': 'sentiment',
}
# Map for df_generic: {original_name: new_name}
generic_rename_map = {
    'textID': 'text_id',
}

# One entry per DataFrame: (key in `dataframes`, rename map, columns to drop first).
# 'Unnamed: 0' is removed from df_fifa because it only repeats the index.
rename_plan = [
    ('df_fifa', fifa_rename_map, ['Unnamed: 0']),
    ('df_generic', generic_rename_map, []),
]

for df_key, rename_map, redundant_columns in rename_plan:
    # Skip (with a message) any DataFrame that was not loaded.
    if df_key not in dataframes:
        print(f"  - '{df_key}' not found in dataframes, skipping rename.")
        continue

    target_df = dataframes[df_key]
    print(f"  - Renaming columns in '{df_key}'...")
    print(f"    - Original columns: {target_df.columns.tolist()}")

    if redundant_columns:
        # errors='ignore' keeps this cell re-runnable: on a second execution the
        # column is already gone and a plain drop() would raise KeyError.
        target_df.drop(columns=redundant_columns, inplace=True, errors='ignore')

    # rename() silently skips labels that are absent, so re-running is a no-op.
    target_df.rename(columns=rename_map, inplace=True)
    print(f"    - New columns: {target_df.columns.tolist()}")

print("Column renaming finished.")

Starting specific column renaming...
  - Renaming columns in 'df_fifa'...
    - Original columns: ['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet', 'Tweet', 'Sentiment']
    - New columns: ['date_created', 'number_of_likes', 'source_of_tweet', 'tweet', 'sentiment']
  - Renaming columns in 'df_generic'...
    - Original columns: ['textID', 'text', 'sentiment']
    - New columns: ['text_id', 'text', 'sentiment']
Column renaming finished.


Based on what we discovered in the EDA notebook, we now look at the categories found in the relevant columns of each DataFrame.

In [4]:
# Builds, for each loaded DataFrame, a small DataFrame listing the distinct
# non-missing values of its categorical columns. Results are collected in
# `unique_categories_dfs`, keyed by the short names given in `name_mapping`.

# Categorical columns to inspect, keyed by the names used in `dataframes`.
columns_per_df = {
    'df_ED_sentiment_1': ['sentiment', 'emotion'],
    'df_ED_sentiment_2': ['sentiment', 'emotion'],
    'df_fifa': ['sentiment'],
    'df_generic': ['sentiment'],
}

# Source DataFrame name -> name of the resulting "unique values" DataFrame.
name_mapping = {
    'df_ED_sentiment_1': 'cat_sent_1',
    'df_ED_sentiment_2': 'cat_sent_2',
    'df_fifa': 'fifa_sent',
    'df_generic': 'generic_sent',
}

# Output container: {new_name: DataFrame of unique values}.
unique_categories_dfs = {}

print("Start creating DataFrames with unique values specified for DataFrame...")

for original_name, new_name in name_mapping.items():

    # Guard: the source DataFrame must have been loaded.
    if original_name not in dataframes:
        print(f"\nAttention: original DataFrame '{original_name}' not found in 'dataframes' dictionary. Skipped.")
        continue

    df_original = dataframes[original_name]
    print(f"\nProcessing: '{original_name}' -> '{new_name}'")

    # Guard: a column list must have been configured for this DataFrame.
    if original_name not in columns_per_df:
        print(f"  - Attention: didn't found specified columns '{original_name}'. Skipped.")
        continue

    columns_to_analyze_for_this_df = columns_per_df[original_name]
    print(f"  - Columns to analyze: {columns_to_analyze_for_this_df}")

    # Distinct values per column, each stored as its own Series.
    unique_series_dict = {}

    for col_name in columns_to_analyze_for_this_df:
        if col_name not in df_original.columns:
            print(f"    - Column '{col_name}' not found in '{original_name}'. Skipped.")
            continue

        try:
            # Missing values are removed first so NaN never shows up as a category.
            unique_values = df_original[col_name].dropna().unique()

            print(f"    - Column '{col_name}': found {len(unique_values)} unique values.")

            unique_series_dict[col_name] = pd.Series(unique_values, name=col_name)
        except Exception as exc:
            # Best effort: report the problematic column and carry on with the rest.
            print(f"    - Error: couldn't process the column '{col_name}' in '{original_name}': {exc}")

    # Guard: nothing usable was collected for this DataFrame.
    if not unique_series_dict:
        print(f"  -> Didn't found/processed any valid column for '{original_name}'. didn't create '{new_name}' Dataframe.")
        continue

    # Place the Series side by side; columns with fewer categories are padded with NaN.
    df_new_unique = pd.concat(
        list(unique_series_dict.values()),
        axis=1,
        keys=list(unique_series_dict.keys()),
    )

    unique_categories_dfs[new_name] = df_new_unique
    print(f"  -> Created DataFrame '{new_name}' with shape {df_new_unique.shape}")

print("\n--- Operation complete ---")

Start creating DataFrames with unique values specified for DataFrame...

Processing: 'df_ED_sentiment_1' -> 'cat_sent_1'
  - Columns to analyze: ['sentiment', 'emotion']
    - Column 'sentiment': found 2 unique values.
    - Column 'emotion': found 4 unique values.
  -> Created DataFrame 'cat_sent_1' with shape (4, 2)

Processing: 'df_ED_sentiment_2' -> 'cat_sent_2'
  - Columns to analyze: ['sentiment', 'emotion']
    - Column 'sentiment': found 2 unique values.
    - Column 'emotion': found 4 unique values.
  -> Created DataFrame 'cat_sent_2' with shape (4, 2)

Processing: 'df_fifa' -> 'fifa_sent'
  - Columns to analyze: ['sentiment']
    - Column 'sentiment': found 3 unique values.
  -> Created DataFrame 'fifa_sent' with shape (3, 1)

Processing: 'df_generic' -> 'generic_sent'
  - Columns to analyze: ['sentiment']
    - Column 'sentiment': found 3 unique values.
  -> Created DataFrame 'generic_sent' with shape (3, 1)

--- Operation complete ---


Category distribution: number of unique values for each column of each DataFrame

In [5]:
# Reports, for every loaded DataFrame, how many distinct non-missing values
# each of its columns contains.
print("--- Unique values count for every column of each DataFrame ---")

for df_name, current_df in dataframes.items():

    print(f"\n--- DataFrame: '{df_name}' ---")
    print("Counting unique values for each column:")

    for col_name in current_df.columns:
        # nunique() leaves missing values out of the count by default.
        unique_count = current_df[col_name].nunique()
        print(f"  - Column '{col_name}': {unique_count} unique values")

    # Separator line for readability between DataFrames.
    print("-" * 30)

print("\n--- Fine Conteggio Globale ---")

--- Unique values count for every column of each DataFrame ---

--- DataFrame: 'df_ED_sentiment_1' ---
Counting unique values for each column:
  - Column 'text': 27009 unique values
  - Column 'hash_tags': 8566 unique values
  - Column 'account_tags': 13643 unique values
  - Column 'sentiment': 2 unique values
  - Column 'emotion': 4 unique values
------------------------------

--- DataFrame: 'df_ED_sentiment_2' ---
Counting unique values for each column:
  - Column 'text': 26185 unique values
  - Column 'hash_tags': 8233 unique values
  - Column 'account_tags': 13209 unique values
  - Column 'sentiment': 2 unique values
  - Column 'emotion': 4 unique values
------------------------------

--- DataFrame: 'df_fifa' ---
Counting unique values for each column:
  - Column 'date_created': 14412 unique values
  - Column 'number_of_likes': 271 unique values
  - Column 'source_of_tweet': 109 unique values
  - Column 'tweet': 22360 unique values
  - Column 'sentiment': 3 unique values
--------