In [1]:
import pandas as pd
import numpy as np

In [13]:
# Load the Warriner word list (path is relative to the notebook's working directory).
warriner_df = pd.read_csv('../data/wordlists/warriner.csv', low_memory=False)

print(warriner_df.head())
# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() would emit a stray "None" line.
warriner_df.info()

# Clean the column names: strip whitespace, lowercase, and replace '.' with '_'
# (e.g. 'V.Mean.Sum' -> 'v_mean_sum').
warriner_df.columns = [col.strip().lower().replace('.', '_') for col in warriner_df.columns]

# Drop the index-like columns that pandas labels 'Unnamed: N'. The names have
# already been lowercased above, so the match has to be case-insensitive;
# a case-sensitive '^Unnamed' would never match here.
warriner_df = warriner_df.loc[:, ~warriner_df.columns.str.contains('^unnamed', case=False)]

# Print column names to inspect them.
print("Column names after cleaning:", warriner_df.columns.tolist())

# Keep only the word and its overall valence / arousal / dominance mean columns
# (lowercase names, because all column names were lowercased above).
required_cols = ['word', 'v_mean_sum', 'a_mean_sum', 'd_mean_sum']
missing_cols = [col for col in required_cols if col not in warriner_df.columns]
if missing_cols:
    # Same exception type the column selection itself would raise, but with a clearer message.
    raise KeyError(f"Expected columns missing after cleaning: {missing_cols}")
warriner_clean = warriner_df[required_cols]

# Rename the score columns to descriptive names.
warriner_clean = warriner_clean.rename(columns={
    'v_mean_sum': 'valence_score',
    'a_mean_sum': 'arousal_score',
    'd_mean_sum': 'dominance_score'
})

# Report missing values per column, then drop every row that has any.
print(warriner_clean.isnull().sum())
warriner_clean = warriner_clean.dropna()

# Inspect the observed range of each score. No rescaling is applied here; this
# only shows whether normalization to another scale would be needed.
for label, score_col in [
    ('Valence', 'valence_score'),
    ('Arousal', 'arousal_score'),
    ('Dominance', 'dominance_score'),
]:
    print(f"{label} range:", warriner_clean[score_col].min(), "to", warriner_clean[score_col].max())

# Save the cleaned dataset (without the DataFrame index).
warriner_clean.to_csv('../data/cleaned/warriner_clean.csv', index=False)

          Word                   V.Mean.Sum  V.SD.Sum  V.Rat.Sum  A.Mean.Sum  \
0     1   aardvark                     6.26      2.21         19        2.41   
1     2   abalone                      5.30      1.59         20        2.65   
2     3   abandon                      2.84      1.54         19        3.73   
3     4   abandonment                  2.63      1.74         19        4.95   
4     5   abbey                        5.85      1.69         20        2.20   

   A.SD.Sum  A.Rat.Sum  D.Mean.Sum  D.SD.Sum  ...  A.Rat.L  A.Mean.H   A.SD.H  \
0      1.40         22        4.27      1.75  ...       11      2.55   1.29     
1      1.90         20        4.95      1.79  ...       12      2.38   1.92     
2      2.43         22        3.32      2.50  ...       11      3.82   2.14     
3      2.64         21        2.64      1.81  ...       14      5.29   2.63     
4      1.70         20        5.00      2.02  ...        9      2.55   1.92     

   A.Rat.H  D.Mean.L  D.SD.L  D.