In [51]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [52]:
# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [53]:
data_path = "../data/dataset_task2.csv"

In [54]:
test_df = pd.read_csv(data_path)

print("Number of documents:", test_df.shape[0])
print("Number of features:", test_df.shape[1])

test_df.columns

Number of documents: 5475
Number of features: 4


Index(['id', 'country', 'keyword', 'post content'], dtype='object')

In [55]:
test_df.head()

Unnamed: 0,id,country,keyword,post content
0,R0000,Uruguay,gay,"creo q es esta, la de u r gay y otra más que n..."
1,R0001,Uruguay,transgénero,Que tal peligroso es ser gay/trans?\nLes tengo...
2,R0002,Uruguay,transexual,Es un mundo de diferencia entre ser homosexual...
3,R0003,Uruguay,lesbiana,Te cuento mi perspectiva como lesbiana viviend...
4,R0004,Uruguay,transexuales,"No es peligroso, en general. Pueden sufrir dis..."


In [56]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5475 entries, 0 to 5474
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            5475 non-null   object
 1   country       5475 non-null   object
 2   keyword       5475 non-null   object
 3   post content  5475 non-null   object
dtypes: object(4)
memory usage: 171.2+ KB


In [57]:
# Country values
test_df["country"].value_counts()

country
Uruguay                 1770
Panama                  1058
Peru                     580
Paraguay                 541
Puerto Rico              378
Guatemala                376
Venezuela                232
Honduras                 154
El Salvador              100
Republica Dominicana     100
Nicaragua                 69
Bolivia                   49
Ecuador                   46
Costa Rica                17
Cuba                       5
Name: count, dtype: int64

In [58]:
# Keyword values
test_df["keyword"].value_counts()

keyword
trans            1309
gay              1177
LGBT              860
gays              675
lgbt              346
lesbianas         145
lesbiana          144
bisexual          128
queer              88
transexuales       70
transgénero        68
bisexuales         63
transexual         55
travestis          52
Trans              39
travesti           39
Gay                33
asexual            31
pansexual          24
transgenero        24
intersexual        11
Gays               11
intersexuales      10
Bisexual            8
Queer               8
pansexuales         7
TRANS               7
Lgbt                6
GAY                 5
asexuales           5
Asexual             4
Travesti            3
Transgenero         3
Pansexual           3
GAYS                2
LgBT                2
Bisexuales          2
queers              1
Lesbianas           1
Asexuales           1
Lesbiana            1
Intersexual         1
Transexual          1
Transexuales        1
Transgénero         1
Na

### Text Cleaning

In [59]:
import re
import html

def clean_reddit_text(text):
    """
    Clean and normalize Reddit post text for sentiment analysis
    """
    if not isinstance(text, str):
        return ""
    
    # Unescape HTML entities
    text = html.unescape(text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove subreddit references and user mentions
    text = re.sub(r'/?[ru]/[A-Za-z0-9_-]+', '', text)
    
    # Remove markdown formatting
    text = re.sub(r'\*\*|\*|~~|__|_|\[|\]|\(|\)|>', '', text)
    
    # Handle [deleted] and [removed]
    text = re.sub(r'\[deleted\]|\[removed\]', '', text)
    
    # Remove edit notes
    text = re.sub(r'edit\s*:', '', text, flags=re.IGNORECASE)
    
    # Clean up whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

In [60]:
# Clean texts
test_df["post content"] = test_df["post content"].apply(clean_reddit_text)
# To lowercase
test_df["post content"] = test_df["post content"].str.lower()

print("Number of documents after cleaning:", test_df.shape[0])

Number of documents after cleaning: 5475


In [61]:
# Remove empty texts
test_df = test_df[test_df["post content"].str.strip() != ""]
print("Number of documents after removing empty texts:", test_df.shape[0])

Number of documents after removing empty texts: 5475


In [62]:
test_df.head()

Unnamed: 0,id,country,keyword,post content
0,R0000,Uruguay,gay,"creo q es esta, la de u r gay y otra más que n..."
1,R0001,Uruguay,transgénero,que tal peligroso es ser gay/trans? les tengo ...
2,R0002,Uruguay,transexual,es un mundo de diferencia entre ser homosexual...
3,R0003,Uruguay,lesbiana,te cuento mi perspectiva como lesbiana viviend...
4,R0004,Uruguay,transexuales,"no es peligroso, en general. pueden sufrir dis..."


In [63]:
test_df["post content"][0]

'creo q es esta, la de u r gay y otra más que no me acuerdo acá hay un articulo sobre todas las menciones de uy en los simpson'

In [64]:
text_length=test_df["post content"].apply(lambda text: len(text.split()))
text_length.describe(percentiles=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

count    5475.000000
mean       97.989041
std       147.993694
min         1.000000
25%        28.000000
50%        59.000000
75%       115.000000
90%       212.000000
95%       305.600000
99%       621.560000
max      4346.000000
Name: post content, dtype: float64

In [65]:
# Save cleaned data
test_df.to_csv("../data/cleaned_dataset_task2.csv", index=False)