## Text Preprocessing for Italian Documents

### Import packages

In [None]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter

### Download NLTK resources

In [None]:
# Download required NLTK resources
#   punkt      - tokenizer models read by older NLTK releases
#   punkt_tab  - tokenizer tables that NLTK 3.9 and later load instead of
#                `punkt`; without it nltk.word_tokenize raises LookupError
#   stopwords  - stopword lists, including the Italian one used below
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

### Import dataset from Hugging Face (optional)

In [None]:
#pip install datasets

# from datasets import load_dataset

# Load the CHANGE-IT dataset from Hugging Face
# dataset = load_dataset("gsarti/change_it", split="train")

# Convert Hugging Face dataset to Pandas DataFrame
# df = dataset.to_pandas()


### Sample **change-it** public dataset (optional)

In [None]:
# The sample datasets were generated by sampling the change-it datasets (you don't need to run this code)

# Load datasets
# df_repubblica = pd.read_csv("change-it/change-it.repubblica.train.csv", sep=',')
# df_ilgiornale = pd.read_csv("change-it/change-it.ilgiornale.train.csv", sep=',')

# Extract a 1% sample
# df_repubblica_sample = df_repubblica.sample(frac=0.01, random_state=42)
# df_ilgiornale_sample = df_ilgiornale.sample(frac=0.01, random_state=42)

# Save the sample
# df_repubblica_sample.to_csv("data/repubblica_sample.csv", index=False)
# df_ilgiornale_sample.to_csv("data/ilgiornale_sample.csv", index=False)

### Load dataset (stored in **data** folder)

In [None]:
# Load the Repubblica sample CSV; later cells read its `full_text` column
df = pd.read_csv(filepath_or_buffer="data/repubblica_sample.csv")

In [None]:
# Italian stopwords from NLTK, held in a set so membership checks are cheap
stop_words = {*stopwords.words('italian')}

### Text cleaning (without stemming)

In [None]:
def clean_text_no_stemming(text):
    """Normalise a raw document into a space-separated string of content words.

    Steps, in order: strip HTML markup, drop URLs and e-mail addresses,
    lowercase, remove punctuation (apostrophes and dashes are kept), remove
    digits, tokenize with NLTK's Italian models and discard every token found
    in the module-level ``stop_words`` set.

    Args:
        text: Raw document text. Any non-string value (for example the NaN
            that pandas uses for an empty cell) is treated as an empty
            document.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left.
    """
    # Missing cells arrive as NaN/None when this runs under Series.apply;
    # return an empty document instead of failing inside the HTML parser.
    if not isinstance(text, str):
        return ''

    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Remove URLs and email addresses
    #    (`http\S+` already matches https links, so no separate alternative)
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # 3. Lowercase
    text = text.lower()

    # 4. Remove punctuation (keep apostrophes and dashes)
    #    Typographic quotes are first mapped to the ASCII apostrophe: the
    #    character class below only spares `'`, so "l’economia" would
    #    otherwise collapse into "leconomia".
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    text = re.sub(r"[^\w\s'-]", '', text)

    # 5. Remove digits
    text = re.sub(r'\d+', '', text)

    # 6. Tokenize and remove stopwords
    tokens = nltk.word_tokenize(text, language="italian")
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)


### Text cleaning (stemming)

In [None]:
# Snowball stemmer configured for Italian, shared by every call below
stemmer = SnowballStemmer("italian")

def apply_stemming(text):
    """Stem each whitespace-separated token of ``text`` and re-join with single spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())


### Try methods on a sample sentence (playground)

In [None]:
sample_text = "I giornalisti stavano scrivendo articoli molto interessanti sull'economia italiana."

# Step 1: cleaning
cleaned = clean_text_no_stemming(sample_text)

# Step 2: stemming (runs on the already-cleaned text)
stemmed = apply_stemming(cleaned)

# Report the result of each stage
print("Clean text:", cleaned)
print("After stemming:", stemmed)


### Apply text cleaning to the dataset

In [None]:
# Run both preprocessing stages over the corpus:
#   full_text -> cleaned_text -> stemmed_text
df['cleaned_text'] = df['full_text'].map(clean_text_no_stemming)
df['stemmed_text'] = df['cleaned_text'].map(apply_stemming)


In [None]:
# Display the DataFrame (the notebook renders a cell's last expression)
df

In [None]:

# --- Optional: remove most/least frequent words ---

# Count every token across all cleaned documents
all_words = ' '.join(df['cleaned_text']).split()
word_freq = Counter(all_words)

# Words at the two ends of the frequency ranking: the first 10 and the last 10
most_common = {word for word, _count in word_freq.most_common(10)}
least_common = {word for word, _count in word_freq.most_common()[-10:]}

def remove_common_rare_words(text):
    """Return ``text`` without the tokens found in ``most_common`` or ``least_common``."""
    kept = (
        token for token in text.split()
        if not (token in most_common or token in least_common)
    )
    return ' '.join(kept)

# Additional cleaning step: filter each cleaned document
df['final_text'] = df['cleaned_text'].map(remove_common_rare_words)

# Preview the original, cleaned and filtered text together
print(df[['full_text', 'cleaned_text', 'final_text']].head())


### Compute word frequency

In [None]:
# Flatten the cleaned documents into a single list of tokens
all_words = [token for document in df['cleaned_text'] for token in document.split()]

# Tally how often each token occurs
word_freq = Counter(all_words)

# Head of the frequency ranking: the 10 most frequent words
print("Most frequent words:")
print(word_freq.most_common(10))

# Tail of the frequency ranking: the 10 least frequent words
print("\nLeast frequent words:")
print(word_freq.most_common()[-10:])


📊 Visualize word frequency

In [None]:
# Split the ten top-ranked (word, count) pairs into two parallel tuples
common_words, common_counts = zip(*word_freq.most_common(10))

# Bar chart: one bar per word, bar height = number of occurrences
plt.figure(figsize=(10, 5))
plt.bar(x=common_words, height=common_counts)
plt.xticks(rotation=45)
plt.ylabel("Frequency")
plt.title("Top 10 Most Frequent Words")
plt.show()


🧹 Define frequent and rare word sets

In [None]:
# Create sets of most and least frequent words.
# Counter.most_common yields (word, count) pairs, so each pair is unpacked and
# only the word is kept. Storing the pairs themselves would make every
# `word not in most_common` test succeed, and nothing would be filtered out.
most_common = {word for word, _count in word_freq.most_common(10)}
least_common = {word for word, _count in word_freq.most_common()[-10:]}

print("Words to remove (most frequent):", most_common)
print("Words to remove (least frequent):", least_common)


🛠️ Define the filtering function

In [None]:
# Drop every token that belongs to either removal set
def remove_common_rare_words(text):
    """Return ``text`` without the tokens found in ``most_common`` or ``least_common``.

    Tokens are split on whitespace and the survivors are re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        if token in most_common or token in least_common:
            continue
        kept.append(token)
    return ' '.join(kept)


🧪 Apply filtering and compare results

In [None]:
# Pass every cleaned document through remove_common_rare_words
df['final_text'] = df['cleaned_text'].map(remove_common_rare_words)

# Preview the original, cleaned and filtered versions of the first rows
df[['full_text', 'cleaned_text', 'final_text']].head()

☁️ Generate word clouds (before and after)

In [None]:
from wordcloud import WordCloud

# One corpus string per stage: before filtering (cleaned_text) and after (final_text)
text_before = ' '.join(df['cleaned_text'])
text_after = ' '.join(df['final_text'])

# Both clouds use the same canvas settings
wordcloud_before = WordCloud(background_color='white', width=800, height=400).generate(text_before)
wordcloud_after = WordCloud(background_color='white', width=800, height=400).generate(text_after)


🖼️ Display the word clouds side by side

In [None]:
import matplotlib.pyplot as plt

# Display side-by-side comparison: one row, two panels
plt.figure(figsize=(16, 6))

panels = [
    (wordcloud_before, 'Before Removing Common/Rare Words'),
    (wordcloud_after, 'After Removing Common/Rare Words'),
]
for slot, (cloud, caption) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(caption)
    plt.axis('off')

plt.tight_layout()
plt.show()
