In [1]:
import kaggle
import os
from groq import Groq
import zipfile
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import traceback
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from dotenv import load_dotenv
import torch_directml
import emoji
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
# Inizializzare DirectML
device = torch_directml.device()

# Esempio di tensore su GPU AMD tramite DirectML
x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
print(x)

# Esegui qualche operazione
y = x * 2
print(y)

tensor([1., 2., 3., 4.], device='privateuseone:0')
tensor([2., 4., 6., 8.], device='privateuseone:0')


In [11]:
def download_kaggle_dataset(dataset, path):
    kaggle.api.dataset_download_files(dataset, path=path, unzip=True)

# Specifica il dataset che vuoi scaricare
dataset = "danofer/sarcasm"  # Dataset di sarcasmo

# Specifica la directory in cui salvare il dataset
save_dir = "./datasets/"

# Crea la directory se non esiste
os.makedirs(save_dir, exist_ok=True)

# Scarica il dataset
print(f"Downloading {dataset}...")
download_kaggle_dataset(dataset, save_dir)

print("Download completato!")

# Lista i file scaricati
print("\nFile scaricati:")
for file in os.listdir(save_dir):
    print(f"- {file}")

Downloading danofer/sarcasm...
Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
Download completato!

File scaricati:
- test-balanced.csv
- test-unbalanced.csv
- train-balanced-sarc.csv.gz
- train-balanced-sarcasm.csv


In [12]:
try:
    import torch_directml
    device = torch_directml.device()
    print("Utilizzo del dispositivo DirectML")
except ImportError:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"DirectML non disponibile, utilizzo di {device}")

Utilizzo del dispositivo DirectML


In [51]:
import os
import pandas as pd
import re
import emoji
from collections import Counter

df_file = os.path.join("datasets", "train-balanced-sarcasm.csv")

# Funzione per pulire il testo
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # Rimuove URL
    text = re.sub(r'@\w+', '', text)  # Rimuove menzioni
    text = re.sub(r'#\w+', '', text)  # Rimuove hashtag
    text = re.sub(r'\d+', '', text)  # Rimuove numeri
    text = re.sub(r'[^\w\s]', '', text)  # Rimuove punteggiatura
    return text.lower().strip()

# Funzione per verificare se il testo è valido
def is_valid_text(text):
    # Rimuove emoji
    text = emoji.replace_emoji(text, '')
    # Rimuove spazi extra
    text = re.sub(r'\s+', ' ', text).strip()
    # Verifica se il testo ha almeno 3 parole e più di 10 caratteri
    words = text.split()
    
    # Verifica se ci sono parole ripetute più di 5 volte
    word_counts = Counter(words)
    if any(count > 5 for count in word_counts.values()):
        return False
    
    return len(words) >= 3 and len(text) > 10 and not text.isnumeric()

# Carica i dati
print("Caricamento dei dati...")
df = pd.read_csv(df_file)
df = df[["label", "comment"]]

# Pulizia del dataset
print("Pulizia del dataset in corso...")
df['cleaned_comment'] = df['comment'].apply(clean_text)
df['is_valid'] = df['cleaned_comment'].apply(is_valid_text)
df = df[df['is_valid']]
df = df.drop('is_valid', axis=1)
df['comment'] = df['comment'].astype(str)

# Campionamento
sample = 200000
df = df.head(sample)

df.to_csv("train-balanced-sarcasm-cleaned.csv", index=False)

print("Dataset pulito e filtrato. Dimensioni del nuovo DataFrame:", df.shape)
print("\nPrime 5 righe del DataFrame pulito:")
print(df.head())

Caricamento dei dati...
Pulizia del dataset in corso...
Dataset pulito e filtrato. Dimensioni del nuovo DataFrame: (200000, 3)

Prime 5 righe del DataFrame pulito:
   label                                            comment  \
1      0  You do know west teams play against west teams...   
2      0  They were underdogs earlier today, but since G...   
3      0  This meme isn't funny none of the "new york ni...   
4      0                    I could use one of those tools.   
5      0  I don't pay attention to her, but as long as s...   

                                     cleaned_comment  
1  you do know west teams play against west teams...  
2  they were underdogs earlier today but since gr...  
3  this meme isnt funny none of the new york nigg...  
4                     i could use one of those tools  
5  i dont pay attention to her but as long as she...  


In [23]:
# Carica i modelli pre-addestrati e i tokenizer
print("Caricamento dei modelli e dei tokenizer...")
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name1).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name1)

# Crea i pipeline per l'analisi del sentiment
sentiment_pipeline = pipeline("sentiment-analysis", model=model1, tokenizer=tokenizer1, device=device)


Caricamento dei modelli e dei tokenizer...




In [24]:
import pandas as pd
from tqdm import tqdm

# Funzione per troncare il testo alla lunghezza massima accettata dal modello
def truncate_text(text, max_length=512):
    words = text.split()
    if len(words) > max_length:
        return " ".join(words[:max_length])
    return text

# Funzione per analizzare il sentiment
def analyze_sentiment(text):
    try:
        truncated_text = truncate_text(text)
        result = sentiment_pipeline(truncated_text)[0]
        label = result['label']
        if label == 'LABEL_0':
            return 'Negativo'
        elif label == 'LABEL_1':
            return 'Neutro'
        else:
            return 'Positivo'
    except Exception as e:
        print(f"Errore nell'analisi del sentiment per il testo: {text[:50]}...")
        print(f"Errore: {str(e)}")
        return "Errore"

# Funzione per processare il dataframe
def process_dataframe(df):
    tqdm.pandas()
    
    # Analizza il sentiment
    df['sentiment'] = df['cleaned_comment'].progress_apply(analyze_sentiment)
    
    return df



# Processa il dataframe
print("Analisi del sentiment in corso...")
df_processed = process_dataframe(df)

# Mostra la distribuzione del sentiment
print("\nDistribuzione del sentiment:")
print(df_processed['sentiment'].value_counts())

# Mostra alcuni esempi
print("\nAlcuni esempi di analisi:")
sample_size = min(10, len(df_processed))
for _, row in df_processed.sample(sample_size).iterrows():
    print(f"Commento originale: {row['comment'][:100]}...")
    print(f"Commento pulito: {row['cleaned_comment'][:100]}...")
    print(f"Sentiment: {row['sentiment']}")
    print(f"Sarcastico: {'Sì' if row['label'] == 1 else 'No'}\n")

# Salva il risultato
output_file = 'df_with_sentiment.csv'
df_processed.to_csv(output_file, index=False)
print(f"\nDataset con analisi del sentiment salvato in {output_file}")

# Statistiche finali
print(f"\nRighe totali: {len(df_processed)}")
print(f"Distribuzione del sentiment:")
print(df_processed['sentiment'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

Analisi del sentiment in corso...


100%|██████████| 200000/200000 [43:16<00:00, 77.03it/s]



Distribuzione del sentiment:
sentiment
Neutro      103211
Negativo     68539
Positivo     28250
Name: count, dtype: int64

Alcuni esempi di analisi:
Commento originale: Socialism has had such a historically good track record how could you not love it!...
Commento pulito: socialism has had such a historically good track record how could you not love it...
Sentiment: Positivo
Sarcastico: Sì

Commento originale: I'm not sure who is the bigger hero here, the person who originally wrote the question or the person...
Commento pulito: im not sure who is the bigger hero here the person who originally wrote the question or the person w...
Sentiment: Neutro
Sarcastico: No

Commento originale: Let's be honest, that's the real most frequent misconception regarding rogue decks....
Commento pulito: lets be honest thats the real most frequent misconception regarding rogue decks...
Sentiment: Negativo
Sarcastico: No

Commento originale: Am I reading correctly that the MTSO report is dated in January 

In [50]:
df_processed = pd.read_csv(os.path.join("datasets", "df_with_sentiment.csv"))

df_processed = df_processed.drop('cleaned_comment', axis=1, errors='ignore')
# Dizionario per la mappatura
mappatura = {
    'Neutro': 'Neutral',
    'Negativo': 'Negative',
    'Positivo': 'Positive'
}

df_processed['sentiment'] = df_processed['sentiment'].map(mappatura)

# Verifica il risultato
print(df_processed['sentiment'].value_counts(dropna=False))

df_processed.to_csv(output_file, index=False)


sentiment
Neutral     103211
Negative     68539
Positive     28250
Name: count, dtype: int64


In [3]:
def adjust_sentiment(row):
    if row['label'] == 1:  # Se è sarcasmo
        if row['sentiment'] == 'Positivo':
            return 'Negativo'
        elif row['sentiment'] == 'Negativo':
            return 'Positivo'
    return row['sentiment']

# Carica il file CSV
print("Caricamento del file CSV...")
df = pd.read_csv(os.path.join("datasets", "df_with_sentiment.csv"))

# Applica l'aggiustamento del sentiment
print("Aggiustamento del sentiment in base al sarcasmo...")
tqdm.pandas()
df['adjusted_sentiment'] = df.progress_apply(adjust_sentiment, axis=1)

# Statistiche finali
print("Distribuzione del sentiment originale:")
print(df['sentiment'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

print("\nDistribuzione del sentiment aggiustato:")
print(df['adjusted_sentiment'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

# Salva il risultato
output_file = 'df_with_adjusted_sentiment.csv'
df.to_csv(output_file, index=False)
print(f"\nDataset con sentiment aggiustato salvato in {output_file}")

Caricamento del file CSV...
Aggiustamento del sentiment in base al sarcasmo...


100%|██████████| 200000/200000 [00:01<00:00, 151171.72it/s]


Distribuzione del sentiment originale:
sentiment
Neutral     51.61%
Negative    34.27%
Positive    14.12%
Name: proportion, dtype: object

Distribuzione del sentiment aggiustato:
adjusted_sentiment
Neutral     51.61%
Negative    34.27%
Positive    14.12%
Name: proportion, dtype: object

Dataset con sentiment aggiustato salvato in df_with_adjusted_sentiment.csv


In [7]:
# Ensure the 'graphs' folder exists
os.makedirs('graphs', exist_ok=True)

# Load the dataset
df = pd.read_csv(os.path.join("datasets", "df_with_adjusted_sentiment.csv"))

# Function to create and save word clouds
def create_wordcloud(text, title, filename):
    wordcloud = WordCloud(width=1600, height=800, 
                          background_color='white', 
                          colormap='viridis', 
                          contour_width=3, 
                          contour_color='steelblue').generate(text)
    
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=24, pad=20)
    plt.tight_layout(pad=0)
    plt.savefig(os.path.join('graphs', filename), dpi=300, bbox_inches='tight')
    plt.close()

# Word clouds for sarcasm
create_wordcloud(' '.join(df[df['label'] == 1]['comment']), 'Common Words in Sarcastic Comments', 'sarcastic_wordcloud.png')
create_wordcloud(' '.join(df[df['label'] == 0]['comment']), 'Common Words in Non-Sarcastic Comments', 'non_sarcastic_wordcloud.png')

# Word clouds for sentiment
for sentiment in ['Positive', 'Neutral', 'Negative']:
    create_wordcloud(' '.join(df[df['sentiment'] == sentiment]['comment']), 
                     f'Common Words in {sentiment} Comments', 
                     f'{sentiment.lower()}_wordcloud.png')

# Correlation between sarcasm and sentiment
correlation = df['label'].corr(pd.get_dummies(df['sentiment'])['Positive'])
print(f"Correlation between sarcasm and positive sentiment: {correlation:.4f}")

# Visualize sentiment distribution for sarcastic and non-sarcastic comments
plt.figure(figsize=(12, 8))
sns.set_style("whitegrid")
sns.set_palette("deep")

ax = sns.countplot(data=df, x='sentiment', hue='label', 
                   hue_order=[0, 1],
                   order=['Positive', 'Neutral', 'Negative'])

plt.title('Sentiment Distribution for Sarcastic and Non-Sarcastic Comments', fontsize=16, pad=20)
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Sarcasm', labels=['Non-Sarcastic', 'Sarcastic'], title_fontsize='13', fontsize='12')

# Add value labels on top of each bar
for container in ax.containers:
    ax.bar_label(container, fontsize=10, padding=3)

plt.tight_layout()
plt.savefig(os.path.join('graphs', 'sentiment_distribution.png'), dpi=300, bbox_inches='tight')
plt.close()

print("All visualizations have been created and saved in the 'graphs' folder.")

Correlation between sarcasm and positive sentiment: -0.0158
All visualizations have been created and saved in the 'graphs' folder.
