In [27]:
import kaggle
import os
from groq import Groq
import zipfile
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import traceback
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from dotenv import load_dotenv
import torch_directml
import emoji
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.util import ngrams
import nltk
import os

In [6]:
# Inizializzare DirectML
device = torch_directml.device()

# Esempio di tensore su GPU AMD tramite DirectML
x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
print(x)

# Esegui qualche operazione
y = x * 2
print(y)

tensor([1., 2., 3., 4.], device='privateuseone:0')
tensor([2., 4., 6., 8.], device='privateuseone:0')


In [7]:
def download_kaggle_dataset(dataset, path):
    kaggle.api.dataset_download_files(dataset, path=path, unzip=True)

# Specifica il dataset che vuoi scaricare
dataset = "danofer/sarcasm"  # Dataset di sarcasmo

# Specifica la directory in cui salvare il dataset
save_dir = "./datasets/"

# Crea la directory se non esiste
os.makedirs(save_dir, exist_ok=True)

# Scarica il dataset
print(f"Downloading {dataset}...")
download_kaggle_dataset(dataset, save_dir)

print("Download completato!")

# Lista i file scaricati
print("\nFile scaricati:")
for file in os.listdir(save_dir):
    print(f"- {file}")

Downloading danofer/sarcasm...
Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
Download completato!

File scaricati:
- test-balanced.csv
- test-unbalanced.csv
- train-balanced-sarc.csv.gz
- train-balanced-sarcasm.csv


In [8]:
try:
    import torch_directml
    device = torch_directml.device()
    print("Utilizzo del dispositivo DirectML")
except ImportError:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"DirectML non disponibile, utilizzo di {device}")

Utilizzo del dispositivo DirectML


In [9]:
df_file = os.path.join("datasets", "train-balanced-sarcasm.csv")

# Function to clean the text
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower().strip()

# Function to check if the text is valid
def is_valid_text(text):
    # Remove emojis
    text = emoji.replace_emoji(text, '')
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Check if the text has at least 3 words and more than 10 characters
    words = text.split()
    
    # Check if there are words repeated more than 5 times
    word_counts = Counter(words)
    if any(count > 5 for count in word_counts.values()):
        return False
    
    return len(words) >= 3 and len(text) > 10 and not text.isnumeric()

# Load the data
print("Loading data...")
df = pd.read_csv(df_file)
df = df[["label", "comment"]]

# Clean the dataset
print("Cleaning the dataset...")
df['cleaned_comment'] = df['comment'].apply(clean_text)
df['is_valid'] = df['cleaned_comment'].apply(is_valid_text)
df = df[df['is_valid']]
df = df.drop('is_valid', axis=1)
df['comment'] = df['comment'].astype(str)

# Balanced sampling
print("Performing balanced sampling...")
sample_size = 100000  # 100,000 per class, 200,000 total
df_sarcastic = df[df['label'] == 1].sample(sample_size, random_state=42)
df_non_sarcastic = df[df['label'] == 0].sample(sample_size, random_state=42)
df_balanced = pd.concat([df_sarcastic, df_non_sarcastic]).sample(frac=1, random_state=42).reset_index(drop=True)

# Save the dataset
output_file = os.path.join("datasets", "train-balanced-sarcasm-cleaned.csv")
df_balanced.to_csv(output_file, index=False)

print("Cleaned and balanced dataset saved. New DataFrame dimensions:", df_balanced.shape)
print("\nFirst 5 rows of the cleaned DataFrame:")
print(df_balanced.head())

# Verify balance
sarcastic_count = df_balanced['label'].sum()
non_sarcastic_count = len(df_balanced) - sarcastic_count
print(f"\nSarcastic comments: {sarcastic_count}")
print(f"Non-sarcastic comments: {non_sarcastic_count}")

Loading data...
Cleaning the dataset...
Performing balanced sampling...
Cleaned and balanced dataset saved. New DataFrame dimensions: (200000, 3)

First 5 rows of the cleaned DataFrame:
   label                                            comment  \
0      0  I've seen people drop gold bars, it's definite...   
1      1                             smart, powerful logic.   
2      0                I'm sure OP appreciates your input.   
3      1  Because they are soldiers of fascist junta who...   
4      1  It's a conspiracy, the CIA pre-popped Hillary'...   

                                     cleaned_comment  
0  ive seen people drop gold bars its definitely ...  
1                               smart powerful logic  
2                  im sure op appreciates your input  
3  because they are soldiers of fascist junta who...  
4  its a conspiracy the cia prepopped hillarys pi...  

Sarcastic comments: 100000
Non-sarcastic comments: 100000


In [13]:
# Carica i modelli pre-addestrati e i tokenizer
print("Caricamento dei modelli e dei tokenizer...")
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Crea i pipeline per l'analisi del sentiment
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

Caricamento dei modelli e dei tokenizer...


In [15]:
# Function to truncate text to the maximum length accepted by the model
def truncate_text(text, max_length=512):
    words = text.split()
    if len(words) > max_length:
        return " ".join(words[:max_length])
    return text

# Function to analyze sentiment considering sarcasm
def analyze_sentiment_with_sarcasm(text, is_sarcastic):
    try:
        truncated_text = truncate_text(text)
        result = sentiment_pipeline(truncated_text)[0]
        label = result['label']
        
        if is_sarcastic:
            # Invert sentiment for sarcastic comments
            if label == 'LABEL_0':
                return 'Positive'
            elif label == 'LABEL_2':
                return 'Negative'
            else:
                return 'Neutral'
        else:
            if label == 'LABEL_0':
                return 'Negative'
            elif label == 'LABEL_1':
                return 'Neutral'
            else:
                return 'Positive'
    except Exception as e:
        print(f"Error in sentiment analysis for text: {text[:50]}...")
        print(f"Error: {str(e)}")
        return "Error"

# Function to process the dataframe
def process_dataframe(df):
    tqdm.pandas()
    
    # Analyze sentiment considering sarcasm
    df['sentiment'] = df.progress_apply(lambda row: analyze_sentiment_with_sarcasm(row['cleaned_comment'], row['label'] == 1), axis=1)
    
    return df

# Process the dataframe
print("Analyzing sentiment...")
df_processed = process_dataframe(df_balanced)

# Show sentiment distribution
print("\nSentiment distribution:")
print(df_processed['sentiment'].value_counts())

# Show some examples
print("\nSome analysis examples:")
sample_size = min(10, len(df_processed))
for _, row in df_processed.sample(sample_size).iterrows():
    print(f"Original comment: {row['comment'][:100]}...")
    print(f"Cleaned comment: {row['cleaned_comment'][:100]}...")
    print(f"Sentiment: {row['sentiment']}")
    print(f"Sarcastic: {'Yes' if row['label'] == 1 else 'No'}\n")

# Save the result
output_file = os.path.join('datasets', 'df_with_sentiment_and_sarcasm.csv')
df_processed.to_csv(output_file, index=False)
print(f"\nDataset with sentiment analysis considering sarcasm saved in {output_file}")

# Final statistics
print(f"\nTotal rows: {len(df_processed)}")
print(f"Sentiment distribution:")
print(df_processed['sentiment'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

Analyzing sentiment...


100%|██████████| 200000/200000 [38:48<00:00, 85.88it/s]



Sentiment distribution:
sentiment
Neutral     100649
Positive     52906
Negative     46445
Name: count, dtype: int64

Some analysis examples:
Original comment: That idea sounds retarted....
Cleaned comment: that idea sounds retarted...
Sentiment: Positive
Sarcastic: Yes

Original comment: Sipsface looks like a wizard from a storybook....
Cleaned comment: sipsface looks like a wizard from a storybook...
Sentiment: Neutral
Sarcastic: No

Original comment: For portable APs, you make sure to throw as much shit on top of the patient's chest as possible....
Cleaned comment: for portable aps you make sure to throw as much shit on top of the patients chest as possible...
Sentiment: Positive
Sarcastic: Yes

Original comment: Are you on mac?...
Cleaned comment: are you on mac...
Sentiment: Neutral
Sarcastic: No

Original comment: Do you still get paid?...
Cleaned comment: do you still get paid...
Sentiment: Neutral
Sarcastic: Yes

Original comment: Easily Danny Rose, it seems every non-spurs fa

In [19]:
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to truncate text to the maximum length accepted by the model
def truncate_text(text, max_length=512):
    words = text.split()
    if len(words) > max_length:
        return " ".join(words[:max_length])
    return text

# Function to analyze sentiment with VADER considering sarcasm
def analyze_sentiment_vader_with_sarcasm(text, is_sarcastic):
    try:
        truncated_text = truncate_text(text)
        sentiment_scores = sia.polarity_scores(truncated_text)
        compound_score = sentiment_scores['compound']
        
        if is_sarcastic:
            # Invert sentiment for sarcastic comments
            compound_score = -compound_score
        
        if compound_score >= 0.05:
            return 'Positive'
        elif compound_score <= -0.05:
            return 'Negative'
        else:
            return 'Neutral'
    except Exception as e:
        print(f"Error in sentiment analysis for text: {text[:50]}...")
        print(f"Error: {str(e)}")
        return "Error"

# Function to process the dataframe
def process_dataframe(df):
    tqdm.pandas()
    
    # Analyze sentiment considering sarcasm
    df['sentiment_vader'] = df.progress_apply(lambda row: analyze_sentiment_vader_with_sarcasm(row['cleaned_comment'], row['label'] == 1), axis=1)
    
    return df

# Process the dataframe
print("Analyzing sentiment using VADER...")
df_processed = process_dataframe(df_balanced)

# Show sentiment distribution
print("\nSentiment distribution (VADER):")
print(df_processed['sentiment_vader'].value_counts())

# Show some examples
print("\nSome analysis examples:")
sample_size = min(10, len(df_processed))
for _, row in df_processed.sample(sample_size).iterrows():
    print(f"Original comment: {row['comment'][:100]}...")
    print(f"Cleaned comment: {row['cleaned_comment'][:100]}...")
    print(f"Sentiment: {row['sentiment_vader']}")
    print(f"Sarcastic: {'Yes' if row['label'] == 1 else 'No'}\n")

# Save the result
output_file = os.path.join('datasets', 'df_with_sentiment_vader_and_sarcasm.csv')
df_processed.to_csv(output_file, index=False)
print(f"\nDataset with VADER sentiment analysis considering sarcasm saved in {output_file}")

# Final statistics
print(f"\nTotal rows: {len(df_processed)}")
print(f"Sentiment distribution (VADER):")
print(df_processed['sentiment_vader'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

Analyzing sentiment using VADER...


100%|██████████| 200000/200000 [00:08<00:00, 23618.18it/s]



Sentiment distribution (VADER):
sentiment_vader
Neutral     70195
Negative    65833
Positive    63972
Name: count, dtype: int64

Some analysis examples:
Original comment: You didn't take very many pictures apparently....
Cleaned comment: you didnt take very many pictures apparently...
Sentiment: Neutral
Sarcastic: Yes

Original comment: My guess is the patriarchy protects them from it....
Cleaned comment: my guess is the patriarchy protects them from it...
Sentiment: Negative
Sarcastic: Yes

Original comment: What a terrible advice...
Cleaned comment: what a terrible advice...
Sentiment: Negative
Sarcastic: No

Original comment: Mother of Lions!...
Cleaned comment: mother of lions...
Sentiment: Neutral
Sarcastic: No

Original comment: Ive given up on guitar a decade ago, please teach me theory so I can feel good about myself....
Cleaned comment: ive given up on guitar a decade ago please teach me theory so i can feel good about myself...
Sentiment: Negative
Sarcastic: Yes

Original co

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import os
import re

# Create graphs directory if it doesn't exist
os.makedirs('graphs', exist_ok=True)

# Load the datasets
df1 = pd.read_csv('datasets/df_with_sentiment_and_sarcasm.csv')
df2 = pd.read_csv('datasets/df_with_sentiment_vader_and_sarcasm.csv')

# Custom sentence tokenizer
def simple_sentence_tokenize(text):
    return re.split(r'(?<=[.!?])\s+', text)

# Custom word tokenizer
def simple_word_tokenize(text):
    return re.findall(r'\w+', text.lower())

# 1. Distribution Analysis
plt.figure(figsize=(12, 6))
plt.subplot(121)
df1['sentiment'].value_counts().plot(kind='bar')
plt.title('Sentiment Distribution (RoBERTa)')
plt.subplot(122)
df2['sentiment_vader'].value_counts().plot(kind='bar')
plt.title('Sentiment Distribution (VADER)')
plt.tight_layout()
plt.savefig('graphs/sentiment_distribution_comparison.png')
plt.close()

# 2. Wordcloud
def create_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.savefig(f'graphs/{title.lower().replace(" ", "_")}.png')
    plt.close()

create_wordcloud(df1['cleaned_comment'], 'Wordcloud (All Comments)')
create_wordcloud(df1[df1['label'] == 1]['cleaned_comment'], 'Wordcloud (Sarcastic Comments)')
create_wordcloud(df1[df1['label'] == 0]['cleaned_comment'], 'Wordcloud (Non-Sarcastic Comments)')

# 3. N-grams Analysis
def get_ngrams(text, n):
    words = simple_word_tokenize(text)
    return zip(*[words[i:] for i in range(n)])

def get_top_ngrams(text, n=1, top=10):
    try:
        all_ngrams = [ngram for comment in text for ngram in get_ngrams(comment, n)]
        ngrams_freq = Counter(all_ngrams)
        return ngrams_freq.most_common(top)
    except Exception as e:
        print(f"Error in get_top_ngrams: {str(e)}")
        return []

def plot_ngrams(df, n, title, sentiment_column='sentiment'):
    plt.figure(figsize=(15, 10))
    for i, sentiment in enumerate(['Positive', 'Neutral', 'Negative']):
        top_ngrams = get_top_ngrams(df[df[sentiment_column] == sentiment]['cleaned_comment'], n)
        if top_ngrams:
            plt.subplot(3, 1, i+1)
            sns.barplot(y=[' '.join(ng[0]) for ng in top_ngrams], x=[ng[1] for ng in top_ngrams], orient='h')
            plt.title(f'Top {n}-grams ({sentiment})')
            plt.xlabel('Frequency')
            plt.ylabel('N-gram')
    plt.tight_layout()
    plt.savefig(f'graphs/{title.lower().replace(" ", "_")}.png')
    plt.close()

for n in [1, 2, 3]:
    plot_ngrams(df1, n, f'Top {n}-grams (RoBERTa)')
    plot_ngrams(df2, n, f'Top {n}-grams (VADER)', sentiment_column='sentiment_vader')

# 4. Sentiment Comparison
sentiment_comparison = pd.crosstab(df1['sentiment'], df2['sentiment_vader'])
plt.figure(figsize=(10, 8))
sns.heatmap(sentiment_comparison, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Sentiment Comparison: RoBERTa vs VADER')
plt.tight_layout()
plt.savefig('graphs/sentiment_comparison_heatmap.png')
plt.close()

# 5. Sarcasm vs Sentiment
plt.figure(figsize=(12, 5))
plt.subplot(121)
sns.countplot(data=df1, x='sentiment', hue='label')
plt.title('Sarcasm vs Sentiment (RoBERTa)')
plt.subplot(122)
sns.countplot(data=df2, x='sentiment_vader', hue='label')
plt.title('Sarcasm vs Sentiment (VADER)')
plt.tight_layout()
plt.savefig('graphs/sarcasm_vs_sentiment.png')
plt.close()

print("Analysis complete. All plots have been saved in the 'graphs' folder.")

Analysis complete. All plots have been saved in the 'graphs' folder.
