In [45]:
import torch
import tensorflow as tf

# Record the framework versions this notebook was executed with (one line each).
print(
    f"PyTorch version: {torch.__version__}",
    f"TensorFlow version: {tf.__version__}",
    sep="\n",
)

PyTorch version: 2.5.1
TensorFlow version: 2.18.0


In [None]:
import os
import pandas as pd

# Location of the raw reviews dataset, relative to the notebook's working directory.
parquet_path = 'models_comparison/reviews.parquet'

# Failures are raised as exceptions instead of calling exit(): inside a Jupyter
# kernel exit() shuts the kernel down (losing all state) rather than simply
# stopping the cell with a readable error.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at path: {parquet_path}")

try:
    df = pd.read_parquet(parquet_path)
except Exception as e:
    # Keep the original message but preserve the underlying cause in the traceback.
    raise RuntimeError(f"Error loading Parquet file: {e}") from e
print("Parquet file successfully loaded.")

# Every later step reads the review text from the 'content' column.
print("Available columns:", df.columns.tolist())
if 'content' not in df.columns:
    raise ValueError("The 'content' column was not found in the Parquet file.")

Plik Parquet został pomyślnie wczytany.
Dostępne kolumny: ['review_id', 'at', 'content', 'score', 'app_name']


In [47]:
# Sanity check: count how many 'content' values there are of each Python type.
print(df['content'].map(type).value_counts())

content
<class 'str'>    698688
Name: count, dtype: int64


## Computing sentiment with 4 models

In [48]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
from tqdm import tqdm

# Quick probe: report when PyTorch's Apple-Silicon (MPS) backend can be used.
mps_enabled = torch.backends.mps.is_available()
if mps_enabled:
    print("MPS is enabled")


MPS is enabled


### Returning all values

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import os

# Pick the compute device for the transformer models.
# MPS keeps priority (unchanged behaviour on Apple Silicon); CUDA is probed as
# well so that "No GPU device found" is only reported when neither accelerator
# is actually available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS (Metal Performance Shaders) device is available. Using GPU acceleration.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA device is available. Using GPU acceleration.")
else:
    device = torch.device("cpu")
    print("No GPU device found. Using CPU.")

# Location of the raw reviews dataset, relative to the notebook's working directory.
parquet_path = 'models_comparison/reviews.parquet'

# Failures are raised as exceptions instead of calling exit(): inside a Jupyter
# kernel exit() shuts the kernel down (losing all state) rather than simply
# stopping the cell with a readable error.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at path: {parquet_path}")

try:
    df = pd.read_parquet(parquet_path)
except Exception as e:
    # Keep the original message but preserve the underlying cause in the traceback.
    raise RuntimeError(f"Error loading Parquet file: {e}") from e
print("Parquet file successfully loaded.")

# The sentiment analyzers below all read the review text from 'content'.
print("Available columns:", df.columns.tolist())
if 'content' not in df.columns:
    raise ValueError("The 'content' column was not found in the Parquet file.")

# Rule-based VADER analyzer; used as a module-level object by analyze_vader().
analyzer = SentimentIntensityAnalyzer()

# Each transformer checkpoint is loaded once, moved to the selected device and
# switched to evaluation mode for inference.
print("Loading DistilBERT model...")
distilbert_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
distilbert_model = (
    DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint)
    .to(device)
    .eval()
)

print("Loading Cardiff NLP RoBERTa model...")
roberta_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)
roberta_model = (
    AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint)
    .to(device)
    .eval()
)

# Registers Series.progress_apply so the row-wise analyzers show a progress bar.
tqdm.pandas()

def analyze_textblob(text):
    """Score one text with TextBlob's default sentiment analyzer.

    Args:
        text: The value to analyze; it is converted with ``str()`` first.

    Returns:
        dict with keys ``textblob_pattern_polarity`` and
        ``textblob_pattern_subjectivity``. Both values are ``None`` when the
        analysis raised; the error is printed rather than propagated.
    """
    polarity = None
    subjectivity = None
    try:
        # .sentiment exposes the polarity and subjectivity attributes read below.
        result = TextBlob(str(text)).sentiment
        polarity, subjectivity = result.polarity, result.subjectivity
    except Exception as e:
        print(f"TextBlob error for text: {text}\nError: {e}")
    return {
        'textblob_pattern_polarity': polarity,
        'textblob_pattern_subjectivity': subjectivity
    }

def analyze_vader(text):
    """Score one text with the module-level VADER ``analyzer``.

    Args:
        text: The value to analyze; it is converted with ``str()`` first.

    Returns:
        dict with keys ``vader_neg``, ``vader_neu``, ``vader_pos`` and
        ``vader_compound`` copied from ``analyzer.polarity_scores``. Every
        value is ``None`` when the analysis raised; the error is printed
        rather than propagated.
    """
    # (output column, key in VADER's score dict) pairs, in output order.
    fields = (
        ('vader_neg', 'neg'),
        ('vader_neu', 'neu'),
        ('vader_pos', 'pos'),
        ('vader_compound', 'compound'),
    )
    try:
        raw_scores = analyzer.polarity_scores(str(text))
        return {column: raw_scores[key] for column, key in fields}
    except Exception as e:
        print(f"VADER error for text: {text}\nError: {e}")
        return {column: None for column, _ in fields}

def analyze_distilbert_batch(texts, batch_size=32):
    """Score texts with the DistilBERT SST-2 classifier, one batch at a time.

    Uses the module-level ``distilbert_tokenizer``, ``distilbert_model`` and
    ``device``.

    Args:
        texts: Sequence of texts. Each item is converted with ``str()`` (as the
            TextBlob and VADER helpers do) so a single non-string value does
            not make its whole batch fail.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        list with exactly one dict per input text, in input order, with keys
        ``distilbert_negative_prob`` and ``distilbert_positive_prob``. Both
        values are ``None`` for every text of a batch that raised; the error
        is printed rather than propagated.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative step would
            otherwise make ``range`` empty and silently return no results.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    sentiments = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Analyzing DistilBERT in Batches"):
        batch = [str(text) for text in texts[start:start + batch_size]]
        try:
            inputs = distilbert_tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                logits = distilbert_model(**inputs).logits
                probs = torch.nn.functional.softmax(logits, dim=-1).tolist()

            # Assumes the checkpoint's class order is [negative, positive] —
            # TODO confirm against distilbert_model.config.id2label.
            batch_results = [
                {
                    'distilbert_negative_prob': negative_prob,
                    'distilbert_positive_prob': positive_prob
                }
                for negative_prob, positive_prob in probs
            ]
        except Exception as e:
            # Report the real extent of the batch (the last one may be shorter).
            print(f"DistilBERT error for batch {start}-{start + len(batch)}: {e}")
            batch_results = [
                {
                    'distilbert_negative_prob': None,
                    'distilbert_positive_prob': None
                }
                for _ in batch
            ]
        # Extend only after the batch is fully resolved so a failure can never
        # leave partial rows followed by a full set of None rows.
        sentiments.extend(batch_results)
    return sentiments

def analyze_roberta_batch(texts, batch_size=32):
    """Score texts with the Cardiff NLP RoBERTa classifier, one batch at a time.

    Uses the module-level ``roberta_tokenizer``, ``roberta_model`` and
    ``device``.

    Args:
        texts: Sequence of texts. Each item is converted with ``str()`` (as the
            TextBlob and VADER helpers do) so a single non-string value does
            not make its whole batch fail.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        list with exactly one dict per input text, in input order, with keys
        ``roberta_negative_prob``, ``roberta_neutral_prob``,
        ``roberta_positive_prob`` and ``roberta_sentiment_label`` (the label
        of the highest probability; the earlier label wins a tie). All values
        are ``None`` for every text of a batch that raised; the error is
        printed rather than propagated.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative step would
            otherwise make ``range`` empty and silently return no results.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Assumes the checkpoint's class order is [negative, neutral, positive] —
    # TODO confirm against roberta_model.config.id2label.
    labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")

    sentiments = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Analyzing RoBERTa in Batches"):
        batch = [str(text) for text in texts[start:start + batch_size]]
        try:
            inputs = roberta_tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                logits = roberta_model(**inputs).logits
                probs = torch.nn.functional.softmax(logits, dim=-1).tolist()

            batch_results = []
            for negative_prob, neutral_prob, positive_prob in probs:
                class_probs = (negative_prob, neutral_prob, positive_prob)
                # max() keeps the first maximal pair, so ties resolve in label order.
                sentiment_label = max(zip(labels, class_probs), key=lambda pair: pair[1])[0]
                batch_results.append({
                    'roberta_negative_prob': negative_prob,
                    'roberta_neutral_prob': neutral_prob,
                    'roberta_positive_prob': positive_prob,
                    'roberta_sentiment_label': sentiment_label
                })
        except Exception as e:
            # Report the real extent of the batch (the last one may be shorter).
            print(f"RoBERTa error for batch {start}-{start + len(batch)}: {e}")
            batch_results = [
                {
                    'roberta_negative_prob': None,
                    'roberta_neutral_prob': None,
                    'roberta_positive_prob': None,
                    'roberta_sentiment_label': None
                }
                for _ in batch
            ]
        # Extend only after the batch is fully resolved so a failure can never
        # leave partial rows followed by a full set of None rows.
        sentiments.extend(batch_results)
    return sentiments


def _append_columns(frame, new_columns):
    """Return ``frame`` with ``new_columns`` attached side by side.

    Both inputs get a fresh positional index first so rows pair up by position.
    """
    return pd.concat(
        [frame.reset_index(drop=True), new_columns.reset_index(drop=True)],
        axis=1,
    )


# Lexicon-based analyzers run row by row with a progress bar.
print("Analyzing sentiment using TextBlob...")
textblob_results = df['content'].progress_apply(analyze_textblob).tolist()
textblob_df = pd.DataFrame(textblob_results)
df = _append_columns(df, textblob_df)

print("Analyzing sentiment using VADER...")
vader_results = df['content'].progress_apply(analyze_vader).tolist()
vader_df = pd.DataFrame(vader_results)
df = _append_columns(df, vader_df)

# Transformer models receive the whole column as a list and batch internally.
print("Analyzing sentiment using DistilBERT...")
distilbert_results = analyze_distilbert_batch(df['content'].tolist())
distilbert_df = pd.DataFrame(distilbert_results)
df = _append_columns(df, distilbert_df)

print("Analyzing sentiment using Cardiff NLP RoBERTa...")
roberta_results = analyze_roberta_batch(df['content'].tolist())
roberta_df = pd.DataFrame(roberta_results)
df = _append_columns(df, roberta_df)


# Preview the first five rows of the enriched frame.
print(df.head(5))

# Persist the reviews together with every sentiment column. A failed write is
# only reported, so the computed frame stays available in memory.
output_path = 'models_comparison/reviews_with_sentiments.parquet'
try:
    df.to_parquet(output_path, index=False)
    print(f"Results have been saved to: {output_path}")
except Exception as save_error:
    print(f"Error saving Parquet file:{save_error}")

MPS (Metal Performance Shaders) device is available. Using GPU acceleration.
Parquet file successfully loaded.
Available columns: ['review_id', 'at', 'content', 'score', 'app_name']
Loading DistilBERT model...
Loading Cardiff NLP RoBERTa model...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Analyzing sentiment using TextBlob...


100%|██████████| 698688/698688 [00:44<00:00, 15646.27it/s]


Analyzing sentiment using VADER...


100%|██████████| 698688/698688 [00:16<00:00, 42523.40it/s]


Analyzing sentiment using DistilBERT...


Analyzing DistilBERT in Batches: 100%|██████████| 21834/21834 [30:15<00:00, 12.03it/s]  


Analyzing sentiment using Cardiff NLP RoBERTa...


Analyzing RoBERTa in Batches: 100%|██████████| 21834/21834 [1:05:41<00:00,  5.54it/s]


                              review_id          at  \
0  4423e3f0-6002-469c-bf91-239a1ba1d998  2024-11-23   
1  16d24c6a-d3ea-4558-9a6b-2694beb581ec  2024-11-23   
2  14098fe6-6649-4fb4-aeb1-f1b36fdcca6f  2024-11-23   
3  6b215132-0eaf-40f7-a7f4-758c8fbf35b1  2024-11-23   
4  b57b2878-92ba-4468-a3dc-722f2c61b51b  2024-11-23   

                                             content  score app_name  \
0                                      No comment 😭😔      5  Netflix   
1  lately the sounds go thru but no picture...its...      2  Netflix   
2                                 This is very good👍      5  Netflix   
3  Why auto payment with bank is activated after ...      1  Netflix   
4  Keeps updating every 2 days and suddenly canno...      1  Netflix   

   textblob_pattern_polarity  textblob_pattern_subjectivity  vader_neg  \
0                       0.00                           0.00      0.500   
1                      -0.55                           0.75      0.205   
2             