In [45]:
import torch
import tensorflow as tf

# Record the framework versions this notebook was executed with.
print(
    f"PyTorch version: {torch.__version__}",
    f"TensorFlow version: {tf.__version__}",
    sep="\n",
)

PyTorch version: 2.5.1
TensorFlow version: 2.18.0


In [46]:
import os
import pandas as pd

# Path to the Parquet file
parquet_path = 'models_comparison/reviews.parquet'

# Fail fast when the input file is missing. An exception is raised instead of
# calling exit(): inside a notebook exit() asks the kernel to shut down (losing
# all state), while an exception only stops this cell / the current "Run All".
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Plik Parquet nie został znaleziony pod ścieżką: {parquet_path}")

# Load the data from the Parquet file; the original error is chained so the
# full traceback stays visible.
try:
    df = pd.read_parquet(parquet_path)
except Exception as e:
    raise RuntimeError(f"Wystąpił błąd podczas wczytywania pliku Parquet: {e}") from e
print("Plik Parquet został pomyślnie wczytany.")

# Make sure the 'content' column exists before anything tries to read it
print("Dostępne kolumny:", df.columns.tolist())
if 'content' not in df.columns:
    raise ValueError("Kolumna 'content' nie została znaleziona w pliku Parquet.")

Plik Parquet został pomyślnie wczytany.
Dostępne kolumny: ['review_id', 'at', 'content', 'score', 'app_name']


In [47]:
# Tally the Python type of every value in 'content' (one row per distinct type).
content_types = df['content'].apply(type)
print(content_types.value_counts())

content
<class 'str'>    698688
Name: count, dtype: int64


## Obliczenie sentymentu dla 4 modeli

In [48]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
from tqdm import tqdm

# Report whether PyTorch's MPS backend is usable (nothing is printed otherwise)
mps_available = torch.backends.mps.is_available()
if mps_available:
    print("MPS is enabled")


MPS is enabled


### Zwracanie wszystkich wartości

In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import os

# ----------------------------
# 1. Set Up Device for MacBook M2
# ----------------------------

# Prefer the MPS (Metal Performance Shaders) backend and fall back to the CPU.
_use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if _use_mps else "cpu")
print(
    "MPS (Metal Performance Shaders) device is available. Using GPU acceleration."
    if _use_mps
    else "No GPU device found. Using CPU."
)

# ----------------------------
# 2. Load and Validate Data
# ----------------------------

# Path to the Parquet file
parquet_path = 'models_comparison/reviews.parquet'

# Fail fast when the input file is missing. An exception is raised instead of
# calling exit(): inside a notebook exit() asks the kernel to shut down (losing
# all state), while an exception only stops this cell / the current "Run All".
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at path: {parquet_path}")

# Load data from the Parquet file; the original error is chained so the full
# traceback stays visible.
try:
    df = pd.read_parquet(parquet_path)
except Exception as e:
    raise RuntimeError(f"Error loading Parquet file: {e}") from e
print("Parquet file successfully loaded.")

# Every scorer below reads df['content'], so its absence is a hard error
print("Available columns:", df.columns.tolist())
if 'content' not in df.columns:
    raise ValueError("The 'content' column was not found in the Parquet file.")

# ----------------------------
# 3. Initialize Sentiment Analyzers
# ----------------------------

# Hugging Face checkpoint identifiers for the two transformer classifiers
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# VADER analyzer shared by analyze_vader()
analyzer = SentimentIntensityAnalyzer()

# DistilBERT: tokenizer + classifier, moved to `device` and put in eval mode
print("Loading DistilBERT model...")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)
distilbert_model.to(device).eval()

# Cardiff NLP RoBERTa: same setup through the Auto* classes
print("Loading Cardiff NLP RoBERTa model...")
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model.to(device).eval()

# Register tqdm with pandas so Series.progress_apply is available below
tqdm.pandas()

# ----------------------------
# 4. Define Sentiment Analysis Functions
# ----------------------------

def analyze_textblob(text):
    """Score one review with TextBlob and return polarity and subjectivity.

    Args:
        text: Review text; it is coerced with ``str()`` before scoring.

    Returns:
        dict: ``textblob_pattern_polarity`` and ``textblob_pattern_subjectivity``.
        Both values are ``None`` when TextBlob raises; the error is printed
        rather than propagated so a single bad row does not abort the run.
    """
    polarity = subjectivity = None
    try:
        assessment = TextBlob(str(text)).sentiment  # Sentiment(polarity, subjectivity)
        polarity, subjectivity = assessment.polarity, assessment.subjectivity
    except Exception as e:
        print(f"TextBlob error for text: {text}\nError: {e}")
    return {
        'textblob_pattern_polarity': polarity,
        'textblob_pattern_subjectivity': subjectivity
    }

def analyze_vader(text):
    """Score one review with the module-level VADER ``analyzer``.

    Args:
        text: Review text; it is coerced with ``str()`` before scoring.

    Returns:
        dict: ``vader_neg``, ``vader_neu``, ``vader_pos`` and ``vader_compound``,
        copied from the analyzer's ``polarity_scores`` result. All four are
        ``None`` when scoring raises; the error is printed, not propagated.
    """
    # (output column, key in the polarity_scores result)
    key_map = (
        ('vader_neg', 'neg'),
        ('vader_neu', 'neu'),
        ('vader_pos', 'pos'),
        ('vader_compound', 'compound'),
    )
    try:
        scores = analyzer.polarity_scores(str(text))
        return {column: scores[name] for column, name in key_map}
    except Exception as e:
        print(f"VADER error for text: {text}\nError: {e}")
        return {column: None for column, _ in key_map}

def analyze_distilbert_batch(texts, batch_size=32):
    """Score ``texts`` with the module-level DistilBERT model, batch by batch.

    Args:
        texts: Sequence of review strings (sliced with ``texts[i:i+batch_size]``).
        batch_size: Number of texts tokenized and classified per forward pass.

    Returns:
        list[dict]: One dict per input text with ``distilbert_negative_prob``,
        ``distilbert_positive_prob``, ``distilbert_sentiment_score``
        (positive minus negative) and ``distilbert_sentiment_label``.
        When a batch raises, the error is printed and every text in that batch
        gets a dict whose four values are ``None``.

    Note:
        Assumes logit index 0 is the negative class and index 1 the positive
        class — TODO confirm against ``distilbert_model.config.id2label``.
    """
    failed_row = {
        'distilbert_negative_prob': None,
        'distilbert_positive_prob': None,
        'distilbert_sentiment_score': None,
        'distilbert_sentiment_label': None
    }
    sentiments = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Analyzing DistilBERT in Batches"):
        batch = texts[start:start+batch_size]
        try:
            encoded = distilbert_tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            with torch.no_grad():
                logits = distilbert_model(**encoded).logits

            for negative_prob, positive_prob in torch.softmax(logits, dim=-1).tolist():
                # Exact ties are reported as NEUTRAL
                sentiment_label = (
                    "POSITIVE" if positive_prob > negative_prob
                    else "NEGATIVE" if positive_prob < negative_prob
                    else "NEUTRAL"
                )
                sentiments.append({
                    'distilbert_negative_prob': negative_prob,
                    'distilbert_positive_prob': positive_prob,
                    'distilbert_sentiment_score': positive_prob - negative_prob,
                    'distilbert_sentiment_label': sentiment_label
                })
        except Exception as e:
            print(f"DistilBERT error for batch {start}-{start+batch_size}: {e}")
            # Keep the output aligned with the input: one placeholder per text
            sentiments.extend(dict(failed_row) for _ in batch)
    return sentiments

def analyze_roberta_batch(texts, batch_size=32):
    """Score ``texts`` with the module-level Cardiff NLP RoBERTa model in batches.

    Args:
        texts: Sequence of review strings (sliced with ``texts[i:i+batch_size]``).
        batch_size: Number of texts tokenized and classified per forward pass.

    Returns:
        list[dict]: One dict per input text with ``roberta_negative_prob``,
        ``roberta_neutral_prob``, ``roberta_positive_prob`` and
        ``roberta_sentiment_label`` (the class with the highest probability;
        on an exact tie the earlier of NEGATIVE, NEUTRAL, POSITIVE wins).
        When a batch raises, the error is printed and every text in that batch
        gets a dict whose four values are ``None``.

    Note:
        Assumes the logits are ordered (negative, neutral, positive) —
        TODO confirm against ``roberta_model.config.id2label``.
    """
    class_labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")
    failed_row = {
        'roberta_negative_prob': None,
        'roberta_neutral_prob': None,
        'roberta_positive_prob': None,
        'roberta_sentiment_label': None
    }
    sentiments = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Analyzing RoBERTa in Batches"):
        batch = texts[start:start+batch_size]
        try:
            encoded = roberta_tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            with torch.no_grad():
                logits = roberta_model(**encoded).logits

            for negative_prob, neutral_prob, positive_prob in torch.softmax(logits, dim=-1).tolist():
                class_probs = (negative_prob, neutral_prob, positive_prob)
                # max() keeps the first maximum, so ties resolve in class_labels order
                best_index = max(range(len(class_labels)), key=class_probs.__getitem__)
                sentiments.append({
                    'roberta_negative_prob': negative_prob,
                    'roberta_neutral_prob': neutral_prob,
                    'roberta_positive_prob': positive_prob,
                    'roberta_sentiment_label': class_labels[best_index]
                })
        except Exception as e:
            print(f"RoBERTa error for batch {start}-{start+batch_size}: {e}")
            # Keep the output aligned with the input: one placeholder per text
            sentiments.extend(dict(failed_row) for _ in batch)
    return sentiments

# ----------------------------
# 5. Apply Sentiment Analyses
# ----------------------------

# Every scorer yields one dict per review. Each result list becomes its own
# frame, and all four frames are attached to the reviews in one concat below.

# TextBlob: row-wise with a progress bar
print("Analyzing sentiment using TextBlob...")
textblob_results = df['content'].progress_apply(analyze_textblob).tolist()
textblob_df = pd.DataFrame(textblob_results)

# VADER: row-wise with a progress bar
print("Analyzing sentiment using VADER...")
vader_results = df['content'].progress_apply(analyze_vader).tolist()
vader_df = pd.DataFrame(vader_results)

# DistilBERT: batched over the whole column
print("Analyzing sentiment using DistilBERT...")
distilbert_results = analyze_distilbert_batch(df['content'].tolist())
distilbert_df = pd.DataFrame(distilbert_results)

# Cardiff NLP RoBERTa: batched over the whole column
print("Analyzing sentiment using Cardiff NLP RoBERTa...")
roberta_results = analyze_roberta_batch(df['content'].tolist())
roberta_df = pd.DataFrame(roberta_results)

# The result frames carry a fresh 0..n-1 index, so only the reviews frame needs
# its index reset for the column-wise concat to line rows up by position.
df = pd.concat(
    [df.reset_index(drop=True), textblob_df, vader_df, distilbert_df, roberta_df],
    axis=1,
)

# ----------------------------
# 6. View and Save Results
# ----------------------------

# Preview the first rows, now including the sentiment columns
print(df.head())

# Persist the enriched reviews; a failed write is reported, not raised.
# NOTE(review): the "single value" cell further down writes a different set of
# columns to this same path, so whichever cell runs last wins.
output_path = 'models_comparison/reviews_with_sentiments.parquet'
try:
    df.to_parquet(output_path, index=False)
except Exception as e:
    print(f"Error saving Parquet file:{e}")
else:
    print(f"Results have been saved to: {output_path}")

MPS (Metal Performance Shaders) device is available. Using GPU acceleration.
Parquet file successfully loaded.
Available columns: ['review_id', 'at', 'content', 'score', 'app_name']
Loading DistilBERT model...
Loading Cardiff NLP RoBERTa model...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Analyzing sentiment using TextBlob...


100%|██████████| 698688/698688 [00:44<00:00, 15646.27it/s]


Analyzing sentiment using VADER...


100%|██████████| 698688/698688 [00:16<00:00, 42523.40it/s]


Analyzing sentiment using DistilBERT...


Analyzing DistilBERT in Batches: 100%|██████████| 21834/21834 [30:15<00:00, 12.03it/s]  


Analyzing sentiment using Cardiff NLP RoBERTa...


Analyzing RoBERTa in Batches: 100%|██████████| 21834/21834 [1:05:41<00:00,  5.54it/s]


                              review_id          at  \
0  4423e3f0-6002-469c-bf91-239a1ba1d998  2024-11-23   
1  16d24c6a-d3ea-4558-9a6b-2694beb581ec  2024-11-23   
2  14098fe6-6649-4fb4-aeb1-f1b36fdcca6f  2024-11-23   
3  6b215132-0eaf-40f7-a7f4-758c8fbf35b1  2024-11-23   
4  b57b2878-92ba-4468-a3dc-722f2c61b51b  2024-11-23   

                                             content  score app_name  \
0                                      No comment 😭😔      5  Netflix   
1  lately the sounds go thru but no picture...its...      2  Netflix   
2                                 This is very good👍      5  Netflix   
3  Why auto payment with bank is activated after ...      1  Netflix   
4  Keeps updating every 2 days and suddenly canno...      1  Netflix   

   textblob_pattern_polarity  textblob_pattern_subjectivity  vader_neg  \
0                       0.00                           0.00      0.500   
1                      -0.55                           0.75      0.205   
2             

### Zwracanie pojedynczych wartości sentymentu

In [41]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
from tqdm import tqdm

# ----------------------------
# 1. Set Up Device for MacBook M2
# ----------------------------

# Choose the backend once, then announce the choice: MPS (Metal Performance
# Shaders) when PyTorch reports it as available, otherwise the CPU.
_device_messages = {
    "mps": "MPS (Metal Performance Shaders) device is available. Using GPU acceleration.",
    "cpu": "No GPU device found. Using CPU.",
}
_device_name = "mps" if torch.backends.mps.is_available() else "cpu"
device = torch.device(_device_name)
print(_device_messages[_device_name])

# ----------------------------
# 2. Load and Validate Data
# ----------------------------

# Path to the Parquet file
parquet_path = 'models_comparison/reviews.parquet'

# Fail fast when the input file is missing. An exception is raised instead of
# calling exit(): inside a notebook exit() asks the kernel to shut down (losing
# all state), while an exception only stops this cell / the current "Run All".
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at path: {parquet_path}")

# Load data from the Parquet file; the original error is chained so the full
# traceback stays visible.
try:
    df = pd.read_parquet(parquet_path)
except Exception as e:
    raise RuntimeError(f"Error loading Parquet file: {e}") from e
print("Parquet file successfully loaded.")

# Every scorer below reads df['content'], so its absence is a hard error
print("Available columns:", df.columns.tolist())
if 'content' not in df.columns:
    raise ValueError("The 'content' column was not found in the Parquet file.")

# ----------------------------
# 3. Initialize Sentiment Analyzers
# ----------------------------

# Hugging Face checkpoint shared by the tokenizer and the classifier
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"

# VADER analyzer shared by analyze_vader()
analyzer = SentimentIntensityAnalyzer()

# DistilBERT tokenizer + classifier
print("Loading DistilBERT model...")
tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)
# Move the model to the selected device (MPS/CPU) and switch to evaluation mode
model.to(device).eval()

# ----------------------------
# 4. Define Sentiment Analysis Functions
# ----------------------------

def analyze_textblob(text):
    """Return TextBlob's polarity for one review.

    Args:
        text: Review text; it is coerced with ``str()`` before scoring.

    Returns:
        The ``sentiment.polarity`` value reported by TextBlob, or ``None`` when
        TextBlob raises (the error is printed rather than propagated).
    """
    polarity = None
    try:
        polarity = TextBlob(str(text)).sentiment.polarity
    except Exception as e:
        print(f"TextBlob error for text: {text}\nError: {e}")
    return polarity

def analyze_vader(text):
    """Return VADER's compound score for one review.

    Args:
        text: Review text; it is coerced with ``str()`` before scoring.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(...)``, or
        ``None`` when scoring raises (the error is printed, not propagated).
    """
    compound = None
    try:
        compound = analyzer.polarity_scores(str(text))['compound']
    except Exception as e:
        print(f"VADER error for text: {text}\nError: {e}")
    return compound

def analyze_distilbert(text):
    """Score a single text with the module-level DistilBERT ``model``.

    The score is the positive-class probability minus the negative-class
    probability, so it lies in [-1, 1].

    Args:
        text: Text passed to ``tokenizer`` (truncated to 512 tokens).

    Returns:
        float | None: The sentiment score, or ``None`` when tokenization or
        inference raises (the error is printed rather than propagated).

    Note:
        Assumes logit index 0 is the negative class and index 1 the positive
        class — TODO confirm against ``model.config.id2label``.
    """
    try:
        # Tokenize the input text and move tensors to the selected device
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Softmax over the class dimension turns the logits into probabilities
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        negative_prob, positive_prob = probabilities[0].tolist()

        # Only the score is returned; the previously computed label was never
        # used and has been removed.
        return positive_prob - negative_prob
    except Exception as e:
        print(f"DistilBERT error for text: {text}\nError: {e}")
        return None

def analyze_distilbert_batch(texts, batch_size=32):
    """Score ``texts`` with the module-level DistilBERT ``model``, batch by batch.

    Each score is the positive-class probability minus the negative-class
    probability, so it lies in [-1, 1].

    Args:
        texts: Sequence of strings (sliced with ``texts[i:i+batch_size]``).
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        list[float | None]: Exactly one entry per input text, in input order.
        When a batch raises, the error is printed and every text in that batch
        is scored ``None``.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative value used to
            yield an empty result silently, which no longer matches ``texts``.

    Note:
        Assumes logit index 0 is the negative class and index 1 the positive
        class — TODO confirm against ``model.config.id2label``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    sentiments = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Analyzing DistilBERT in Batches"):
        batch = texts[i:i+batch_size]
        try:
            # Tokenize the batch and move tensors to the selected device
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Softmax over the class dimension turns the logits into probabilities
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Build the batch result completely before touching `sentiments`, so
            # a failure can never leave a partially appended batch behind.
            batch_scores = [
                positive_prob - negative_prob
                for negative_prob, positive_prob in probabilities.tolist()
            ]
        except Exception as e:
            print(f"DistilBERT error for batch {i}-{i+batch_size}: {e}")
            batch_scores = [None] * len(batch)
        sentiments.extend(batch_scores)
    return sentiments

# ----------------------------
# 5. Apply Sentiment Analyses
# ----------------------------

# Row-wise scorers: (progress message, target column, scoring function)
row_scorers = (
    ("Analyzing sentiment using TextBlob...", 'sentiment_textblob', analyze_textblob),
    ("Analyzing sentiment using VADER...", 'sentiment_vader', analyze_vader),
)
for message, column, scorer in row_scorers:
    print(message)
    df[column] = df['content'].apply(scorer)

# DistilBERT scores the whole column in batches rather than row by row
print("Analyzing sentiment using DistilBERT...")
review_texts = df['content'].tolist()
df['sentiment_distilbert'] = analyze_distilbert_batch(review_texts)

# ----------------------------
# 6. View and Save Results
# ----------------------------

# Preview the first rows, now including the three sentiment columns
print(df.head())

# Persist the enriched reviews; a failed write is reported, not raised.
# NOTE(review): the "all values" cell above writes a different set of columns
# to this same path, so whichever cell runs last wins.
output_path = 'models_comparison/reviews_with_sentiments.parquet'
try:
    df.to_parquet(output_path, index=False)
except Exception as e:
    print(f"Error saving Parquet file: {e}")
else:
    print(f"Results have been saved to: {output_path}")

MPS (Metal Performance Shaders) device is available. Using GPU acceleration.
Parquet file successfully loaded.
Available columns: ['review_id', 'at', 'content', 'score', 'app_name']
Loading DistilBERT model...
Analyzing sentiment using TextBlob...
Analyzing sentiment using VADER...
Analyzing sentiment using DistilBERT...


Analyzing DistilBERT in Batches:  37%|███▋      | 7989/21834 [10:31<18:15, 12.64it/s]  


KeyboardInterrupt: 

## Pierwsze podejście

In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the tokenizer and the classifier from the same fine-tuned checkpoint
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)

def analyze_sentiment(text):
    """
    Classify ``text`` with DistilBERT and report a signed confidence.

    Args:
        text (str): Input text to analyze.

    Returns:
        dict: ``"label"`` is "POSITIVE" when the positive probability is
        strictly larger, otherwise "NEGATIVE" (ties included). ``"score"`` is
        the winning class probability, negated for "NEGATIVE".

    Note:
        Assumes logit index 0 is the negative class and index 1 the positive
        class — TODO confirm against ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension; [0] selects the single input row
    negative_prob, positive_prob = torch.softmax(logits, dim=-1)[0].tolist()

    if positive_prob > negative_prob:
        return {
            "label": "POSITIVE",
            "score": positive_prob,
        }
    return {
        "label": "NEGATIVE",
        "score": -negative_prob,
    }

# Try the scorer on a mixed-sentiment sentence and show what it returns
text = "this app is alright but not great"
result = analyze_sentiment(text)

print(
    f"Text: {text}",
    f"Sentiment: {result['label']}",
    f"Score: {result['score']}",
    sep="\n",
)

Text: this app is alright but not great
Sentiment: NEGATIVE
Score: -0.9938241243362427


## Drugie podejście

In [6]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the tokenizer and the classifier from the same fine-tuned checkpoint
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)
model.eval()  # Set model to evaluation mode

def analyze_sentiment(text):
    """
    Score ``text`` with DistilBERT as positive minus negative probability.

    Args:
        text (str): Input text to analyze (truncated to 512 tokens).

    Returns:
        dict: ``"score"`` is the positive probability minus the negative one.
        ``"label"`` is "POSITIVE" for a score above zero, "NEGATIVE" for a
        score below zero and "NEUTRAL" otherwise.

    Note:
        Assumes logit index 0 is the negative class and index 1 the positive
        class — TODO confirm against ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension; [0] selects the single input row
    negative_prob, positive_prob = torch.softmax(logits, dim=-1)[0].tolist()
    sentiment_score = positive_prob - negative_prob

    # The sign of the score decides the label
    sentiment_label = (
        "POSITIVE" if sentiment_score > 0
        else "NEGATIVE" if sentiment_score < 0
        else "NEUTRAL"
    )

    return {
        "label": sentiment_label,
        "score": sentiment_score,
    }

# Example usage: score a mixed-sentiment sentence and print each field on its own line
text = "this app is alright but not great"
result = analyze_sentiment(text)

report_lines = (
    f"Text: {text}",
    f"Sentiment: {result['label']}",
    f"Score: {result['score']}",
)
print(*report_lines, sep="\n")

Text: this app is alright but not great
Sentiment: NEGATIVE
Score: -0.9876482994295657
