In [1]:
import pandas as pd
import openpyxl
import csv
import warnings
warnings.filterwarnings("ignore")
import spacy

In [2]:
from glob import glob

In [2]:
# Collect every exported comment workbook. glob() returns paths in an
# arbitrary, filesystem-dependent order, so sort them to make the row order of
# the combined frame reproducible between runs and machines.
all_excel_files = sorted(glob("english_comments/*.xlsx"))
if not all_excel_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError("No .xlsx files found under english_comments/")

# Each workbook's first column is read as its index (index_col=[0]);
# ignore_index=True then gives the combined frame a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_excel(excel_file, index_col=[0]) for excel_file in all_excel_files],
    ignore_index=True,
)

In [4]:
# Lowercase the comment bodies. Missing cells load as NaN (a float with no
# .lower()), so they are normalised to empty strings first instead of crashing.
df['body'] = df['body'].fillna('').astype(str).str.lower()

In [10]:
#downloaded Amazon_book_reviews from Kaggle
import zipfile
from io import BytesIO

# Read the ratings table straight out of the archive without extracting it.
REVIEWS_MEMBER = 'Books_rating.csv'

with zipfile.ZipFile("archive.zip", "r") as f:
    # Fail here, with a clear message, if the member is absent. Silently
    # skipping it would leave `amazon_reviews` undefined and surface later as
    # a NameError in an unrelated cell.
    if REVIEWS_MEMBER not in f.namelist():
        raise FileNotFoundError(f"{REVIEWS_MEMBER!r} not found in archive.zip")
    with f.open(REVIEWS_MEMBER) as zd:
        amazon_reviews = pd.read_csv(zd)

In [None]:
# Display the column labels of the loaded reviews frame; later cells read the
# 'review/text' and 'review/score' columns.
amazon_reviews.columns

In [12]:
# Draw the training sample.
# - Rows without review text or a score are dropped first: the text
#   preprocessing and the rating-to-sentiment mapping both need real values.
# - random_state pins the draw so every run trains on the same rows (the same
#   seed value is used for the train/test split further down).
# - n is capped at the number of usable rows so a small file does not raise.
amazon_reviews_valid = amazon_reviews.dropna(subset=['review/text', 'review/score'])
amazon_sample = amazon_reviews_valid.sample(
    n=min(10000, len(amazon_reviews_valid)),
    random_state=42,
)

In [None]:
#preprocessing the text
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the three NLTK resources used by the text preprocessing below.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# English stop-word lookup set and the WordNet lemmatizer shared by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatised content words.

    Steps: lowercase, drop URLs, replace every non-letter character with a
    space, split on whitespace, discard English stop words, lemmatise.

    Args:
        text: Raw document. Non-string values (e.g. NaN from a missing cell)
            are treated as empty text.

    Returns:
        str: Surviving tokens joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Inside a raw string the regex classes need a SINGLE backslash: r'\\S'
    # matches a literal backslash followed by "S", not "non-whitespace", which
    # is why URLs previously leaked through as tokens.
    text = re.sub(r'http\S+|www\.\S+', ' ', text)   # remove URLs
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)        # remove digits, punctuation, symbols
    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing blanks, so no separate whitespace pass is needed.
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean both corpora with preprocess_text, then delete any literal "\xNN"
# sequences (a backslash, 'x' and two lowercase hex digits) left in the text.
hex_escape_pattern = r'\\x[0-9a-f]{2}'

amazon_sample['review/text'] = (
    amazon_sample['review/text']
    .apply(preprocess_text)
    .str.replace(hex_escape_pattern, '', regex=True)
)
df['body_c'] = (
    df['body']
    .apply(preprocess_text)
    .str.replace(hex_escape_pattern, '', regex=True)
)

In [21]:
import torch
# Ask PyTorch to release cached, currently unused CUDA memory before the model
# work in the following cells.
torch.cuda.empty_cache()

In [None]:
#sentiment analysis

In [None]:
#import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Device configuration: prefer CUDA, then Apple-silicon MPS, then CPU.
# CUDA has to be considered because the Hugging Face Trainer moves the model to
# a CUDA device whenever one is available; a `device` that never names CUDA
# would then send inputs to a different device from the trained model.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: older torch builds have no torch.backends.mps attribute.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Map ratings to sentiment
def map_sentiment(rating):
    """Convert a numeric review rating into a three-way sentiment class id.

    Args:
        rating: Numeric star rating.

    Returns:
        int: 0 (negative) when ``rating <= 2``, 1 (neutral) when
        ``rating == 3``, otherwise 2 (positive). "Otherwise" includes any value
        that fails both comparisons, such as NaN or a rating strictly between
        2 and 3.
    """
    if rating == 3:
        return 1  # Neutral
    return 0 if rating <= 2 else 2  # Negative / Positive

# Derive the sentiment class of every sampled review from its star rating.
amazon_sample['sentiment'] = amazon_sample['review/score'].map(map_sentiment)

# Hold out 20% of the sample; stratifying on the label keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
review_texts = amazon_sample['review/text']
review_labels = amazon_sample['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
    stratify=review_labels,
)

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_frame = pd.DataFrame({'text': X_train, 'label': y_train})
test_frame = pd.DataFrame({'text': X_test, 'label': y_test})
train_dataset = Dataset.from_pandas(train_frame)
test_dataset = Dataset.from_pandas(test_frame)

# Pre-trained tokenizer plus a 3-class classification head, placed on `device`.
pretrained_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_checkpoint, num_labels=3)
model = model.to(device)

# Tokenize the data
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer's output for ``examples['text']``, truncated to at most
        256 tokens and padded.
    """
    tokenizer_options = dict(truncation=True, padding=True, max_length=256)
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize both splits in batches, then drop the columns that are no longer
# needed: the raw text and the pandas index carried over by Dataset.from_pandas.
columns_to_drop = ['text', '__index_level_0__']
train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(columns_to_drop)
test_dataset = test_dataset.map(preprocess_function, batched=True).remove_columns(columns_to_drop)

# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        dict: 'accuracy' plus class-weighted 'precision', 'recall' and 'f1',
        computed on the arg-max class of each row of ``logits``.
    """
    logits, labels = eval_pred
    predictions = torch.tensor(logits).argmax(dim=-1).numpy()

    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    metrics = {"accuracy": accuracy_score(labels, predictions)}
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, predictions, average='weighted')
    return metrics

# Define training arguments.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint once per epoch (both strategies share the value).
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-5,
    # 16 samples per device x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # After training, reload the checkpoint selected by eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # Keep at most two checkpoints under output_dir.
    save_total_limit=2,
)

# Initialize the Trainer.
# NOTE(review): `test_dataset` is used as the evaluation set during training
# (which drives early stopping and best-checkpoint selection) and again by the
# final evaluate() call, so the reported metrics come from data that also
# influenced model selection — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # accuracy / precision / recall / f1 on each evaluation
    # Stop early after 2 evaluations without improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train and evaluate the model
trainer.train()
trainer.evaluate()

# Predict on Reddit comments.
# Inference runs in small batches: tokenising the whole corpus and pushing it
# through the model in one forward pass exhausts memory on any non-trivial
# number of comments.
PREDICT_BATCH_SIZE = 32

# The tokenizer only accepts strings, so missing comments become ''.
reddit_texts = df['body_c'].fillna('').astype(str).tolist()

# Send inputs to the device the model actually lives on; training may have
# moved it somewhere other than `device`.
inference_device = next(model.parameters()).device
model.eval()  # make sure dropout is disabled for prediction

prediction_chunks = []
with torch.no_grad():
    for batch_start in range(0, len(reddit_texts), PREDICT_BATCH_SIZE):
        reddit_comments_tokenized = tokenizer(
            reddit_texts[batch_start:batch_start + PREDICT_BATCH_SIZE],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt'
        )
        reddit_comments_tokenized = {
            key: val.to(inference_device) for key, val in reddit_comments_tokenized.items()
        }
        outputs = model(**reddit_comments_tokenized)
        prediction_chunks.append(torch.argmax(outputs.logits, dim=1).cpu())

# Same type as before (1-D numpy array of class ids), also for an empty frame.
if prediction_chunks:
    predictions = torch.cat(prediction_chunks).numpy()
else:
    predictions = torch.empty(0, dtype=torch.long).numpy()

# Map predictions back to labels
sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
df['predicted_sentiment'] = [sentiment_labels[pred] for pred in predictions]


In [None]:
#emotion extraction

In [None]:
def preprocess_comment(text):
    """Lightly clean a comment: keep all words, drop URLs and non-letters.

    Unlike a stop-word/lemmatising pipeline, this keeps every remaining word
    and only lowercases, removes URLs, replaces non-letter characters with
    spaces and collapses whitespace.

    Args:
        text: Raw comment. Non-string values (e.g. None or NaN from a missing
            cell) are treated as empty text.

    Returns:
        str: The cleaned comment ('' if nothing survives).
    """
    if not isinstance(text, str):
        return ''
    # Lowercase the text
    text = text.lower()
    # Inside a raw string the regex classes need a SINGLE backslash: r'\\S'
    # matches a literal backslash followed by "S", not "non-whitespace".
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)       # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()       # Collapse whitespace
    # The cleaned text must be returned; without it every row becomes None.
    return text
    
# Rebuild 'body_c' from the raw bodies with preprocess_comment, then delete any
# literal "\xNN" sequences (a backslash, 'x' and two lowercase hex digits).
comment_hex_escape = r'\\x[0-9a-f]{2}'
df['body_c'] = (
    df['body']
    .apply(preprocess_comment)
    .str.replace(comment_hex_escape, '', regex=True)
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the emotion classifier and its tokenizer from one checkpoint name.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)

def get_emotion_scores(text):
    """Return the model's probability for every emotion label of one text.

    Args:
        text: The text to classify; it is truncated to at most 512 tokens.

    Returns:
        dict: Maps each label name from ``model.config.id2label`` to the
        softmax probability of that class, as a Python float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class axis; row 0 is taken from the result.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    id2label = model.config.id2label
    return {id2label[class_id]: probability.item() for class_id, probability in enumerate(probabilities)}

# Score every cleaned comment; each cell holds a {label: probability} dict.
df['emotion_scores'] = df['body_c'].apply(get_emotion_scores)

# The predicted emotion is the label with the highest probability in that dict.
df['predicted_emotion'] = df['emotion_scores'].map(
    lambda label_scores: max(label_scores, key=label_scores.get)
)
