Get only review text for sentiment analysis

In [None]:
import pandas as pd
import re
import math

# Load the two input CSVs into dataframes (reviews first, then the training set)
data, train = (
    pd.read_csv(csv_path)
    for csv_path in ("./kinton_reviews.csv", "./data/train.csv")
)

In [None]:
# Report the (rows, columns) shape of each dataframe: `data` first, then `train`
for frame in (data, train):
    print(frame.shape)

# Preview the top 3 rows of `data`
data.head(3)

In [None]:
# Extract the rating and text columns, discarding any row where either value is missing
reviews1 = (
    data
    .loc[:, ["review_rating", "review_text"]]
    .dropna()
)

# Preview the first 5 rows of reviews1
reviews1.head(5)

In [None]:
!pip install deep-translator langid textblob

Start building sentiment analysis model

In [None]:
from textblob import TextBlob
from deep_translator import GoogleTranslator
import langid

def getPolarity(text):
    """Return the TextBlob sentiment polarity of a piece of review text.

    The input is coerced to ``str``. Its language is detected with
    ``langid``; when the detected language is not English, the text is
    translated to English with ``GoogleTranslator`` before scoring.

    Both language detection and translation are best-effort:
    * if detection raises, the text is treated as English;
    * if translation raises, or yields anything other than a non-blank
      string, the original (untranslated) text is scored instead.

    Parameters
    ----------
    text : Any
        The review text; converted with ``str()`` before use.

    Returns
    -------
    The ``polarity`` attribute of ``TextBlob(text).sentiment``.
    """
    text = str(text)

    try:
        lang = langid.classify(text)[0]
    except Exception:
        # Detection failure: assume English so no translation is attempted.
        lang = 'en'

    if lang != 'en':
        try:
            translated = GoogleTranslator(source='auto', target='en').translate(text=text)
        except Exception:
            # Translation failure (network, quota, invalid payload, ...):
            # fall through and score the original text.
            translated = None

        # Only accept a usable translation. Assigning a None/blank result to
        # `text` would either crash TextBlob (it requires a string) or throw
        # away the original content, aborting or skewing a whole .apply() run.
        if isinstance(translated, str) and translated.strip():
            text = translated

    return TextBlob(text).sentiment.polarity

In [None]:
# Try getPolarity on a fixed random sample of 5 reviews (seeded, so the same rows are drawn each run)
sample = (
    reviews1
    .sample(n=5, random_state=42)
    .assign(polarity=lambda rows: rows["review_text"].apply(getPolarity))
)
sample

In [None]:
def _label_from_polarity(score):
    """Map a polarity score to a label: above 0.15 is positive, below -0.15 is negative, otherwise neutral."""
    if score > 0.15:
        return 'positive'
    if score < -0.15:
        return 'negative'
    return 'neutral'


# Score every review, then bucket each score into a sentiment label
reviews1["polarity"] = reviews1["review_text"].apply(getPolarity)
reviews1["sentiment_model"] = reviews1["polarity"].apply(_label_from_polarity)

Calculate prediction accuracy

In [None]:
def _label_from_rating(stars):
    """Map a star rating to a label: above 3 is positive, below 3 is negative, otherwise neutral."""
    if stars > 3:
        return 'positive'
    if stars < 3:
        return 'negative'
    return 'neutral'


# Reference sentiment derived from the star rating
reviews1["sentiment_true"] = reviews1["review_rating"].apply(_label_from_rating)

# Per-row match flag between predicted and reference labels, then the share of matches as a percentage
reviews1["accuracy"] = reviews1["sentiment_model"].eq(reviews1["sentiment_true"])
accuracy = 100 * reviews1["accuracy"].mean()

print(f"Prediction Accuracy (All Data): {accuracy:.2f}%")

Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box per star rating, showing how the polarity scores are spread within it
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=reviews1, x="review_rating", y="polarity", ax=ax)
ax.set_title("Distribution of Sentiment Polarity by Star Rating")
ax.set_xlabel("Review Rating (Stars)")
ax.set_ylabel("Polarity")
plt.show()