In [None]:
import pandas as pd

# Read the Kinton reviews CSV (path relative to the working directory) into a DataFrame.
reviews_csv_path = "./kinton_reviews.csv"
data = pd.read_csv(reviews_csv_path)

In [None]:
from textblob import TextBlob

# Score each review's polarity with TextBlob; the text is read from the 'review_text' column.
# na_action='ignore' keeps missing reviews as NaN instead of scoring the literal
# string "nan" that str() would otherwise produce for them.
data['sentiment'] = data['review_text'].map(
    lambda text: TextBlob(str(text)).sentiment.polarity,
    na_action='ignore',
)

In [None]:
import matplotlib.pyplot as plt

# Label each review by the sign of its polarity score.
# NOTE(review): a polarity of exactly 0 (neutral) is still grouped under 'negative',
# matching the existing two-way split -- confirm whether a 'neutral' bucket is wanted.
polarity = data['sentiment']
data['sentiment_label'] = (
    (polarity > 0)
    .map({True: 'positive', False: 'negative'})
    # Rows with no sentiment score get no label, so groupby leaves them out of the means.
    .where(polarity.notna())
)

# Group by sentiment label and calculate average review_rating
avg_rating_by_sentiment = data.groupby('sentiment_label')['review_rating'].mean()

print(avg_rating_by_sentiment)

# Visualize the relationship between sentiment label and rating.
# groupby sorts its keys, so the bars arrive as 'negative', 'positive'; pick each
# colour from the label itself rather than relying on positional order.
label_colors = {'positive': 'green', 'negative': 'red'}
bar_colors = [label_colors[label] for label in avg_rating_by_sentiment.index]
avg_rating_by_sentiment.plot(kind='bar', color=bar_colors)
plt.ylabel('Average Review Rating')
plt.title('Average Review Rating by Sentiment')
plt.show()

In [None]:
# Count the rows whose review_text is present and express that as a share of all rows.
has_review_text = data['review_text'].notna()
num_with_review_text = has_review_text.sum()
total_rows = len(data)
percentage_with_review_text = num_with_review_text / total_rows * 100

print(f"Rows with review_text: {num_with_review_text} ({percentage_with_review_text:.2f}%)")

In [None]:
import re

# A run of ASCII letters (a-z, A-Z) is used as the marker for an English word.
english_word_pattern = re.compile(r'[a-zA-Z]+')


def _contains_english_word(text):
    """Return True when ``text`` is non-null and its string form contains an ASCII-letter run."""
    if pd.isnull(text):
        return False
    return english_word_pattern.search(str(text)) is not None


# True for every review that contains at least one ASCII-letter run.
english_comment_mask = data['review_text'].apply(_contains_english_word)

# Non-English means: review text is present but the pattern found nothing in it.
review_text_present = data['review_text'].notnull()
english_count = english_comment_mask.sum()
non_english_count = (review_text_present & ~english_comment_mask).sum()

# Both percentages are relative to the number of rows that have review text.
english_percentage = english_count / num_with_review_text * 100
non_english_percentage = non_english_count / num_with_review_text * 100

print(f"English comments: {english_count} ({english_percentage:.2f}%)")
print(f"Non-English comments: {non_english_count} ({non_english_percentage:.2f}%)")

In [None]:
# Select the reviews that have text but were not matched by the English-word mask.
# This Series must be built here: nothing earlier in the notebook defines it, so the
# loop below would otherwise fail with a NameError on a fresh kernel.
non_english_comments = data.loc[
    ~english_comment_mask & data['review_text'].notnull(),
    'review_text',
]

# Display non-English comments, keyed by their row index in `data`
for idx, comment in non_english_comments.items():
    print(f"Original (index {idx}): {comment}\n")
