In [0]:
# Read every column of the `reviews_2` table into a Spark DataFrame and preview it.
reviews_query = "SELECT * FROM reviews_2"
reviews = spark.sql(reviews_query)
reviews.show()

In [0]:
# Preview only the review body column.
text_column = reviews.select("Text")
text_column.show()

In [0]:
# Display the full text of the first review (the first field of the first row).
first_text_row = reviews.select("Text").first()
first_text_row[0]

In [0]:
import matplotlib.pyplot as plt
# Count reviews per score in Spark, sort by score, then bring the aggregate
# to pandas so matplotlib can plot it.
score_counts = (
    reviews
    .groupBy("Score")
    .count()
    .orderBy("Score")
    .toPandas()
)

score_counts

In [0]:
# Bar chart: number of reviews for each score value.
score_values = score_counts["Score"]
score_totals = score_counts["count"]

plt.figure(figsize=(10, 6))
plt.bar(score_values, score_totals, color="skyblue")
plt.title("Review Count Based on Score")
plt.xlabel("Score")
plt.ylabel("Review Count")
# plt.xticks(score_values)  # left disabled, as before
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [0]:
from pyspark.sql.functions import when

# Binary label: a score of 3 or lower is "Negative Review", anything else is
# "Positive Review".
# NOTE(review): rows whose Score is null do not satisfy the condition and so
# presumably land in the "Positive Review" branch -- confirm the table has no
# missing or unparseable scores.
is_negative = reviews["Score"] <= 3
sentiment_label = when(is_negative, "Negative Review").otherwise("Positive Review")
reviews = reviews.withColumn("ReviewSentiment", sentiment_label)
reviews.show()

In [0]:
# Count reviews per sentiment label in Spark, sort by label, then convert the
# aggregate to pandas for plotting.
review_count = (
    reviews
    .groupBy("ReviewSentiment")
    .count()
    .orderBy("ReviewSentiment")
    .toPandas()
)
review_count

In [0]:
# Bar chart: number of reviews for each sentiment label.
plt.figure(figsize=(8, 6))
plt.bar(review_count["ReviewSentiment"], review_count["count"], color="skyblue")
plt.xlabel("ReviewSentiment")
plt.ylabel("Review Count")
# This chart groups by sentiment label, not by score, so the title says so
# (it previously repeated the score chart's title).
plt.title("Review Count Based on Sentiment")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [0]:
!pip install nltk

In [0]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fetch every NLTK resource that preprocess_text relies on, not just the
# stop-word list:
#   - "punkt" / "punkt_tab": tokenizer models used by nltk.word_tokenize
#     (which of the two is needed depends on the installed NLTK version)
#   - "wordnet" / "omw-1.4": data used by WordNetLemmatizer
# Without them the first call to preprocess_text raises LookupError.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

# Lazily filled cache so the stop-word list is read and the lemmatizer is
# constructed once, instead of on every call to preprocess_text.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize one review into a space-separated string of cleaned tokens.

    Steps: lowercase, drop every character that is not a letter or
    whitespace, tokenize, remove English stop words, lemmatize each token.

    Args:
        text: The raw review text, or None for a missing review.

    Returns:
        The cleaned tokens joined by single spaces. A None input yields an
        empty string instead of raising, so missing values do not abort a
        whole batch (for example when used as a Spark UDF).
    """
    if text is None:
        return ""

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = nltk.word_tokenize(text)

    stop_words, lemmatizer = _text_resources()
    # Stop-word removal
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization (dictionary-based reduction to a base form, not stemming)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into text
    return ' '.join(tokens)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# Wrap the Python cleaner as a Spark UDF that returns a string column.
preprocess_udf = udf(preprocess_text, returnType=StringType())

In [0]:
# Number of rows echoed when previewing results. Printing every review floods
# the cell output, and densifying the whole TF-IDF matrix can exhaust memory.
PREVIEW_ROWS = 5

# scikit-learn works on in-memory data and a Spark Column cannot be iterated,
# so collect just the two needed columns to the driver as a pandas DataFrame.
# Rows with a missing text or label are dropped: they cannot be cleaned or
# used as training examples.
reviews_pd = (
    reviews
    .select("Text", "ReviewSentiment")
    .dropna(subset=["Text", "ReviewSentiment"])
    .toPandas()
)

# Apply preprocessing to each review
preprocessed_reviews = [preprocess_text(review) for review in reviews_pd["Text"]]

# Vectorization using TF-IDF
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# IDF weights also see the test documents. Fit on the training split only if
# a strict hold-out evaluation is required.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_reviews)
y = reviews_pd["ReviewSentiment"]

# Splitting data into training & test data
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display a sample of the preprocessed reviews
print("Preprocessed reviews: ")
for review in preprocessed_reviews[:PREVIEW_ROWS]:
    print(review)

# Display a sample of the TF-IDF vectors (only the previewed rows are densified)
print("\n TF-IDF vectors:")
print(X[:PREVIEW_ROWS].toarray())

# Display training and test data shapes
print("\nTraining Data Shape: ", X_train.shape)
print("\nTest Data Shape: ", X_test.shape)