# Import Packages

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, ArrayType
from pyspark.sql.functions import udf
from nltk.stem import SnowballStemmer
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from nltk.stem import WordNetLemmatizer
from pyspark.sql.functions import trim, lower, col
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from pyspark.sql.functions import udf



Adding the final review csv file 

In [7]:
%cd Big_Data_IMDb/

# add the csv files here otherwise the code wont work, cannot push it to GitHub
# sadly, too large
!mkdir movie_reviews
%pwd

!mv /Users/bognarlili/Desktop/final_reviews_data.csv movie_reviews/

!ls movie_reviews/

/Users/bognarlili/Big_Data_IMDb/Big_Data_IMDb
mkdir: movie_reviews: File exists
mv: /Users/bognarlili/Desktop/final_reviews_data.csv: No such file or directory
final_reviews_data.csv


Loading CSV into Spark

In [8]:
spark = SparkSession.builder \
    .appName("IMDb Reviews Analysis") \
    .getOrCreate()

df = spark.read.csv("movie_reviews/final_reviews_data.csv", header=True, inferSchema=True)
df.show(5)  


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/13 17:10:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/13 17:10:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/13 17:10:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

+---------+------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|   tconst|       movie_title|numVotes|year|label|tomatometer_status|         review_type|               genre|review_label|              review|
+---------+------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|tt0009369|            mickey|  1119.0|1918|false|              NULL|                NULL|                NULL|        NULL|['Mack Sennett ha...|
|tt0010600|          the doll|  1898.0|1919| true|              NULL|                NULL|                NULL|        NULL|['"The Doll" is a...|
|tt0011439| the mark of zorro|  2439.0|1920| true|             Fresh|['Rotten', 'Fresh...|Action & Adventur...|       Fresh|['It was such a s...|
|tt0011607|the parson's widow|  1264.0|1920| true|              NULL|                NULL|                NULL|        NULL|

# Cleaning

Remove entries with missing review data as these movies are not present in the IMDB database and therefore represent incomplete data points that would skew our analysis

In [9]:
# check the number of the rows
# remove entries with missing review data
# drop data before 1930
print(f"original nr of rows: {df.count()}")
df = df.dropna(subset=["review"])
df = df.filter(df["year"] >= 1930)
print(f"nr of rows after cleaning: {df.count()}")
df.show(5)


original nr of rows: 8061
nr of rows after cleaning: 7914
+---------+--------------------+--------+----+-----+------------------+-----------+-----+------------+--------------------+
|   tconst|         movie_title|numVotes|year|label|tomatometer_status|review_type|genre|review_label|              review|
+---------+--------------------+--------+----+-----+------------------+-----------+-----+------------+--------------------+
|tt0016029|  the little colonel|  1646.0|1935| true|              NULL|       NULL| NULL|        NULL|['THE LITTLE COLO...|
|tt0017961|           happiness|  1080.0|1935| true|              NULL|       NULL| NULL|        NULL|['I really love s...|
|tt0020298|         queen kelly|  3226.0|1932| true|              NULL|       NULL| NULL|        NULL|['I'd imagine tha...|
|tt0020768|           city girl|  3199.0|1930| true|              NULL|       NULL| NULL|        NULL|['Silent film may...|
|tt0021309|the story of the fox|     NaN|1937| true|              NULL|   

# Preparing for Sentiment Analysis

In [10]:
# Import necessary modules
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from pyspark.sql.types import ArrayType, StringType

class TextProcessor:
    """
    A class to handle text preprocessing steps including:
    - Tokenization (splitting text into words)
    - Stopword removal (removing common words like "the", "is", etc.)
    - Stemming (reducing words to their root form)
    - Lemmatization (reducing words to their base form)
    """

    def __init__(self):
        """
        Constructor that initializes the stemmer and lemmatizer objects.
        - SnowballStemmer: Used for stemming words.
        - WordNetLemmatizer: Used for lemmatizing words.
        """
        self.stemmer = SnowballStemmer(language="english")
        self.lemmatizer = WordNetLemmatizer()

    def tokenize(self, df, input_col, output_col):
        """
        Tokenizes the specified column into words using RegexTokenizer.

        """
        tokenizer = RegexTokenizer(
            inputCol=input_col, 
            outputCol=output_col, 
            pattern="\\W+",   # Split on non-word characters
            toLowercase=True  # Convert text to lowercase
        )
        return tokenizer.transform(df)
    
    def remove_stopwords(self, df, input_col, output_col):
        """
        Removes stopwords (e.g., "the", "and", "is") using StopWordsRemover.
            output_col (str): Name of the new column with stopwords removed.
        """
        remover = StopWordsRemover(
            inputCol=input_col,
            outputCol=output_col
        )
        return remover.transform(df)
    
    @staticmethod
    @udf(ArrayType(StringType()))
    def stemming(words):
        """
        Applies stemming using SnowballStemmer to reduce words to their root form.

        """
        if words is not None:
            stemmer = SnowballStemmer(language="english")
            return [stemmer.stem(word) for word in words]
        return None
    
    @staticmethod
    @udf(ArrayType(StringType()))
    def lemmatizing(words):
        """
        Applies lemmatization using WordNetLemmatizer to reduce words to their base form.

        """
        if words is not None:
            lemmatizer = WordNetLemmatizer()
            return [lemmatizer.lemmatize(word) for word in words]
        return None



# Processing Pipeline

In [11]:
#processor instance
processor = TextProcessor()

# Initialize SparkSession 
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("IMDb Reviews Analysis") \
    .getOrCreate()

# Load CSV file into a PySpark DataFrame
df = spark.read.csv("movie_reviews/final_reviews_data.csv", header=True, inferSchema=True)

#cleaning pipeline in one call

def run_cleaning(df):
    """
    Executes all preprocessing steps:
    1. Tokenization
    2. Stopword removal
    3. Stemming
    4. Lemmatization
    
    Args:
        df (DataFrame): Input PySpark DataFrame.

    Returns:
        DataFrame: Processed DataFrame with cleaned text data.
    """
    processor = TextProcessor()
    
    # Execute steps in order
    df = processor.tokenize(df, input_col="review", output_col="review_words")
    df = processor.remove_stopwords(df, input_col="review_words", output_col="review_clean")
    df = df.withColumn("review_stemmed", processor.stemming(col("review_clean")))
    df = df.withColumn("review_lemmatized", processor.lemmatizing(col("review_stemmed")))

    return df

df_cleaned = run_cleaning(df)
df_cleaned.show(truncate=False)


[Stage 12:>                                                         (0 + 1) / 1]

+---------+---------------------------+--------+----+-----+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [16]:
file_path = "NLP_review_cleaning.ipynb"

# Open and close the file to save it
with open(file_path, "a") as f:
    pass

print(f"✅ {file_path} has been saved.")


✅ NLP_review_cleaning.ipynb has been saved.
