In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

# Set up Spark session
spark = SparkSession.builder.appName('IMDBReviewClassifier').getOrCreate()


In [3]:
# Load the dataset
df = spark.read.csv(r'C:\Users\douaa\OneDrive\Desktop\DSS Program\DSS 2 Big Data\Datasets\IMDB Dataset.csv', header=True, inferSchema=True)

In [4]:
# Data preprocessing
# Lowercase and remove special characters
df = df.withColumn('review', lower(col('review')))
df = df.withColumn('review', regexp_replace(col('review'), '[^a-zA-Z\s]', ''))

In [5]:
# Tokenize the text
tokenizer = Tokenizer(inputCol='review', outputCol='words')
df = tokenizer.transform(df)

In [6]:
# Remove stop words 
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
df = remover.transform(df)

IllegalArgumentException: 'StopWordsRemover_8ca649623abb parameter locale given invalid value en_AE.'

In [7]:
# Convert sentiments to numerical labels
indexer = StringIndexer(inputCol='sentiment', outputCol='label')
df = indexer.fit(df).transform(df)

In [8]:
# Feature engineering
vectorizer = CountVectorizer(inputCol='filtered', outputCol='rawFeatures')
idf = IDF(inputCol='rawFeatures', outputCol='features')

# Model development
(train, test) = df.randomSplit([0.8, 0.2], seed=42)

lr = LogisticRegression(featuresCol='features', labelCol='label')

pipeline = Pipeline(stages=[tokenizer, vectorizer, idf, indexer, lr])

In [9]:
# Hyperparameter tuning and cross-validation
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5]) \
    .build()


In [10]:
# Hyperparameter tuning and cross-validation
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5]) \
    .build()

In [40]:
crossval = CrossValidator(estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3)

# Train the model
cvModel = crossval.fit(train)

# Evaluate the model
predictions = cvModel.transform(test)
evaluator = BinaryClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

print(f"Model Accuracy: {accuracy}")

# Stop the Spark session
spark.stop()

IllegalArgumentException: 'Output column words already exists.'