In [None]:
# Load modules
import json
import os
import sys
from zipfile import ZipFile

import pandas as pd
import matplotlib.pyplot as plt; plt.rcParams["font.family"] = "Palatino Linotype"
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import seaborn as sns
import emoji
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imojify import imojify
import gdown

## Pyspark modules
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Set paths
results_dir = "<path_to_result_directory>"
plots_dir = os.path.join(results_dir, "Plots")
# The plotting cells save into plots_dir; create it up front so savefig has a folder to write to.
os.makedirs(plots_dir, exist_ok=True)

# Download and unzip Steam3.zip to parse the raw reviews
raw_reviews_url = "https://drive.google.com/file/d/1oQaRv-7VOoAfhsCW4HDk3xUx5bzRJOf7/view?usp=sharing"
raw_reviews_zip = "Steam3.zip"
# Skip the download when the archive is already on disk so the notebook can be re-run cheaply.
if not os.path.exists(raw_reviews_zip):
    gdown.download(raw_reviews_url, raw_reviews_zip, quiet=False, fuzzy=True)
with ZipFile(raw_reviews_zip, 'r') as zObject:
    zObject.extractall(path="<path>")

raw_reviews_dir = os.path.join(results_dir, "<path_to_Steam3_directory>")

In [None]:
# If dataframe has already been saved, don't run again.
createDf = False


def load_review_records(reviews_dir):
    """Collect review records from every ``part-*`` file below ``reviews_dir``.

    Each line of a part file is parsed as one JSON object. String values are
    stripped, and embedded newlines, carriage returns and tabs are replaced by
    spaces. Lines that are not valid JSON objects are skipped; the number of
    skipped lines is printed once at the end.

    Args:
        reviews_dir: Directory that is walked recursively for part files.

    Returns:
        list[dict]: One cleaned record per valid line, in the order read.
    """
    records = []
    skipped = 0
    for root, _dirs, files in os.walk(reviews_dir):
        for file in files:
            if not file.startswith('part-'):
                continue
            # Read explicitly as UTF-8: the locale default encoding cannot decode
            # emoji and other non-ASCII review text on every platform.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        skipped += 1
                        continue
                    if not isinstance(record, dict):
                        # Valid JSON but not an object (e.g. a bare number): nothing to tabulate.
                        skipped += 1
                        continue
                    for key, value in record.items():
                        if isinstance(value, str):
                            value = value.strip()
                            record[key] = value.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                    records.append(record)
    if skipped:
        print("Skipped", skipped, "lines that were not valid JSON records")
    return records


if createDf:
    data = load_review_records(raw_reviews_dir)

    df = pd.DataFrame(data)
    df = df.fillna("")
    df.to_csv(os.path.join(results_dir, 'DF_Streaming.csv'), index=False)

In [7]:
# Read the streaming data saved to a dataframe earlier.
# The dataframe-creation cell writes DF_Streaming.csv into results_dir, so look there first;
# fall back to a copy in the working directory to keep existing setups working.
streaming_csv = os.path.join(results_dir, "DF_Streaming.csv")
if not os.path.exists(streaming_csv):
    streaming_csv = "DF_Streaming.csv"
df_pandas_orig = pd.read_csv(streaming_csv)
# Remove NA values (returns a new frame; df_pandas_orig keeps the unfiltered data)
df_pandas = df_pandas_orig.dropna()

In [8]:
# Create Spark session for DF processing
# Both the worker and the driver interpreter variables are set to the interpreter running this kernel
for spark_python_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[spark_python_var] = sys.executable

spark_builder = SparkSession.builder.appName('Spark_Predictions')
spark = spark_builder.getOrCreate()

In [9]:
# Create PySpark DataFrame from Pandas and inspect it
df_spark = spark.createDataFrame(df_pandas)
df_spark.printSchema()
# Row count first, then the column names: 'review_id', 'app_id', 'review_text', 'label'
for summary in (df_spark.count(), df_spark.columns):
    print(summary)

root
 |-- review_id: long (nullable = true)
 |-- app_id: long (nullable = true)
 |-- review_text: string (nullable = true)
 |-- label: long (nullable = true)

7111
['review_id', 'app_id', 'review_text', 'label']


In [15]:
# Barplot of label distribution

# Order the counts by label (0, then 1) rather than by size: the colours below are assigned
# by position, so Red must always belong to label 0 and Green to label 1. A label with no
# reviews is kept as a zero-height bar so the positions never shift.
label_counts = df_pandas["label"].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots(figsize = (12,8), dpi=300)
bars = ax.bar(label_counts.index, label_counts, color = ["Red", "Green"])
plt.title("Distribution of Reviews", fontsize=32, fontweight="bold")
plt.xlabel('Label', fontsize=24, fontweight="bold")
plt.ylabel('Number of Reviews', fontsize=24, fontweight="bold")
ax.set_xticks([0,1])
plt.xticks(fontsize=18)
ax.set_xticklabels(["Downvote (0)", "Upvote (1)"])
plt.yticks(fontsize=18)
plt.grid(axis='y', linestyle=":")
# Print each bar's count above it
ax.bar_label(bars, fontsize=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.savefig(os.path.join(plots_dir, "Review_Distribution.jpg"), bbox_inches='tight')
# Close this specific figure so it is released after saving
plt.close(fig)

In [19]:
# Most frequent words in reviews:
# stopwords.words() reads the NLTK 'stopwords' corpus; the import cell only downloads 'punkt',
# so fetch the corpus here to keep a fresh environment from failing with a LookupError.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def word_counts(reviews):
    """Count how often each kept token occurs across ``reviews``.

    Every review is lower-cased and tokenised with ``word_tokenize``. Tokens
    found in ``stop_words``, tokens of two characters or fewer, and the
    contraction fragment "n't" are ignored.

    Args:
        reviews: Iterable of review strings.

    Returns:
        list[tuple[str, int]]: (token, count) pairs, highest count first.
    """
    tallies = {}
    for text in reviews:
        for word in word_tokenize(text.lower()):
            keep = word not in stop_words and len(word) > 2 and word != "n't"
            if keep:
                tallies[word] = tallies.get(word, 0) + 1
    ranked = list(tallies.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

## Reviews for each label
is_upvote = df_pandas["label"] == 1
is_downvote = df_pandas["label"] == 0
upvoted_reviews = df_pandas.loc[is_upvote, "review_text"].tolist()
downvoted_reviews = df_pandas.loc[is_downvote, "review_text"].tolist()

In [20]:
# Generate bar plot for the most frequently used words
def plotMostFrequentWords(reviews, filename, label):
    """Save a bar chart of the ten most frequent words in ``reviews``.

    Args:
        reviews: Review texts handed to ``word_counts``.
        filename: Path the figure is written to.
        label: Text inserted into the chart title.
    """
    top_ten = word_counts(reviews)[:10]
    words = [word for word, _ in top_ten]
    counts = [count for _, count in top_ten]

    fig, ax = plt.subplots(figsize=(18, 8), dpi=300)
    sns.barplot(x=words, y=counts, ax=ax)

    ax.set_title("Most Frequent Words Used in " + label + " Reviews", fontsize=32, fontweight="bold")
    ax.set_ylabel('Count', fontsize=24, fontweight="bold")
    ax.set_xlabel('Word', fontsize=24, fontweight="bold")
    plt.yticks(fontsize=18)
    plt.xticks(fontsize=18)
    ax.grid(axis='y', linestyle=":")
    # Hide the top and right borders of the plot area
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)

# One chart per vote type, written into plots_dir
for review_set, jpg_name, vote_label in (
    (upvoted_reviews, "Frequent_Words_Upvoted.jpg", "Upvoted"),
    (downvoted_reviews, "Frequent_Words_Downvoted.jpg", "Downvoted"),
):
    plotMostFrequentWords(review_set, os.path.join(plots_dir, jpg_name), vote_label)

In [None]:
# Get emojis from reviews
def get_emojis(reviews):
    """Return the set of characters in ``reviews`` that are known emoji.

    The lookup table is ``emoji.EMOJI_DATA`` when the installed ``emoji``
    package provides it, otherwise the legacy ``emoji.UNICODE_EMOJI['en']``
    table, so the function works with both old and new releases of the package.

    Args:
        reviews: A single string; it is scanned character by character.

    Returns:
        set[str]: The distinct emoji characters found.
    """
    emoji_table = getattr(emoji, "EMOJI_DATA", None)
    if emoji_table is None:
        # Older emoji releases only expose the per-language UNICODE_EMOJI mapping.
        emoji_table = emoji.UNICODE_EMOJI['en']
    return {char for char in reviews if char in emoji_table}

def emoji_frequencies(reviews):
    """Count every emoji character in ``reviews``.

    The text is scanned character by character, so an emoji is counted whether
    it stands alone, is glued to a word, or is repeated back to back. Comparing
    whole word tokens against single emoji characters would miss the latter two.

    Args:
        reviews: A single string (for example all reviews joined by spaces).

    Returns:
        list[tuple[str, int]]: (emoji, count) pairs, highest count first.
    """
    unique_emoji = get_emojis(reviews)
    emoji_freqs = {}
    for char in reviews:
        if char in unique_emoji:
            emoji_freqs[char] = emoji_freqs.get(char, 0) + 1
    return sorted(emoji_freqs.items(), key=lambda x: x[1], reverse=True)

# Join each group of reviews into one string, then tally its emojis
upvoted_reviews_joined = " ".join(upvoted_reviews)
downvoted_reviews_joined = " ".join(downvoted_reviews)

emojis_upvoted = emoji_frequencies(upvoted_reviews_joined)
print("Emojis in Upvoted Reviews", emojis_upvoted)

emojis_downvoted = emoji_frequencies(downvoted_reviews_joined)
print("Emojis in Downvoted Reviews", emojis_downvoted)

In [23]:
def plotEmojis(df_emojis, label, offset, fig_size, filename="", downvote_yticks=False):
    """Save a bar chart of emoji counts with each emoji's image drawn above its bar.

    Args:
        df_emojis: DataFrame with an ``Emoji`` and a ``Count`` column, one row per bar.
        label: Text inserted into the chart title.
        offset: Amount added to each count to place the emoji image above its bar.
        fig_size: Figure size passed to ``plt.subplots``.
        filename: Path the figure is saved to.
        downvote_yticks: When True, the y axis only gets ticks at 0 and 1.
    """
    def offset_image(cords, emoji_char, ax):
        """Place the image for ``emoji_char`` on ``ax`` at ``cords`` (x, y)."""
        # The parameter is not called ``emoji`` so it cannot be confused with the emoji module.
        img = plt.imread(imojify.get_img_path(emoji_char))
        im = OffsetImage(img, zoom=0.12)
        im.image.axes = ax
        ab = AnnotationBbox(im, (cords[0], cords[1]),  frameon=False, pad=0)
        ax.add_artist(ab)

    fig, ax = plt.subplots(figsize=fig_size, dpi=300)
    positions = range(len(df_emojis))
    ax.bar(positions, df_emojis.Count, width=0.5, align="center")
    ax.set_xticks(positions)
    # The emoji images replace the text tick labels
    ax.set_xticklabels([])
    ax.tick_params(axis='x', which='major', pad=26)
    # Extra headroom so the images above the tallest bar stay inside the axes
    ax.set_ylim((0, ax.get_ylim()[1]+10))

    # Pair rows by position rather than by index label, so the frame's index does not matter.
    for i, (emoji_char, count) in enumerate(zip(df_emojis.Emoji, df_emojis.Count)):
        offset_image([i, count+offset], emoji_char, ax)
    plt.title("Emojis in " + label + " Reviews", fontsize=42, fontweight="bold")
    plt.ylabel('Count', fontsize=32, fontweight="bold")
    plt.xlabel('Emoji', fontsize=32, fontweight="bold")
    if downvote_yticks:
        plt.yticks([0,1], fontsize=18)
    else:
        plt.yticks(fontsize=18)
    plt.grid(axis='y', linestyle=":")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.savefig(filename, bbox_inches='tight')
    plt.close(fig)

# Turn the (emoji, count) pairs into two-column frames for plotting
upvoted_emoji_counts = dict(emojis_upvoted)
downvoted_emoji_counts = dict(emojis_downvoted)
df_emojis_upvoted = pd.DataFrame({
    "Emoji": list(upvoted_emoji_counts.keys()),
    "Count": list(upvoted_emoji_counts.values()),
})
df_emojis_downvoted = pd.DataFrame({
    "Emoji": list(downvoted_emoji_counts.keys()),
    "Count": list(downvoted_emoji_counts.values()),
})

# Total emoji occurrences per group; the recorded run noted 109, 4
print(sum(upvoted_emoji_counts.values()), sum(downvoted_emoji_counts.values()))

plotEmojis(
    df_emojis_upvoted,
    "Upvoted",
    10,
    (24,8),
    filename=os.path.join(plots_dir, "Emojis_Upvoted_Reviews.jpg"),
    downvote_yticks=False,
)
plotEmojis(
    df_emojis_downvoted,
    "Downvoted",
    2,
    (24,4),
    filename=os.path.join(plots_dir, "Emojis_Downvoted_Reviews.jpg"),
    downvote_yticks=True,
)

In [None]:
# Split Data into training and test sets (80/20 weights, fixed seed)
split_weights = [0.8, 0.2]
training_data, test_data = df_spark.randomSplit(split_weights, seed=100)
for caption, subset in (
    ("Number of reviews in training data:", training_data),
    ("Number of reviews in test data:", test_data),
):
    print(caption, subset.count())

In [30]:
# Pipeline with Naive Bayes Model
stop_words = set(stopwords.words('english'))

# Text-preparation stages: tokenise review_text, drop the NLTK stop words, count the remaining tokens
tokenizer = Tokenizer(inputCol="review_text", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=list(stop_words))
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

nb_model = NaiveBayes(smoothing=1)
nb_stages = [tokenizer, stopwords_remover, count_vectors, nb_model]
pipeline_nb = Pipeline(stages=nb_stages)

# Fit on the training split, then score the held-out split
fitted_pipeline_nb = pipeline_nb.fit(training_data)
predictions_nb = fitted_pipeline_nb.transform(test_data)
predictions_nb.show(5)

evaluator_nb = BinaryClassificationEvaluator(labelCol="label")
# Score with the evaluator's default metric; the recorded run printed 0.60
evaluator_nb.evaluate(predictions_nb)

+---------+-------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|review_id| app_id|         review_text|label|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+---------+-------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|124141594|1730650|UPDATE!!!: After ...|    1|[update!!!:, afte...|[update!!!:, patc...|(5479,[0,1,3,5,6,...|[-942.53567424640...|[0.99248081640494...|       0.0|
|131364642|1269300|Fun concept, best...|    1|[fun, concept,, b...|[fun, concept,, b...|(5479,[3,31,72,23...|[-29.796736230375...|[0.04980778661868...|       1.0|
|134329345|2202690|The game is very ...|    1|[the, game, is, v...|[game, fun,, firs...|(5479,[1,6,8,9,11...|[-243.47902462196...|[5.74579169533525...|       1.0|
|135846672|1269300|rea

0.6007698064792045

In [33]:
# Pipeline with Logistic Regression Model
# Reuses the tokenizer, stop-word remover and count vectorizer defined for the Naive Bayes pipeline
lr_settings = {"maxIter": 20, "regParam": 0.3, "elasticNetParam": 0.0}
lr_model = LogisticRegression(**lr_settings)
lr_stages = [tokenizer, stopwords_remover, count_vectors, lr_model]
pipeline_lr = Pipeline(stages=lr_stages)

# Fit on the training split, then score the held-out split
fitted_pipeline_lr = pipeline_lr.fit(training_data)
predictions_lr = fitted_pipeline_lr.transform(test_data)
predictions_lr.show(5)

evaluator_lr = BinaryClassificationEvaluator(labelCol="label")
# Score with the evaluator's default metric; the recorded run printed 0.89
evaluator_lr.evaluate(predictions_lr)

+---------+-------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|review_id| app_id|         review_text|label|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+---------+-------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|124141594|1730650|UPDATE!!!: After ...|    1|[update!!!:, afte...|[update!!!:, patc...|(5479,[0,1,3,5,6,...|[1.60330029427770...|[0.83247914148083...|       0.0|
|131364642|1269300|Fun concept, best...|    1|[fun, concept,, b...|[fun, concept,, b...|(5479,[3,31,72,22...|[-1.3592463995703...|[0.20436280939831...|       1.0|
|134329345|2202690|The game is very ...|    1|[the, game, is, v...|[game, fun,, firs...|(5479,[1,6,8,9,11...|[-2.1462889918074...|[0.10467851233992...|       1.0|
|135846672|1269300|rea

0.8878716870404028

In [None]:
# Save the fitted logistic-regression pipeline, replacing any model already stored at this path
lr_model_path = 'Models/score_classifier_lr'
lr_model_writer = fitted_pipeline_lr.write().overwrite()
lr_model_writer.save(lr_model_path)