# K-means model
## Netflix Rating Predictions
#### Lauren Neal - ln9bv
#### Melanie Sattler - ms9py
#### Nick Thompson - nat3fa
#### Nima Beheshti - nb9pp

In [21]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.sql import functions as f
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline 

# Loads data
df = spark.read.csv("processed_all.txt", header=True, inferSchema=True)
df.show(5)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|1488844|       1|   3.0|
| 822109|       1|   5.0|
| 885013|       1|   4.0|
|  30878|       1|   4.0|
| 823519|       1|   3.0|
+-------+--------+------+
only showing top 5 rows



#### Creat new variable that shows ratings 4+ as 1 and three of lower as 0. Will use this for metric evaluation

In [22]:
df = df.withColumn('classification', f.when(f.col('rating') >= 4, 1).otherwise(0))
df.show(5)

+-------+--------+------+--------------+
|user_id|movie_id|rating|classification|
+-------+--------+------+--------------+
|1488844|       1|   3.0|             0|
| 822109|       1|   5.0|             1|
| 885013|       1|   4.0|             1|
|  30878|       1|   4.0|             1|
| 823519|       1|   3.0|             0|
+-------+--------+------+--------------+
only showing top 5 rows



#### Order the dataframe by user_id so we can test using all movies seen by a specific user id

In [23]:
df.createOrReplaceTempView('df')
df = spark.sql("SELECT * FROM df ORDER BY user_id")
df.show(5)

+-------+--------+------+--------------+
|user_id|movie_id|rating|classification|
+-------+--------+------+--------------+
|      6|    3290|   5.0|             1|
|      6|    3423|   4.0|             1|
|      6|    3315|   3.0|             0|
|      6|    3226|   3.0|             0|
|      6|    3320|   3.0|             0|
+-------+--------+------+--------------+
only showing top 5 rows



#### Find count of dataframe and limit observations to 500,000 rows due to time it takes to run test and evaluations

In [24]:
df.count()

100480507

In [25]:
use = df.limit(500000)

#### Create training and test splits

In [26]:
seed = 314
weights = [.8, .2]
(training, test) = use.randomSplit(weights, seed)

In [27]:
training.show(5)

+-------+--------+------+--------------+
|user_id|movie_id|rating|classification|
+-------+--------+------+--------------+
|      6|      30|   3.0|             0|
|      6|     157|   3.0|             0|
|      6|     173|   4.0|             1|
|      6|     191|   2.0|             0|
|      6|     241|   3.0|             0|
+-------+--------+------+--------------+
only showing top 5 rows



In [28]:
training.count()

399828

In [29]:
test.count()

100172

#### Add in additional dataset that provides some more info on the movies in the dataframe

In [30]:
add_movie_feats = spark.read.csv("movies_addtl_features.csv", header=True, inferSchema=True)
add_movie_feats.show(5)

+---+------+--------------------+-------+------+-------------------+--------------------+--------------------+--------------------+
| id|  year|               title|Runtime|Rating|          Directors|             Writers|Production companies|              Genres|
+---+------+--------------------+-------+------+-------------------+--------------------+--------------------+--------------------+
|  1|2003.0|     Dinosaur Planet|   50.0|   7.7|Pierre de Lespinois|Mike Carrol-Mike ...|                null|Documentary-Anima...|
|  2|2004.0|Isle of Man TT 20...|   null|  null|               null|                null|                null|                null|
|  3|1997.0|           Character|  122.0|   7.8|      Mike van Diem|Ferdinand Bordewi...|First Floor Featu...| Crime-Drama-Mystery|
|  4|1994.0|Paula Abdul's Get...|   54.0|   8.8|      Steve Purcell|                null|                null|              Family|
|  5|2004.0|The Rise and Fall...|  360.0|   8.6|         Kevin Dunn|        

In [31]:
df2 = add_movie_feats.drop('Title', 'Rating', 'Directors', 'Writers', 'Production Companies', 'Genres')
df2.show(5)

+---+------+-------+
| id|  year|Runtime|
+---+------+-------+
|  1|2003.0|   50.0|
|  2|2004.0|   null|
|  3|1997.0|  122.0|
|  4|1994.0|   54.0|
|  5|2004.0|  360.0|
+---+------+-------+
only showing top 5 rows



#### Combine columns for year and runtime of specific movie by movie_id. Drop NA values and duplicate columns for each the training and test dataframes

In [32]:
combined_training = training.join(df2, training.movie_id == df2.id )
combined_training = combined_training.dropna()
combined_training = combined_training.drop('id')

In [33]:
combined_training.show(5)

+-------+--------+------+--------------+------+-------+
|user_id|movie_id|rating|classification|  year|Runtime|
+-------+--------+------+--------------+------+-------+
|      6|      30|   3.0|             0|2003.0|  128.0|
|      6|     173|   4.0|             1|1968.0|  130.0|
|      6|     191|   2.0|             0|2003.0|  134.0|
|      6|     241|   3.0|             0|1959.0|  136.0|
|      6|     295|   4.0|             1|1995.0|   90.0|
+-------+--------+------+--------------+------+-------+
only showing top 5 rows



In [34]:
combined_test = test.join(df2, test.movie_id == df2.id )
combined_test = combined_test.dropna()
combined_test = combined_test.drop('id')
combined_test.show(5)

+-------+--------+------+--------------+------+-------+
|user_id|movie_id|rating|classification|  year|Runtime|
+-------+--------+------+--------------+------+-------+
|      6|     175|   5.0|             1|1992.0|   99.0|
|      6|     197|   3.0|             0|2004.0|  103.0|
|      6|     329|   4.0|             1|1999.0|  130.0|
|      6|     723|   3.0|             0|1991.0|  101.0|
|      6|     872|   3.0|             0|1954.0|  207.0|
+-------+--------+------+--------------+------+-------+
only showing top 5 rows



#### Create Pipeline 

In [35]:
va0 = VectorAssembler(inputCols=["rating"], outputCol="features")
va1 = VectorAssembler(inputCols=["rating","year",'Runtime'], outputCol="features")
va2 = VectorAssembler(inputCols=["year", "Runtime"], outputCol="features")
va3 = VectorAssembler(inputCols=["rating", "Runtime"], outputCol="features")
va4 = VectorAssembler(inputCols=["rating", "year"], outputCol="features")

In [36]:
sc = StandardScaler().setInputCol("features").setOutputCol("scaled")
# sc1 = StandardScaler().setInputCol("features").setOutputCol("scaled")
# sc2 = StandardScaler().setInputCol("features").setOutputCol("scaled")
# sc3 = StandardScaler().setInputCol("features").setOutputCol("scaled")
# sc4 = StandardScaler().setInputCol("features").setOutputCol("scaled")

In [37]:
mod = KMeans(featuresCol= 'scaled').setK(2).setSeed(314).setMaxIter(10)

In [38]:
pipeline0 = Pipeline(stages=[va0, sc, mod])
pipeline1 = Pipeline(stages=[va1, sc, mod])
pipeline2 = Pipeline(stages=[va2, sc, mod])
pipeline3 = Pipeline(stages=[va3, sc, mod])
pipeline4 = Pipeline(stages=[va4, sc, mod])

#### Test models using various feature selections to see which one performs the best

In [39]:
model0 = pipeline0.fit(combined_training)
prediction0 = model0.transform(combined_test).select("classification", "prediction")
metrics = MulticlassMetrics(prediction0.rdd.map(lambda x: tuple(map(float, x))))
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Summary Stats
Precision = 0.0
Recall = 0.0
F1 Score = 0.0


In [40]:
model1 = pipeline1.fit(combined_training)
prediction1 = model1.transform(combined_test).select("classification", "prediction")
metrics = MulticlassMetrics(prediction1.rdd.map(lambda x: tuple(map(float, x))))
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Summary Stats
Precision = 0.001748849176834784
Recall = 0.002331814526936478
F1 Score = 0.0019986905131120993


In [41]:
model2 = pipeline2.fit(combined_training)
prediction2 = model2.transform(combined_test).select("classification", "prediction") 
metrics = MulticlassMetrics(prediction2.rdd.map(lambda x: tuple(map(float, x))))
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Summary Stats
Precision = 0.8469857478842945
Recall = 0.5499432240886487
F1 Score = 0.6668829729986389


In [42]:
model3 = pipeline3.fit(combined_training)
prediction3 = model3.transform(combined_test).select("classification", "prediction")
metrics = MulticlassMetrics(prediction3.rdd.map(lambda x: tuple(map(float, x))))
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Summary Stats
Precision = 0.0
Recall = 0.0
F1 Score = 0.0


In [43]:
model4 = pipeline4.fit(combined_training)
prediction4 = model4.transform(combined_test).select("classification", "prediction")
metrics = MulticlassMetrics(prediction4.rdd.map(lambda x: tuple(map(float, x))))
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
f1Score = metrics.fMeasure(1.0)
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Summary Stats
Precision = 0.09158341206504915
Recall = 0.15891729742927901
F1 Score = 0.11620077535196897


#### For Loop to find the best value of k

In [44]:
k_list = [2,5,10,15,20]
precision_list = []
recall_list = []
f1Score_list = []
for k in k_list:
    va5 = VectorAssembler(inputCols=["year", "Runtime"], outputCol="features")
    sc5 = StandardScaler().setInputCol("features").setOutputCol("scaled")
    mod = KMeans(featuresCol= 'scaled').setK(k).setSeed(314).setMaxIter(10)
    pipeline5 = Pipeline(stages=[va5, sc5, mod])
    model5 = pipeline5.fit(combined_training)
    prediction5 = model5.transform(combined_test).select("classification", "prediction") 
    metrics = MulticlassMetrics(prediction5.rdd.map(lambda x: tuple(map(float, x))))
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    precision_list.append(precision)
    recall_list.append(recall)
    f1Score_list.append(f1Score)

#### Create dataframe to show results

In [45]:
results_df = spark.createDataFrame(zip(k_list,precision_list,recall_list,f1Score_list), 
                                   ["k_value", "Precision", "Recall", "F1Score"])
results_df.show()

+-------+--------------------+------------------+-------------------+
|k_value|           Precision|            Recall|            F1Score|
+-------+--------------------+------------------+-------------------+
|      2|  0.8469857478842945|0.5499432240886487| 0.6668829729986389|
|      5| 0.07027559450821155|0.6633776091081593|0.12708799098460477|
|     10| 0.01692564375741251|0.6223207686622321|0.03295499021526419|
|     15|  0.0179307294912256|0.7181964573268921|0.03498793857498676|
|     20|0.047198826059862906| 0.662528216704289|0.08811994520650766|
+-------+--------------------+------------------+-------------------+



After running models based on various combinations of feature columns, we noticed that 'rating' is not a good column to chose. Using rating in a real world recommendation setting would lead to major bias because the thought behind recommending a movie to someone they wouldn't have already seen it and therefore you wouldn't already have a rating associated with it. The model that seemed to perform the best was the model using only 'year' and 'Runtime' as features. We worked off of this model to see which k-value led to the best results and and it appears that the leader is k value of 2. We will dive further into the results and observations in our project write-up.