In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.types import IntegerType
import time

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics, BinaryClassificationMetrics
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [2]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.types import IntegerType

#A class used to preprocess data and to return train/val/test splits
class DataPreprocessor():
    def __init__(self, spark, file_path) -> None:
        self.spark = spark                              #Spark Driver
        self.file_path = file_path                      #File Path to Read in Data


    #Main Method - Call this in partition_data.py to get train/val/test splits returned
    def preprocess(self, sanity_checker=False):
        """
        Goal: Save train/val/test splits to netID/scratch - all using self methods
        Step 1: self.clean_data: clean the data, format timestamp to date, and remove duplicate movie titles
        Step 2: self.create_train_val_test_splits: reformats data, drops nans, and returns train,val and test splits
        input:
        -----
        sanity_checker: boolean - Flag that decides if we call self.sanity_check()
        -----
        output: 
        train: RDD of Training Set Data
        val: RDD of Validation Set Data
        test: RDD of Validation Set Data
        """
        #Format Date Time and Deduplicate Data
        clean_data = self.clean_data()                                                  #No args need to be passed, returns RDD of joined data (movies,ratings), without duplicates
        #Get Utility Matrix
        train, val, test = self.create_train_val_test_splits(clean_data)                #Needs clean_data to run, returns train/val/test splits
        
        #Check if we should perform sanity check
        if sanity_checker:
            flag = self.sanity_check(train,val,test)
            #If flag == True we're good
            if flag:
                print("The val and test splits are disjoint!")
            #Otherwise raise exception
            else:
                raise Exception("The Validation and Test sets are not disjoint!")

        #Return train val test sets
        return train, val, test
    
    #preprocess calls this function
    def clean_data(self):
        """
        goal: for movie titles with multiple movieIDs, in the movies dataset,
        remove the duplicate IDs with the least ratings for each movie. 
        Additionally, remove those IDs from the ratings dataset, so we get a 1:1 mapping
        between movie title and movie ID

        inputs: None, however - self.file_path -> this should link to your hfs/netid/
        outputs: all_data - a RDD of joined data (movies,reviews) - deduplicated of titles that appear more than once
                this loses only 6 records (reviews from users) for small
        """

        #Import the movies data + add to schema so it can be used by SQL + header=True because there's a header
        movies = self.spark.read.csv(self.file_path + 'movies.csv', header=True, \
                                    schema='movieId INT, title STRING, genres STRING')
    
        #Same for ratings - TIMESTAMP MUST BE STRING
        ratings = self.spark.read.csv(self.file_path + 'ratings.csv', header=True, \
                    schema='userId INT, movieId INT, rating FLOAT, timestamp STRING') 
        
        #Get the MM-dd-yyyy format for timestamp values producing new column, Date
        ratings = ratings.withColumn("date",from_unixtime(col("timestamp"),"MM-dd-yyyy"))
        ratings = ratings.drop("timestamp") #Drop timestamp, we now have date

        #Join Dfs - Join Movies with Ratings on movieId, LEFT JOIN used, select only rating, userId, movieId, title and date
        joined = ratings.join(movies, ratings.movieId==movies.movieId, how='left').select(\
                            ratings.rating,ratings.userId,\
                            ratings.movieId,ratings.date,movies.title)

        #Find Movie Titles that map to multiple IDs
        dupes = joined.groupby("title").agg(countDistinct("movieId").alias("countD")).filter(col("countD")>1)

        #Isolate non-dupes into a df
        non_dupes = joined.join(dupes, joined.title==dupes.title, how='leftanti')
    
        #Get all of the dupes data - ratings, userId, ect - again from Joined
        dupes = dupes.join(joined, joined.title==dupes.title, how='inner').select(\
                                        joined.movieId,joined.rating,\
                                        joined.date,dupes.title,joined.userId)
    
        #Clean the dupes accordingly
        #Step 1: Aggregate by title/movie Id, then count userId - give alias
        #Step 2: Create a window to partition by - we iterate over titles ranking by 
        #countD (count distinct of userId) - movieId forces a deterministic ranking based off movieId
        #Step 3: Filter max_dupes so we only grab top ranking movieIds
        windowSpec = Window.partitionBy("title").orderBy("countD","movieId")
        max_dupes = dupes.groupBy(["title","movieId"]).agg(countDistinct("userId").alias("countD"))
        max_dupes = max_dupes.withColumn("dense_rank",dense_rank().over(windowSpec))
        max_dupes = max_dupes.filter(max_dupes.dense_rank=="2")
        max_dupes = max_dupes.drop("countD","dense_rank")
        
        #Get a list of movie ids ~len(5) for small - which are the ones we want to keep
        ids = list(max_dupes.toPandas()['movieId'])
        cleaned_dupes = dupes.where(dupes.movieId.isin(ids))
        
        #Reorder Columns so union works
        cleaned_dupes = cleaned_dupes.select('rating', 'userId', 'movieId', 'date', 'title')

        
        #Get the union of the non_dupes and cleaned_dupes
        clean_data = non_dupes.union(cleaned_dupes)

        #Subtract 2.5 from each review to create negative reviews
        clean_data = clean_data.withColumn("rating",col("rating")-2.5)
        
        #For testing purposes should be 100,830 for small dataset
        #print(f"The length of the combined and de-deduped joined data-set is: {len(clean_data.collect())}")

        #Repartition for efficiency:
        clean_data = clean_data.repartition(10)

        #Return clean_data -> Type: Spark RDD Ready for more computation
        return clean_data

    #Create Train Test Val Splits - .preprocess() calls this function
    def create_train_val_test_splits(self, clean_data):
        """
        Procedure: 
        Create two columns - the first will measure the specific row count for a specific user
        the other will be static fixed at the total number of reviews for that user. The row count
        is sorted by date ascending, so the first row is the oldest review.
        
        Then, subset training to be where row_count <= .6 *length, grabbing the oldest 60% of reviews, for
        all users.
        
        We then subset the remaining data into a hold out, with the goal of creating two disjoint validation
        and test data sets when looking at userId (meaning they should not have any shared userId values), 
        but still have roughly the same amount of data, or whatever percentage we want to achieve
        
        To obtain approximate equality and disjoint userId membership, for the remiaining data
        sort userId by user_review_count descending, then alternate values in that list, assigning
        half to test and half to validation.
        -----
        input: RDD created by joining ratings.csv and movies.csv - cleaned of duplicates and formatted accordingly
        -----
        -----
        output: training 60%, val 20%, test 20% splits with colums cast to integer type and na's dropped
        -----
        """
        #Type Cast the cols to numeric
        ratings = clean_data.withColumn('movieId',col('movieId').cast(IntegerType())).withColumn("userId",col("userId").cast(IntegerType()))
        #Drop nulls
        ratings = ratings.na.drop("any")
    
        #strategy, partition by userId, and userId order by date, 
        #take the first 60% of reviews for all users
        w1 = Window.partitionBy("userId")
        w2 = Window.partitionBy("userId").orderBy("date")
        ratings = (ratings.withColumn("row_num", row_number().over(w2))
                       .withColumn('length', count('userId').over(w1))
                  )

        #store in training RDD by 
        #selecting all rows where the row_count for that user <= 60% total reviews for that user
        
        training = ratings.filter("row_num <=.6*length")
        #now for validation and test set, we want those to have no users in common, but for them to
        #be approximately equal size. 
        holdout_df = ratings.filter("row_num >.6*length")
        
        #strategy, of the data not in my train set, group users by number of movies they have seen
        #sort descending
        holdout_split = holdout_df.groupBy("userId").count().orderBy("count", ascending=False).toPandas()
        
        #store the list of userIds sorted by descending total movie count
        holdout_split = list(holdout_split.userId)
        
        #partition list of userIds by taking every other index and putting it in the validation set
        val_users = holdout_split[::2]
        
        #create a validation and test set by filtering holdout data based on whether movieId isin val_users
        val = holdout_df.filter(holdout_df.userId.isin(val_users))
        test = holdout_df.filter(~holdout_df.userId.isin(val_users))

        #Repartition for efficiency
        training = training.repartition(10)
        val = val.repartition(10)
        test = test.repartition(10)
        #Return train/val/test splits
        return training, val, test

    #TO DO?? Should we enforce min_review cutoff to make sure no cold-start for any prediction?
    def enforce_min_review(self):
        pass

    #Check to train/val/test splits to make sure approx 60/20/20 split is achieved
    def sanity_check(self,train,val,test):
        """
        Method to print out the shape of train/val/test splits, and a check to make sure that
        val and test splits are disjoint (no distinct userId appears in both)
        input:
        -----
        train: RDD - Training data split created from .create_train_val_test_splits
        val: RDD - Validation data split created from .create_train_val_test_splits
        test: RDD - Testing data split created from .create_train_val_test_splits
        -----
        output:
        -----
        returnFlag: boolean - True means test and val splits are disjoint on userId
        """

        #Get observatio counts for training, val, and test sets
        training_obs = train.count()
        val_obs = val.count()
        test_obs = test.count()

        #Print them out
        print(f"Training Data Len: {training_obs} Val Len: {val_obs}, Test Len: {test_obs}")
        print(f"Partitions, Train: {train.rdd.getNumPartitions()}, Val: {val.rdd.getNumPartitions()}, Test: {test.rdd.getNumPartitions()}")
        #Check if there are any overlapping_ids in the sets
        overllaping_ids = val.join(test, test.userId==val.userId,how='inner').count()
        
        #Return True if they're disjoint, False if there's overlap
        return overllaping_ids == 0


In [13]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
import time
from datetime import datetime
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import Row
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.window import Window


class Model():
    """
    Abstract Model class that will contain various methods to deploy collaborative filtering.
    Model Parameters that need to be passed thorugh:
    ### For ALS Model ###
    -----
    rank: int - Rank of latent factors used in decomposition
    maxIter: int - represents number of iterations to run algorithm
    regParam: float - Regularization Parameter
    model_save: boolean - Flag to determine if we should save the model progress or not
    -----
    ### For baseline Model ###
    -----
    min_ratings: int - Minimum number of reviews to qualify for baseline (Greater Than or Equal to be included)
    -----
    ### No Input Necessary ###
    -----
    model_size: str - Either "large" or "small" used to demarcate which dataset we are running on
    model_type: str - Which model type we intent to run, i.e. ALS or baseline
    evaluation_data_name: str - Dummy variable used to keep track of which dataset we are making predictions on, either "Val" or "Test"
    time_when_ran: datetime - Time when model was run
    time_to_fit: datetime - Time it took to fit the model
    time_to_predict: datetime - Time it took to make predictions
    metrics: dict - Dictionary used to store the various metrics calculated in self.record_metrics()
    -----
    ### Misc ###
    -----
    num_recs: int - Top X number of reccomendations to return - default set to 100
    -----
    ### Model Methods ###
    -----
    run_model: Runs the corresponding method that was passed to self.model_type
    alternatingLeastSquares: Latent Factor model which uses the Alternating Least Squares Pyspark Class to fit and predict.
    baseline: uses a baseline popularity model that returns the top X most popular movies (decided by avg rating per movie)
    record_metrics: Calculates metrics for prediction,label pairs
    save_model: Used for advanced models like ALS or extensions where we may want to save the model itself
    -----
    """
    
    #changed default min ratings to 0 from none, otherwise we get an error with None

    # Constructor for Model
    def __init__(self, model_size=None, model_type=None, rank=None, maxIter=None, regParam=None, seed=10, nonnegative=True,
                 model_save=False, num_recs=100, min_ratings=0):
        # Model Attributes
        # Dictionary to access variable methods
        self.methods = {"als": self.alternatingLeastSquares,
                        "baseline": self.baseline}
        # Top X number of reccomendations to return - set to 100, probably won't change
        self.num_recs = num_recs

        # Passed through by user
        self.model_size = model_size
        self.model_type = model_type

        # For ALS
        self.rank = rank  # Rank of latent factors used in decomposition
        self.maxIter = maxIter  # Number of iterations to run algorithm, recommended 5-20
        self.regParam = regParam  # Regularization Parameter
        # Flag used to determine whether or not we should save our model somewhere
        self.model_save = model_save

        # For baseline
        # Minimum number of reviews to qualify for baseline (Greater Than or Equal to be included)
        self.min_ratings = min_ratings

        # Add the attributes we're gonna compute when we fit and predict
        self.evaluation_data_name = None
        self.time_when_ran = None
        self.time_to_fit = None
        self.time_to_predict = None
        self.metrics = {}

    def run_model(self, train, val=None, test=None):
        """
        Run_model is what is called to fit, run, and record the metrics for respective model types.
        Function behavior is dependent on the argument passed to self.model_type.
        -----
        inputs:
        -----
        train: RDD - Training data set
        val: RDD - Validation data set
        test: RDD - Test set
        -----
        outputs:
        -----
        model_output: Variable Type - Output of whichever model ran -> check self.model_type
        -----
        """
        # Get when model was ran
        self.time_when_ran = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

        # Identify if we're predicting on the Validation Set or the Test Set
        if val:
            self.evaluation_data_name = "Val"
            evaluation_data = val
        elif test:
            self.evaluation_data_name = "Test"
            evaluation_data = test

        # Grab method for whichever model corresponds to self.model_type
        model = self.methods[self.model_type]
        # Run model on training / evaluation data
        model_output = model(train, evaluation_data)
        # Return model output
        return model_output

    # This method uses the Alternating Least Squares Pyspark Class to fit and run a model
    def alternatingLeastSquares(self, training, evaluation_data):
        """
        Builds and fits a PySpark alternatingLeastSquares latent factor model. Calls self.record_metrics(precitions,labels)
        to record the results. Some dummy variables are made to record whether or not we are using the validation set
        or the testing set. This will help us record our results accurately. Training and predicting are also timed. 
        -----
        Input: 
        training: RDD - Training data set
        evaluation_data: RDD - Either Validation data set, or Training data set
        -----
        Output: [userRecs, movieRecs] - list containing two lists, each of length == self.numrecs 
        -----
        """

        # Time the function start to finish
        start = time.time()
        # Create the model with certain params - coldStartStrategy="drop" means that we'll have no nulls in val / test set
        als = ALS(maxIter=self.maxIter, rank=self.rank, regParam=self.regParam,
                  nonnegative=False, seed=10, userCol="userId",
                  itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

        # Fit the model
        model = als.fit(training)
        # End time and calculate delta
        end = time.time()
        self.time_to_fit = end - start

        # Time predictions as well
        start = time.time()
        # Create predictions, matrix with additional column of prediction
        predictions = model.transform(evaluation_data)
        end = time.time()
        self.time_to_predict = end - start
        

        # Generate top 10 movie recommendations for each user
        userRecs = model.recommendForAllUsers(self.num_recs)
        # Generate top 10 user recommendations for each movie
        movieRecs = model.recommendForAllItems(self.num_recs)
        
        #Add RMSE and stuff here 
        
        # Use self.record_metrics to evaluate model on RMSE, R^2, Precision at K, Mean Precision, and NDGC
        return self.record_metrics(predictions=userRecs, labels=evaluation_data)
        
        ## FIX THIS IN A BIT ##
        ##Evaluate Predictions for Regression Task##
        evaluator = RegressionEvaluator(
            labelCol="rating", predictionCol="prediction")
        # Calculate RMSE and r_2 metrics and append to metrics
        self.metrics["rmse"] = evaluator.evaluate(
            predictions, {evaluator.metricName: "rmse"})
        self.metrics["r2"] = evaluator.evaluate(
            predictions, {evaluator.metricName: "r2"})

        ##ROC Metric Evaluation##
        # For ROC Binary Classification
        # Make predictions Binary
        binary_predicts = predictions.withColumn("prediction", when(
            predictions.rating > 0, 1).otherwise(0).cast("double"))
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol='prediction', labelCol='rating', metricName='areaUnderROC')
        # Append ROC to our Metrics list
        self.metrics["ROC"] = evaluator.evaluate(binary_predicts)

        # Save model if we need to
        if self.model_save:
            self.save_model(model_type=self.model_type, model=als)

        # Return top self.num_recs movie recs for each user, top self.num_recs user recs for each movie
        return [userRecs, movieRecs]

    # Baseline model that returns top X most popular items (highest avg rating)
    def baseline(self, training, evaluation_data):
        """
        Baseline model for recommendation system. No personalization, just recommend the Top 100 movies by avg(rating)
        A movie must have at least self.min_ratings to be considered
        input:
        -----
        training: RDD - training set data
        evaluation_data: RDD - Validation set or Test set data
        self.min_ratings: int - how many ratings a movie must have in order to be considered in top 100
        -----
        output: RDD of Top 100 movieIds by avg(rating)
        """
        # Time model Fit
        start = time.time()
        # Get Top 100 Most Popular Movies - Avg(rating) becomes prediction
        top_100_movies = training.groupBy("movieId").agg(avg("rating").alias("prediction"),
                                                         count("movieId").alias("movie_count")).where(f"movie_count>={self.min_ratings}").\
            orderBy("prediction", ascending=False).limit(100)
        # Grab Distinct User Ids
        ids = evaluation_data.select("userId").distinct()
        # Cross Join Distinct userIds with Top 100 Most Popular Movies
        predictions = ids.crossJoin(top_100_movies)
        # Record end time after RDD operations
        end = time.time()
        self.time_to_fit = end - start

        # Time predictions as well
        self.time_to_predict = 0  # Recommends in constant time

        # Use self.record_metrics to evaluate model on RMSE, R^2, Precision at K, Mean Precision, and NDGC
        self.record_metrics(predictions=predictions, labels=evaluation_data, baseline=True)

        # Return The top 100 most popular movies above self.min_ratings threshold
        return top_100_movies

    def record_metrics(self, predictions, labels, baseline=False):
        """
        Method that will contain all the code to evaluate model on metrics: RMSE, R^2, ROC, Precistion At K, Mean Precision, and NDGC
        input:
        -----
        predictions: RDD - PySpark Dataframe containing the following columns at the minimum: [userId,movieId,prediction] - if not baseline model must include rating column
        labels: RDD - PySpark Dataframe containing the following columns at the minimum: [userId,movieId,rating, date]
        Baseline - a boolean indicator denoting whether we are using a baseline model or non-baseline model
        -----
        returns: 
        None - Writes the results to self.metrics dictionary
        """
        self.review_threshold = 0


        
        if baseline:
           
            predictions = predictions.select("userId","movieId")


            #jonah and joby's code here



            # Join predictions and labels, then filter to the 
            ##fix this
            denominator_p2 = labels.join(predictions, ['userId', 'movieId'], how ='inner').select('userId', 'movieId', "rating")

            numerator_p2 = denominator_p2.where(f"rating>{self.review_threshold}"\
                                               ).groupBy('userId').agg(expr('collect_list(movieId) as movieId'))

            denominator_p2 = denominator_p2.groupBy('userId').agg(expr('collect_list(movieId) as movieId'))

            ##Evalaute Predictions for Ranking Tests##

            predictions = predictions.groupBy('userId').agg(expr('collect_list(movieId) as movieId'))


           #Grab ALL (NO FILTERS YET) Validation Data userId, movieIdLists 
            all_labels = labels.select('userId', 'movieId') \
                .groupBy('userId').agg(expr('collect_list(movieId) as movieId'))

            # Only select movies that users have seen in validation data that they rated positively
            pos_labels = labels.select('userId', 'movieId', "rating").where(f"rating>{self.review_threshold}") \
                .groupBy('userId').agg(expr('collect_list(movieId) as movieId'))

            #all
            labels_all_and_predictions = predictions.join(
                all_labels, 'userId').rdd.map(lambda row: (row[1], row[2]))

            # Subset of Labels Intersection With Reccomendation
            labels_subset_and_predictions = predictions.join(
                pos_labels, 'userId').rdd.map(lambda row: (row[1], row[2]))

            #new code 
            labels_subset_and_intersections = denominator_p2.join(
                numerator_p2, 'userId').rdd.map(lambda row: (row[1], row[2]))
            #Joby and Jonah's code
             
           #should we change the return statement? Keeping it as is
           #return labels_all_and_predictions, predictions
            rankingMetrics_all = RankingMetrics(labels_all_and_predictions)
            rankingMetrics_subset = RankingMetrics(labels_subset_and_predictions)
            rankingMetrics_intersection = RankingMetrics(labels_subset_and_intersections)
           
            self.metrics["MAP - All"] = rankingMetrics_all.meanAveragePrecision
            self.metrics["MAP - Subset"] = rankingMetrics_subset.meanAveragePrecision

            self.metrics[f"meanAveragePrecisionAt{self.num_recs}All"] = rankingMetrics_all.precisionAt(
                self.num_recs)
            self.metrics[f"ndcgAt100-All"] = rankingMetrics_all.ndcgAt(self.num_recs)
        
            self.metrics[f"meanAveragePrecisionAt{self.num_recs}subset"] = rankingMetrics_subset.precisionAt(self.num_recs)
            self.metrics[f"ndcgAt100-subset"] = rankingMetrics_subset.ndcgAt(self.num_recs)
            
            self.metrics['MAP - Intersection'] = rankingMetrics_intersection.meanAveragePrecision
            
            #recallatK() does divide 
            self.metrics['Precision - Intersection'] = rankingMetrics_intersection.recallAt(self.num_recs)



            
        else:
            predictions = predictions.select("userId","recommendations.movieId")


            
    # Method to save model to const.MODEL_SAVE_FILE_PATH
    def save_model(self, model_type=None, model=None):
        """
        Inputs:
        -----
        model_type: str - string designating what type of model is being saved
        model: obj - model object that has .save method
        -----
        """
        # Make sure a non-null object was passed
        if model and model_type:
            model.save(const.MODEL_SAVE_FILE_PATH + model_type)
        # Otherwise throw error
        else:
            raise Exception("Model and or Model_type not passed through")

In [14]:
folder_path = "../../ml-latest-small/"
spark = SparkSession.builder.appName('Spark_Session_Name').getOrCreate()

In [15]:
train, val, test = DataPreprocessor(spark,folder_path).preprocess()


                                                                                

In [16]:
m = Model(rank=5, maxIter=5, regParam=0.01, min_ratings = 5)

In [17]:
preds = m.baseline(train,val)


                                                                                

In [18]:
vars(m)

{'methods': {'als': <bound method Model.alternatingLeastSquares of <__main__.Model object at 0x7fc0a9345130>>,
  'baseline': <bound method Model.baseline of <__main__.Model object at 0x7fc0a9345130>>},
 'num_recs': 100,
 'model_size': None,
 'model_type': None,
 'rank': 5,
 'maxIter': 5,
 'regParam': 0.01,
 'model_save': False,
 'min_ratings': 5,
 'evaluation_data_name': None,
 'time_when_ran': None,
 'time_to_fit': 0.1187889575958252,
 'time_to_predict': 0,
 'metrics': {'MAP - All': 0.002697937367692215,
  'MAP - Subset': 0.002948111848836311,
  'meanAveragePrecisionAt100All': 0.028754098360655747,
  'ndcgAt100-All': 0.04012963777810449,
  'meanAveragePrecisionAt100subset': 0.02769736842105263,
  'ndcgAt100-subset': 0.0418686264274653,
  'MAP - Intersection': 0.5351753445126602,
  'Precision - Intersection': 0.6166173512275188},
 'review_threshold': 0}

22/04/27 02:42:16 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 317351 ms exceeds timeout 120000 ms
22/04/27 02:42:16 WARN SparkContext: Killing executors is not supported by current scheduler.


In [10]:
vars(m)

{'methods': {'als': <bound method Model.alternatingLeastSquares of <__main__.Model object at 0x7fc080e9cd90>>,
  'baseline': <bound method Model.baseline of <__main__.Model object at 0x7fc080e9cd90>>},
 'num_recs': 100,
 'model_size': None,
 'model_type': None,
 'rank': 5,
 'maxIter': 5,
 'regParam': 0.01,
 'model_save': False,
 'min_ratings': 5,
 'evaluation_data_name': None,
 'time_when_ran': None,
 'time_to_fit': 0.12012410163879395,
 'time_to_predict': 0,
 'metrics': {'MAP - All': 0.002697937367692216,
  'MAP - Subset': 0.0029481118488363088,
  'meanAveragePrecisionAt100All': 0.02875409836065575,
  'ndcgAt100-All': 0.04012963777810446,
  'meanAveragePrecisionAt100subset': 0.027697368421052634,
  'ndcgAt100-subset': 0.0418686264274653,
  'MAP - Intersection': 0.5351753445126601,
  'Precision - Intersection': 0.6166173512275183},
 'review_threshold': 0}

In [11]:
preds.show()

[Stage 258:>                                                      (0 + 10) / 10]

+-------+------------------+-----------+
|movieId|        prediction|movie_count|
+-------+------------------+-----------+
|  92535|               2.4|          5|
|   1041|2.2142857142857144|          7|
| 187593|               2.2|          5|
|   2239|               2.1|          5|
|   1178|2.0833333333333335|          6|
|   1217|2.0714285714285716|          7|
|   3435|              2.05|         10|
|    306|              2.05|         10|
|   1299|2.0416666666666665|         12|
|   3260|               2.0|          5|
|   7981|               2.0|          6|
|    906|               2.0|          5|
|  98491|               2.0|          6|
| 104879|               2.0|          7|
|   2732|               2.0|          5|
|   3451|               2.0|          5|
|    922|1.9333333333333333|         15|
|  86345|1.9285714285714286|          7|
|   1104|1.9285714285714286|         14|
|   2360|1.9285714285714286|          7|
+-------+------------------+-----------+
only showing top

                                                                                

In [12]:
combined.record_metrics(preds, val)

NameError: name 'combined' is not defined

In [None]:
data = [([1,2,3],[1,2,3,4])]
rdd = spark.sparkContext.parallelize(data)
metrics = RankingMetrics(rdd)

metrics.recallAt(100)