In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import json
import gzip
import re
import datetime as dt
import sys
import pdb
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
class movie():
    
    def __init__(self, read_train_test = True):
        self.processed_movie = pd.read_parquet('../processed_data.parquet')
        self.data_rating = pd.read_csv('../ratings_small.csv')
        self.rating_data_pre_processing()
        if not read_train_test:
            self.split_train_test_data()
        else:
            self.x_train = pd.read_csv('../train.csv')
            self.x_val = pd.read_csv('../valid.csv')
            self.x_test = pd.read_csv('../test.csv')
        
        self.moviesPerUsers = self.x_train.groupby("userId")["movieId"].apply(set).to_dict()
        self.usersPerMovies = self.x_train.groupby("movieId")["userId"].apply(set).to_dict()
    
    def rating_data_pre_processing(self):
        def check_movieId_exist(r):
            if r in self.processed_movie['movieId']:
                return True
            else:
                return False
        self.data_rating['exist'] = self.data_rating['movieId'].apply(check_movieId_exist)
        idx = self.data_rating[self.data_rating['exist'] == False].index
        self.data_rating.drop(idx, inplace=True)
        self.data_rating.drop(columns=['exist'], inplace=True)
        
    
    def split_train_test_data(self, method='random'):
        
        data = self.data_rating.copy()
        if method == 'time':
        
            data.sort_values(by='timestamp', inplace=True)
            n = int(data.shape[0]*0.8)
            self.x_train = data[:n]
            x_tmp = data[n:]
            self.x_val, self.x_test = train_test_split(x_tmp, test_size=0.5, random_state=50)
        elif method == 'random':
            self.x_train, x_tmp = train_test_split(data, test_size=0.2, random_state=50)
            self.x_val, self.x_test = train_test_split(x_tmp, test_size=0.5, random_state=50)
    
    def popular_movie(self, base='count'):
        '''
        base = 
            'count': movie seem by training data user
            'vote': base by vote_average
            'combine_rate': base by vote_average and rating
        '''
        assert isinstance(base, str), "base should be one of [count, vote or combine_rate] in str"
        assert base in ['count', 'vote', 'combine_rate'], "base should be one of [count, vote or combine_rate]"
        if base == 'count':
            movie_count = self.x_train.groupby('movieId').size().astype(int).to_frame(name='count').sort_values(by='count', ascending=False)
            popular_movie = np.array((movie_count.index))
            return popular_movie
        
        if base =='vote':
            popular_movie = self.processed_movie[['id', 'vote_count', 'vote_average']].copy()
            vote_average = popular_movie['vote_average'].mean()
            popular_movie['baseline_score'] = popular_movie.apply(self.get_popular_score_vote, axis=1, m=100, c = vote_average)
            popular_movie = popular_movie.sort_values(by='baseline_score', ascending=False)
            return popular_movie['id'].astype(int).values
        if base == 'combine_rate':
            movie_count = self.x_train.groupby("movieId")["rating"].agg(['mean', 'count']).rename(columns={"mean":"rating"})
            movie_count['id'] = movie_count.index.astype(str)
            rate_average = movie_count['rating'].mean()
            movie_count['rate_score'] = movie_count.apply(self.get_popular_score_rating, axis=1, m=100, c = rate_average)
            
            popular_movie = self.processed_movie[['movieId', 'vote_count', 'vote_average']].copy()
            popular_movie.rename(columns={'movieId':'id'}, inplace=True)
            popular_movie['id'] = popular_movie['id'].astype(str)
            vote_average = popular_movie['vote_average'].mean()
            popular_movie['vote_score'] = popular_movie.apply(self.get_popular_score_vote, axis=1, m=100, c = vote_average)
            
            popular_movie_combine = pd.merge(popular_movie[['id','vote_score', 'vote_count']], movie_count[['id', 'rate_score', 'count']], on='id', how='outer')
            popular_movie_combine.dropna(inplace=True)
            popular_movie_combine['vote_score'] = popular_movie_combine['vote_score'].div(2)
            popular_movie_combine['score'] = popular_movie_combine.apply(self.count_score, axis = 1)
            popular_movie_combine.sort_values(by='score', ascending=False, inplace=True)
            
            
            return popular_movie_combine['id'].astype(int).values
    
    @staticmethod
    def get_popular_score_vote(x, m, c):
        v = x['vote_count']
        R = x['vote_average']
        return ((v/(v+m)) * R) + ((m/(m+v)) * c)
    @staticmethod
    def get_popular_score_rating(x, m, c):
        r = x['count']
        R = x['rating']
        return ((r/(r+m)) * R) + ((m/(m+r)) * c)
    def count_recall_at(self, user, prediction, k):
        top_partition = prediction[:k]
        watched = self.moviesPerUsers.get(user)
        if watched != None:
            return sum([m in watched for m in top_partition])/k
        else:
            return 0
    def evaluate_model(self, pred, n=5):
        test_set = self.x_test.copy()
        test_set['recall@'+str(n)] = test_set['userId'].apply(self.count_recall_at, prediction=pred, k=n)
        eval_res = test_set['recall@'+str(n)].mean()
        return eval_res
    @staticmethod
    def count_score(x):
        r = x['rate_score']
        rc = x['count']
        vc = x['vote_count']
        v = x['vote_score']
        score = r*(rc/(rc+vc)) + v*(vc/(rc+vc))
        return score
    
m1 = movie(read_train_test = True)
pred1 = m1.popular_movie(base='vote')
print(m1.evaluate_model(pred1, n=5))
pred2 = m1.popular_movie(base='count')
print(m1.evaluate_model(pred2, n=5))
pred3 = m1.popular_movie(base='combine_rate')
print(m1.evaluate_model(pred3, n=5))

0.028454545454545007
0.5417727272727381
0.4960909090909048


### split train, test data

In [3]:
m2 = movie(read_train_test = False)
#m2.x_train
m2.x_train.reset_index(drop=True).to_csv('train.csv', index=False)
m2.x_val.reset_index(drop=True).to_csv('valid.csv', index=False)
m2.x_test.reset_index(drop=True).to_csv('test.csv', index=False)