In [1]:
import time
start = time.time()

import sys
import json
import numpy as np
import pandas as pd
from math import log
from dateutil.parser import parse

### Data import

In [2]:
def read_content(path):
    """Load the item-metadata file into a DataFrame with one row per item.

    Every line after the header is expected to look like
    ``<itemId>,{<JSON object>}``. The seventeen metadata fields listed in
    ``fields`` are copied out of the JSON object; a field missing from the
    object becomes an empty string. Spaces are removed from the fields named
    in ``compact_fields``; the remaining fields are stored verbatim.

    Parameters
    ----------
    path : str
        Location of the content file.

    Returns
    -------
    pandas.DataFrame
        Column ``itemId`` followed by the seventeen field names, on a default
        integer index. If an item id occurs more than once, the last
        occurrence wins. A file with no data lines yields an empty frame that
        still has all eighteen columns.

    Raises
    ------
    ValueError
        If a non-blank line lacks the ``,{`` separator, or (raised by
        ``json.loads``) if its payload is not valid JSON.
    """
    fields = ['Year', 'Title', 'Rated', 'Released', 'Director', 'Genre', 'Runtime', 'Writer', 'Actors',
              'Language', 'Awards', 'Poster', 'Metascore', 'Country', 'imdbRating', 'Type', 'Plot']
    # Fields whose embedded spaces are stripped before storing.
    compact_fields = {'Year', 'Rated', 'Released', 'Director', 'Runtime', 'Writer', 'Actors',
                      'Language', 'Awards', 'Poster', 'Metascore'}

    rows = {}
    with open(path) as f:
        next(f, None)  # the first line is the header
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split on the FIRST ',{' only: the JSON payload may itself contain ',{'.
            itemId, separator, payload = line.partition(',{')
            if not separator:
                raise ValueError('malformed content line: {!r}'.format(line))
            data = json.loads('{' + payload)

            row = {'itemId': itemId}
            for field in fields:
                value = data.get(field, '')
                if field in compact_fields:
                    value = value.replace(' ', '')
                row[field] = value
            rows[itemId] = row

    # Building the frame from records avoids joining the fields with ';;' and
    # splitting them again, which broke whenever a field contained ';;'.
    return pd.DataFrame(list(rows.values()), columns=['itemId'] + fields)

In [3]:
# Load the known user ratings and the (user, item) pairs that need a prediction;
# both are plain CSV files.
ratings = pd.read_csv('./data/ratings.csv')
targets = pd.read_csv('./data/targets.csv')
# content.csv stores one JSON object per item, so it goes through read_content
# instead of pd.read_csv.
content = read_content('./data/content.csv')

In [4]:
# Break the combined "UserId:ItemId" key into its two parts.
sep = ratings['UserId:ItemId'].str.split(':', expand=True)
sep.columns = ['UserId', 'ItemId']
# Replace the combined key with the two new columns (appended on the right).
ratings = pd.concat([ratings.drop('UserId:ItemId', axis=1), sep], axis=1)

In [5]:
# Break the combined "UserId:ItemId" key into its two parts.
sep = targets['UserId:ItemId'].str.split(':', expand=True)
sep.columns = ['UserId', 'ItemId']
# Replace the combined key with the two new columns (appended on the right).
targets = pd.concat([targets.drop('UserId:ItemId', axis=1), sep], axis=1)

### Feature extraction

In [6]:
def _parse_release_date(value):
    """Return ``(year, month, day)`` for a release date; all zeros when the value is the missing marker 0."""
    text = str(value)
    if text == '0':
        return 0, 0, 0
    parsed = parse(text)
    return parsed.year, parsed.month, parsed.day


def _parse_runtime(value):
    """Return ``(hours, minutes)`` for runtimes such as '142min', '1h30min' or '2h'; zeros when missing."""
    text = str(value).strip()
    if text == '0':
        return 0, 0
    hours_text, hour_mark, minutes_text = text.partition('h')
    if not hour_mark:
        # No hour component: the whole string is a minute count.
        minutes_text = text
    minutes_text = minutes_text.strip()
    if minutes_text.endswith('min'):
        minutes_text = minutes_text[:-3]
    minutes = int(minutes_text) if minutes_text.strip() else 0
    if hour_mark:
        return int(hours_text), minutes
    # Integer arithmetic keeps the hour an int (true division produced floats).
    return divmod(minutes, 60)


def _encode_categories(series, codes):
    """Map the exact labels in ``codes`` to their integer code and every other value to 0."""
    return pd.to_numeric(series.map(lambda label: codes.get(label, 0)), downcast='integer')


def feature_extraction(content):
    """
    FEATURE ENGINEER PROCESS

    promotes an extraction of features among those available in the dataset 'content.csv'

    Parameters
    ----------
    content : pandas.DataFrame
        Frame with (at least) the columns produced by ``read_content``.
        It is not modified; the work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        ``itemId``, the numeric encodings of Year / Language / Metascore /
        Country / imdbRating / Type, one indicator column per ``Rated`` value,
        ``Released_year|month|day``, ``Runtime_min``, ``Runtime_hour`` and the
        text column ``combinedFeatures`` (Genre + ' ' + Actors). The raw text
        columns are dropped.

    Notes
    -----
    Missing values ('N/A' or '') are replaced by 0 before any encoding, so a
    missing value shows up as 0 (or the string '0' inside ``combinedFeatures``).
    All per-row work is positional, so the frame does not need a 0..n-1 index.
    """
    # DataFrame.replace returns a new frame, so the caller's data stays untouched.
    content = content.replace(['N/A', ''], 0)

    # -- Year
    content['Year'] = pd.to_numeric(content['Year'], downcast='integer')

    # -- Rated
    rated = pd.get_dummies(content['Rated'], prefix='Rated')
    content = pd.concat([content, rated], axis=1)

    # -- Released
    release_parts = [_parse_release_date(value) for value in content['Released']]
    content['Released_year'] = [part[0] for part in release_parts]
    content['Released_month'] = [part[1] for part in release_parts]
    content['Released_day'] = [part[2] for part in release_parts]

    # -- Runtime
    runtime_parts = [_parse_runtime(value) for value in content['Runtime']]
    content['Runtime_min'] = [part[1] for part in runtime_parts]
    content['Runtime_hour'] = [part[0] for part in runtime_parts]

    # -- Languages (only an exact single-language match gets a non-zero code)
    content['Language'] = _encode_categories(
        content['Language'], {'English': 1, 'French': 2, 'Japanese': 3, 'Spanish': 4})

    # -- Metascore
    content['Metascore'] = pd.to_numeric(content['Metascore'], downcast='integer')

    # -- Country (only an exact single-country match gets a non-zero code)
    content['Country'] = _encode_categories(
        content['Country'], {'USA': 1, 'UK': 2, 'India': 3, 'Japan': 4})

    # -- imdbRating
    content['imdbRating'] = pd.to_numeric(content['imdbRating'], downcast='integer')

    # -- Type (values other than these three are left unchanged)
    content['Type'] = content['Type'].replace({'movie': 1, 'episode': 2, 'series': 3})

    # --  Combining remaining categorical features
    content['combinedFeatures'] = content['Genre'].astype(str) + ' ' + content['Actors'].astype(str)

    # -- Dropping columns
    content = content.drop(['Rated', 'Released', 'Runtime', 'Awards', 'Poster', 'Title',
                            'Director', 'Genre', 'Writer', 'Actors', 'Plot'], axis=1)

    return content

# Rebind `content` to the engineered feature frame. Re-running this line on its
# own fails, because feature_extraction reads raw columns (e.g. 'Rated') that it
# drops from its result; reload `content` first.
content = feature_extraction(content)

### TF-IDF

In [7]:
class TfIdf:
    """Hand-rolled TF-IDF over the ``combinedFeatures`` text of every item.

    Each row is lower-cased, split on whitespace, and stripped of the
    characters ``, . ( )``. Term frequency is the count divided by the row's
    token count; inverse document frequency is ``log(N / df)``.

    After construction:

    * ``tfidfVector`` -- one list of floats per row, aligned with the first
      ``lenght`` words of the alphabetically sorted vocabulary;
    * ``idfDict`` / ``data_len`` -- the fitted IDF table and the number of
      rows, kept so the ``compute*`` methods remain usable afterwards.
    """

    def __init__(self, content, lenght):
        """Fit TF-IDF on ``content['combinedFeatures']``.

        content -- any mapping/DataFrame whose 'combinedFeatures' entry
                   iterates over strings.
        lenght  -- number of vocabulary words (in sorted order) kept in each
                   vector; the misspelled name is part of the public
                   interface and therefore left as is.
        """
        data = [[self._clean(word) for word in row.lower().split()]
                for row in content['combinedFeatures']]
        self.data_len = len(data)

        # Per-row term frequencies.
        tfDict = [self.computeReviewTFDict(row) for row in data]

        # Number of rows in which each word appears.
        countDict = self.computeCountDict(tfDict)

        # Stores the idf dictionary
        self.idfDict = self.computeIDFDict(countDict)

        # Stores the TF-IDF dictionaries
        tfidfDict = [self.computeReviewTFIDFDict(review) for review in tfDict]

        # Truncate the vocabulary once instead of building a full-vocabulary
        # vector per row and slicing it afterwards; the result is identical.
        wordDict = sorted(countDict.keys())[:lenght]
        self.tfidfVector = [self.computeTFIDFVector(review, wordDict) for review in tfidfDict]

    @staticmethod
    def _clean(word):
        """Remove the punctuation characters ``, . ( )`` from a token."""
        return word.replace(',', '').replace('.', '').replace('(', '').replace(')', '')

    def computeReviewTFDict(self, review):
        """ Returns a tf dictionary for each review whose keys are all
        the unique words in the review and whose values are their
        corresponding tf.
        """
        # Counts the number of times the word appears in review
        reviewTFDict = {}
        for word in review:
            reviewTFDict[word] = reviewTFDict.get(word, 0) + 1
        # Computes tf for each word
        total = len(review)
        for word in reviewTFDict:
            reviewTFDict[word] = reviewTFDict[word] / total
        return reviewTFDict

    def computeCountDict(self, tfDict):
        """ Returns a dictionary whose keys are all the unique words in
        the dataset and whose values count the number of reviews in which
        the word appears.
        """
        countDict = {}
        # Each review's tf dictionary holds a word at most once, so one
        # increment per (review, word) pair yields the document frequency.
        for review in tfDict:
            for word in review:
                countDict[word] = countDict.get(word, 0) + 1
        return countDict

    def computeIDFDict(self, countDict):
        """ Returns a dictionary whose keys are all the unique words in the
        dataset and whose values are their corresponding idf.
        """
        return {word: log(self.data_len / count) for word, count in countDict.items()}

    def computeReviewTFIDFDict(self, reviewTFDict):
        """ Returns a dictionary whose keys are all the unique words in the
        review and whose values are their corresponding tfidf.
        """
        # For each word in the review, we multiply its tf and its idf.
        return {word: tf * self.idfDict[word] for word, tf in reviewTFDict.items()}

    def computeTFIDFVector(self, review, wordDict):
        """Returns the review's TF-IDF values ordered like ``wordDict``;
        words absent from the review contribute 0.0.
        """
        return [review.get(word, 0.0) for word in wordDict]

    def get_TFIDFVector(self):
        """Returns the list of per-row TF-IDF vectors built by ``__init__``."""
        return self.tfidfVector

In [8]:
# One TF-IDF vector per item, built from `combinedFeatures`; each vector keeps
# at most the first 2000 words of the alphabetically sorted vocabulary.
tfidfVector = TfIdf(content, lenght=2000).get_TFIDFVector()

In [9]:
# Use the item id as the row label and discard the text column that TF-IDF has
# already consumed, leaving only the engineered feature columns.
content = content.set_index('itemId').drop('combinedFeatures', axis=1)

In [10]:
# Put the engineered feature columns in front of each item's TF-IDF values.
# This rebinds `tfidfVector` from a list of lists to a NumPy array, so
# re-running the line without rebuilding the TF-IDF vectors would prepend the
# feature columns a second time.
tfidfVector = np.concatenate((content.values,tfidfVector), axis=1)

### Users/items vectors

In [11]:
# Preview the first rows of `ratings` after the combined key was split into
# separate UserId / ItemId columns.
ratings.head()

Unnamed: 0,Prediction,Timestamp,UserId,ItemId
0,6,1362062307,u0026762,i2171847
1,8,1362062624,u0026502,i0444778
2,6,1362062838,u0004598,i1411238
3,7,1362063503,u0031317,i1496422
4,5,1362063653,u0024257,i0118799


In [12]:
# dict of items vectors: item id -> its row of the combined feature matrix
itemsDict = dict(zip(content.index, tfidfVector))

# dict of items classified by each user: user id -> list of [item id, rating]
userItems = dict()
# (user, item) pairs already stored. Testing `item not in userItems[user]`
# cannot detect repeats because that list holds [item, rating] pairs, so a bare
# item id never compares equal to any element.
ratedPairs = set()
for userID, itemID, rating in zip(ratings['UserId'], ratings['ItemId'], ratings['Prediction']):
    if (userID, itemID) in ratedPairs:
        continue  # keep only the user's first rating of an item
    ratedPairs.add((userID, itemID))
    userItems.setdefault(userID, []).append([itemID, rating])

### Making predictions

In [13]:
class CosineSimilarityPredict:
    """
    Prediction of item scores from user id and item

    The score of a target item is the similarity-weighted mean of the ratings
    the user gave to other items, where similarity is the cosine between the
    item feature vectors. ``DEFAULT_RATING`` is returned whenever no such mean
    can be computed.
    """

    # Fallback score for unknown users/items and for all-zero similarity sums.
    DEFAULT_RATING = 7

    def __init__(self, itemsDict, userItems=None):
        """
        itemsDict -- mapping item id -> 1-D feature vector.
        userItems -- optional mapping user id -> list of [item id, rating].
                     When omitted, ``predict`` reads the module-level
                     ``userItems`` at call time, so existing
                     ``CosineSimilarityPredict(itemsDict)`` calls keep working.
        """
        self.itemsDict = itemsDict
        self.userItems = userItems
        # Cache of computed similarities, keyed by (target id, rated id).
        # Tuples cannot collide the way concatenated id strings can.
        self.similarities = dict()
        self.norms = {key: np.linalg.norm(vector) for key, vector in itemsDict.items()}

    def _similarity(self, itemID, otherID):
        """Cached cosine similarity of two items; 0.0 when either vector has zero norm."""
        key = (itemID, otherID)
        if key not in self.similarities:
            denominator = self.norms[itemID] * self.norms[otherID]
            if denominator:
                value = np.dot(self.itemsDict[itemID], self.itemsDict[otherID]) / denominator
            else:
                value = 0.0
            self.similarities[key] = value
        return self.similarities[key]

    def predict(self, userID, itemID):
        """Return the predicted rating of ``itemID`` for ``userID``.

        Falls back to ``DEFAULT_RATING`` when the user has no ratings, the
        target item has no feature vector, or the similarities to every rated
        item sum to zero. Rated items without a feature vector are ignored.
        """
        ratingsByUser = self.userItems if self.userItems is not None else userItems
        if userID not in ratingsByUser or itemID not in self.itemsDict:
            return self.DEFAULT_RATING

        num = 0
        div = 0
        for entry in ratingsByUser[userID]:
            ratedID, rating = entry[0], entry[1]
            if ratedID not in self.itemsDict:
                continue
            similarity = self._similarity(itemID, ratedID)
            num += similarity * rating
            div += similarity

        # `not div > 0` also catches nan, which would otherwise leak into the output.
        if not div > 0:
            return self.DEFAULT_RATING
        return num/div

# Build the predictor; the norm of every item vector is computed once here.
cs = CosineSimilarityPredict(itemsDict)

In [None]:
# Stream the submission to stdout: a header row, then one
# "<user>:<item>,<score>" line for every target pair.
sys.stdout.write('UserId:ItemId,Prediction\n')
for userID, itemID in zip(targets['UserId'], targets['ItemId']):
    score = str(cs.predict(userID, itemID))
    sys.stdout.write(''.join([userID, ':', itemID, ',', score, '\n']))

In [50]:
# Report the wall-clock time since `start` as HH:MM:SS.ss.
end = time.time()
elapsed = end - start
minutes, seconds = divmod(elapsed, 60)
hours, minutes = divmod(minutes, 60)
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

00:01:40.56


In [15]:
# Global mean of the known ratings.
# NOTE(review): presumably the basis for the fallback score of 7 returned for
# users without ratings -- confirm.
np.mean(ratings['Prediction'])

7.296172535880619