## Overview of data:
- All 4 datasets from MovieLens are stored in the central movies.db database.
- The 4 datasets are as follows:
    - **links.csv**: links the *movieId* (what we are using as our unique identifier) with the imdbId (which will be helpful when eventually referencing IMDB for plot info and images).
    - **movies.csv**: contains the *title* and *genre* for each *movieId*.
        - *sidenote*: the actual table in the database file is called '*movielens*'.
    - **ratings.csv**: contains the *rating* that each user (denoted by a *userId*) gave a particular movie (linked to its *movieId*) at a particular time (represented by a *timestamp* as a string). Note that each user rated multiple movies.
    - **tags.csv**: contains the *tags* that a particular user (linked to his/her *userId*) gave to a particular movie (linked to its *movieId*), also at a particular *timestamp*. Note that each user tags multiple movies, and in turn may apply multiple tags to each movie.
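- In SQL commands, when joining data, all queries should be joined on the **movieId**, since this is the key shared by all tables in the database. Note that *movieId* is only unique per row in *movies* and *links*; when combining *ratings* with *tags*, add **userId** as a second join condition (as in the query below).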

In [1]:
import pandas as pd
import sqlite3
import numpy as np

In [2]:
# Open a connection to the SQLite database file; the cells and functions
# below read from it through this module-level `db` handle.
db = sqlite3.connect('data/movies.db')

In [3]:
# Combine movie metadata with ratings and (optionally) tags:
#   - `movielens` supplies title and genres,
#   - `ratings` is inner-joined on movieId, so only rated movies appear,
#   - `tags` is LEFT-joined on movieId AND userId, so ratings without a tag
#     are kept (tag/ts are then NULL); a rating is repeated once per
#     matching tag row.
# `tags.timestamp` is aliased to `ts` so it does not clash with the
# `timestamp` column coming from `ratings.*`.
query = '''SELECT title, genres, ratings.*, tags.tag, tags.timestamp AS ts
            FROM movielens
            JOIN ratings ON movielens.movieId = ratings.movieId
            LEFT JOIN tags ON movielens.movieID = tags.movieID AND ratings.userId = tags.userId
         '''
# Run the query against the open connection and load the result into a DataFrame.
df = pd.read_sql(query, db)
df.head()

Unnamed: 0,title,genres,userId,movieId,rating,timestamp,tag,ts
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,4.0,2000-07-30 18:45:03,,
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,1,4.0,1996-11-08 06:36:02,,
2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,1,4.5,2005-01-25 06:52:26,,
3,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,1,2.5,2017-11-13 12:59:30,,
4,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,1,4.5,2011-05-18 05:28:03,,


_______________________

# Functions

_______________________

## Function that extracts the various genres (from the 'genres' column) and creates multiple genre columns out of it.
- ...where each movie/row will contain a *1* if it corresponds to that genre, and a *0* if not.

In [4]:
def make_genre_columns(dataframe):
    """Add one-hot ``Genre_<name>`` columns derived from the 'genres' column.

    Each entry of ``dataframe['genres']`` is expected to be a string of genre
    names separated by ``'|'`` (e.g. ``'Adventure|Comedy'``). For every
    distinct genre name a column ``Genre_<name>`` is added, holding 1 for rows
    whose genre list contains that name and 0 otherwise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'genres' column. It is modified in place; pass a copy
        if the original must stay untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the genre columns appended in
        order of first appearance. The 'genres' column itself is kept.

    Raises
    ------
    KeyError
        If ``dataframe`` has no 'genres' column.
    """
    # Split every row once; the per-row token lists drive both the set of
    # known genres and the membership tests below.
    split_rows = [entry.split('|') for entry in dataframe['genres']]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_genres = list(
        dict.fromkeys(name for row in split_rows for name in row)
    )

    # Compare against whole genre tokens, not substrings of the raw string,
    # so a genre whose name is contained in another genre's name is not
    # flagged by mistake.
    row_sets = [set(row) for row in split_rows]

    for name in unique_genres:
        dataframe['Genre_{}'.format(name)] = [
            int(name in tokens) for tokens in row_sets
        ]

    return dataframe

##### Example:

In [5]:
# make_genre_columns adds its columns to the frame it receives, so hand it a
# copy to keep `df` free of the Genre_* columns.
test = make_genre_columns(df.copy())

test.head()

Unnamed: 0,title,genres,userId,movieId,rating,timestamp,tag,ts,Genre_Adventure,Genre_Animation,...,Genre_Horror,Genre_Mystery,Genre_Sci-Fi,Genre_War,Genre_Musical,Genre_Documentary,Genre_IMAX,Genre_Western,Genre_Film-Noir,Genre_(no genres listed)
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,4.0,2000-07-30 18:45:03,,,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,1,4.0,1996-11-08 06:36:02,,,1,1,...,0,0,0,0,0,0,0,0,0,0
2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,1,4.5,2005-01-25 06:52:26,,,1,1,...,0,0,0,0,0,0,0,0,0,0
3,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,1,2.5,2017-11-13 12:59:30,,,1,1,...,0,0,0,0,0,0,0,0,0,0
4,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,1,4.5,2011-05-18 05:28:03,,,1,1,...,0,0,0,0,0,0,0,0,0,0


## Function that outputs a list of movieIds that satisfy given filter conditions.
- to start, this filter condition will be just the requested genre.
    - in other words, the user provides a preferred genre (via Django interface, eventually) and this function returns a list of movieIds that belong to that genre.

In [6]:
def get_movies_by_genre(dataframe, genre):
    """Return the distinct movieIds whose ``Genre_<genre>`` flag equals 1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'movieId' column and a ``'Genre_<genre>'`` indicator
        column (as produced by the one-hot genre encoding).
    genre : str
        Genre name without the ``'Genre_'`` prefix, e.g. ``'Horror'``.

    Returns
    -------
    list
        movieIds in order of first appearance, each listed once. Empty when
        no row carries the genre.

    Raises
    ------
    KeyError
        If the 'movieId' column or the requested genre column is missing.
    """
    movie_ids = dataframe['movieId']
    in_genre = dataframe['Genre_{}'.format(genre)].values == 1

    # The frame may hold one row per rating, so a movieId can repeat many
    # times; dict.fromkeys drops repeats in linear time and keeps order.
    return list(dict.fromkeys(movie_ids[in_genre].tolist()))

##### Example:

In [7]:
# Look up the movieIds for a handful of genres in the one-hot encoded frame.
adventure_movies = get_movies_by_genre(test, 'Adventure')
fantasy_movies = get_movies_by_genre(test, 'Fantasy')
horror_movies = get_movies_by_genre(test, 'Horror')
IMAX_movies = get_movies_by_genre(test, 'IMAX')

# Show the first ten horror movieIds.
horror_movies[:10]

[12, 22, 70, 92, 93, 152, 177, 183, 188, 196]

### Combined Function: extract the various genres (from the 'genres' column), create multiple genre columns out of it, then output a list of movies that satisfy requested genre.

In [14]:
def movieIds_by_genre(dataframe, desired_genre):
    """One-hot encode the 'genres' column, then list movieIds of one genre.

    Each entry of ``dataframe['genres']`` is expected to be a string of genre
    names separated by ``'|'``. A ``Genre_<name>`` indicator column (1/0) is
    added to ``dataframe`` for every distinct genre, after which the movieIds
    flagged for ``desired_genre`` are collected.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with 'genres' and 'movieId' columns. It is modified in place
        (the ``Genre_*`` columns are added); pass a copy to avoid that.
    desired_genre : str
        Genre name to filter on, e.g. ``'Documentary'``.

    Returns
    -------
    list
        Distinct movieIds belonging to ``desired_genre``, in order of first
        appearance.

    Raises
    ------
    KeyError
        If a required column is missing or ``desired_genre`` does not occur
        in the data.
    """
    split_rows = [entry.split('|') for entry in dataframe['genres']]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_genres = list(
        dict.fromkeys(name for row in split_rows for name in row)
    )

    # Match whole genre tokens rather than substrings of the raw string.
    row_sets = [set(row) for row in split_rows]
    for name in unique_genres:
        dataframe['Genre_{}'.format(name)] = [
            int(name in tokens) for tokens in row_sets
        ]

    wanted = dataframe['Genre_{}'.format(desired_genre)].values == 1

    # A movieId can repeat across rows; keep each one once, in linear time.
    return list(dict.fromkeys(dataframe['movieId'][wanted].tolist()))

##### Example:

In [16]:
# Copy first: movieIds_by_genre adds Genre_* columns to the frame it is given.
x = df.copy()
movieIds_by_genre(x, 'Documentary')

[77,
 99,
 108,
 116,
 128,
 137,
 162,
 206,
 246,
 363,
 556,
 581,
 602,
 722,
 759,
 791,
 1050,
 1111,
 1123,
 1144,
 1147,
 1189,
 1191,
 1192,
 1289,
 1310,
 1361,
 1649,
 1652,
 1797,
 1827,
 1856,
 2064,
 2323,
 2330,
 2494,
 2538,
 2659,
 2677,
 2693,
 2813,
 2824,
 2859,
 2930,
 2984,
 3002,
 3007,
 3077,
 3142,
 3182,
 3281,
 3303,
 3327,
 3539,
 3609,
 3625,
 3653,
 3677,
 3679,
 3680,
 3859,
 3865,
 3989,
 4102,
 4171,
 4217,
 4236,
 4237,
 4278,
 4304,
 4350,
 4445,
 4453,
 4458,
 4459,
 4711,
 4769,
 4783,
 4864,
 4961,
 5059,
 5137,
 5224,
 5239,
 5288,
 5325,
 5385,
 5483,
 5513,
 5619,
 5643,
 5669,
 5670,
 5684,
 5735,
 5736,
 5785,
 5820,
 5932,
 6005,
 6042,
 6062,
 6122,
 6123,
 6125,
 6195,
 6269,
 6289,
 6299,
 6306,
 6327,
 6331,
 6368,
 6375,
 6380,
 6400,
 6408,
 6433,
 6453,
 6598,
 6612,
 6667,
 6679,
 6692,
 6772,
 6775,
 6780,
 6869,
 6935,
 6938,
 6945,
 6948,
 6962,
 6963,
 6965,
 7096,
 7124,
 7141,
 7156,
 7171,
 7225,
 7256,
 7440,
 7443,
 7566,
 77

## Function for adding new user row to the tail of the existing DataFrame.

In [8]:
def add_new_user(dataframe):
    """Return ``dataframe`` with one extra all-zero row appended at the end.

    The extra row is built with the same columns as ``dataframe`` and the
    default index, so it carries the label 0 in the result. ``dataframe``
    itself is not modified.
    """
    column_labels = dataframe.columns
    blank_values = np.zeros(shape=(1, len(column_labels)))
    zero_row = pd.DataFrame(blank_values, columns=column_labels)
    return pd.concat([dataframe, zero_row])

##### Example:

In [9]:
# Load every rating and pivot it into a users x movies matrix.
query2 = "SELECT * FROM ratings"
df_sandbox = pd.read_sql(query2, db)
# The timestamp is not needed for the matrix.
df_sandbox = df_sandbox.drop("timestamp", axis = 1)
# Index by (userId, movieId) so unstack() can move movieId into the columns,
# leaving one row per user and one ('rating', movieId) column per movie.
df_sandbox = df_sandbox.set_index(["userId", "movieId"])
users_vs_movies_matrix = df_sandbox.unstack()
# Movies a user has not rated come out as NaN; store them as 0 instead.
users_vs_movies_matrix = users_vs_movies_matrix.fillna(0)


# add_new_user(users_vs_movies_matrix).tail()
# Append the all-zero row for the new user and inspect the end of the matrix.
test_matrix = add_new_user(users_vs_movies_matrix)
test_matrix.tail()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Function to generate a dictionary for converting an IMDb Id (used in the Django interface) to a MovieLens movieId (needed for the DataFrame). Only the IMDb → MovieLens direction is returned.

In [10]:
def translator_dictionary(connection=None):
    """Build a lookup from IMDb id to MovieLens movieId.

    Reads the ``movieId`` and ``imdbId`` columns of the ``links`` table.

    Parameters
    ----------
    connection : optional
        Database connection passed to ``pandas.read_sql``. When omitted, the
        module-level ``db`` connection is used.

    Returns
    -------
    dict
        Maps each ``imdbId`` to its ``movieId``. If an ``imdbId`` occurs more
        than once, the last row read wins.
    """
    if connection is None:
        connection = db  # fall back to the notebook-wide connection

    query3 = "SELECT movieId, imdbId FROM links"
    df_translator = pd.read_sql(query3, connection)

    return dict(
        zip(df_translator['imdbId'].tolist(), df_translator['movieId'].tolist())
    )

Format of input data (to be taken from Django interface):
- list of tuples, where each tuple represents (imdbId, Rating)

## Function for filling new user row with ratings inputted into Django interface

In [20]:
def convert_input(dataframe, django_data, translator=None):
    """Write a new user's ratings into row 0 of the matrix and return that row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Users x movies matrix that already contains a row labelled 0 for the
        new user. The last level of its columns must hold MovieLens movieIds
        (e.g. ``('rating', movieId)``). It is modified in place.
    django_data : iterable of (imdbId, rating)
        Ratings entered by the new user, keyed by IMDb id.
    translator : dict, optional
        Mapping from IMDb id to movieId. When omitted it is obtained from
        ``translator_dictionary()``.

    Returns
    -------
    list
        The values of row 0 after the ratings have been filled in, in column
        order.

    Raises
    ------
    KeyError
        If ``dataframe`` has no row labelled 0, an IMDb id is not in
        ``translator``, or the translated movieId has no column in
        ``dataframe``.
    """
    if translator is None:
        translator = translator_dictionary()

    # Without this guard the assignment below would silently create row 0.
    if 0 not in dataframe.index:
        raise KeyError('dataframe has no row labelled 0 for the new user')

    movie_level = dataframe.columns.get_level_values(-1)

    # Assuming that data coming from Django interface is a list of tuples
    for pair in django_data:
        imdb_id, rating = pair[0], pair[1]
        movie_id = translator[imdb_id]

        target = movie_level == movie_id
        if not target.any():
            raise KeyError(
                'movieId {} has no column in the ratings matrix'.format(movie_id)
            )

        # .loc takes the row label first and the column selector second:
        # row 0 is the new user, `target` picks that movie's column.
        dataframe.loc[0, target] = rating

    return list(dataframe.loc[0])

##### Example:

In [21]:
# Sample input in the expected format: a list of (imdbId, rating) tuples.
dummy_input = [(112572, 4.0), (113690, 5.0), (111003, 1.0), (33467, 3.0), (88763, 3.0)]
output_list = convert_input(test_matrix, dummy_input)

# [i for i in output_list if i > 0]
output_list

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

### Combined Convert Input Function

In [26]:
def translator_dictionary(connection=None):
    """Build a lookup from IMDb id to MovieLens movieId.

    Reads the ``movieId`` and ``imdbId`` columns of the ``links`` table.

    Parameters
    ----------
    connection : optional
        Database connection passed to ``pandas.read_sql``. When omitted, the
        module-level ``db`` connection is used.

    Returns
    -------
    dict
        Maps each ``imdbId`` to its ``movieId``. If an ``imdbId`` occurs more
        than once, the last row read wins.
    """
    if connection is None:
        connection = db  # fall back to the notebook-wide connection

    query3 = "SELECT movieId, imdbId FROM links"
    df_translator = pd.read_sql(query3, connection)

    return dict(
        zip(df_translator['imdbId'].tolist(), df_translator['movieId'].tolist())
    )

def convert_django(dataframe, django_data, translator=None):
    """Build the rating row of a new user from data entered in Django.

    The row has one entry per column of ``dataframe``: the submitted rating
    for the movies in ``django_data`` and 0.0 everywhere else.
    ``dataframe`` itself is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Users x movies matrix. The last level of its columns must hold
        MovieLens movieIds (e.g. ``('rating', movieId)``).
    django_data : iterable of (imdbId, rating)
        Ratings entered by the new user, keyed by IMDb id.
    translator : dict, optional
        Mapping from IMDb id to movieId. When omitted it is obtained from
        ``translator_dictionary()``.

    Returns
    -------
    list of float
        The new user's ratings in the column order of ``dataframe``.

    Raises
    ------
    KeyError
        If an IMDb id is not in ``translator`` or the translated movieId has
        no column in ``dataframe``.
    """
    if translator is None:
        translator = translator_dictionary()

    movie_level = dataframe.columns.get_level_values(-1)

    # Only the new user's row is returned, so build just that row instead of
    # copying the whole matrix to append it.
    new_user_row = np.zeros(len(dataframe.columns))

    # Assuming that data coming from Django interface is a list of tuples
    for pair in django_data:
        imdb_id, rating = pair[0], pair[1]
        movie_id = translator[imdb_id]

        target = movie_level == movie_id
        if not target.any():
            raise KeyError(
                'movieId {} has no column in the ratings matrix'.format(movie_id)
            )

        new_user_row[target] = rating

    return new_user_row.tolist()

##### Example:

In [33]:
# Sample input in the expected format: a list of (imdbId, rating) tuples.
django_test = [(112572, 4.0), (113690, 5.0), (111003, 1.0), (33467, 3.0), (88763, 3.0)]
convert_django(users_vs_movies_matrix, django_test)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [13]:
# db.close()