In [41]:
def create_user_feature(num_transactions = 100_836):
    '''
    Return a user_feature matrix
    
    Takes in the transaction list from the Movielens 100k dataset
    and replaces the userId with a feature vector representing
    the number of movies seen by the user per genre
    
    possible genres include the following:
    'IMAX', 'Adventure', 'Mystery', 'Animation', 'Documentary', 'Comedy',
       'Western', 'War', 'Film-Noir', 'Crime', 'Drama', 'Thriller', 'Fantasy',
       'Action', 'Sci-Fi', 'Children', 'Romance', 'Horror', 'Musical',
       '(no genres listed)'
       
    Input
    ---------
    none
    
    
    Output
    ---------
    user_feature (pd.DataFrame): feature_vector containing number of count of 
                                 genres seen based on ratings given by a user
                                 - each movie can have several genres
                                 - each row correspond to a transaction (user rating)
    
    
    
    '''
    import numpy as np
    import pandas as pd
    from collections import Counter
    
    raw_transaction_list = pd.read_csv('ratings.csv', nrows = num_transactions)
    transaction_list =  raw_transaction_list[['userId','movieId', 'rating']].copy()
    
    # reduce size of DataFrame for transaction_list by downcasting
    for col in transaction_list:
        if transaction_list[col].dtype == 'int64':
            transaction_list[col] = pd.to_numeric(transaction_list[col], downcast='integer')
        if transaction_list[col].dtype == 'float64':
            transaction_list[col] = pd.to_numeric(transaction_list[col], downcast='float')

    
    # preprocess movie list and genres
    movie_description = pd.read_csv('movies.csv')    
    movie_description = movie_description.set_index('movieId')
    movie_description['genre'] = movie_description['genres'].str.split('|')
    
    # extract the genres for the movie in each transaction/rating
    movie_IDs_list = transaction_list['movieId']
    transaction_list['genre'] = list(movie_description.loc[movie_IDs_list[:len(movie_IDs_list)]]['genre'])

    # count the number of genres seen by each userId
    genre_count = (transaction_list.groupby('userId')['genre']
                     .apply(list)
                     .apply(lambda x: [item for sublist in x for item in sublist])
                     .apply(Counter))
    
    # remove genre column in transaction list (just to conserve memspace)
    del transaction_list['genre']
        
    # create user_feature with count of genres per user
    user_feature = pd.DataFrame(list(genre_count)).fillna(0)
    for col in user_feature:
        user_feature[col] = pd.to_numeric(user_feature[col], downcast='integer')
        
    
    user_feature['userId'] = genre_count.index
    
    
    # re-arrange columns
    cols = user_feature.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    user_feature = user_feature[cols]
    
    # rename cols
    old_cols = user_feature.columns[1:]
    new_cols = []
    for idx, col in enumerate(cols[1:], 1):
        new_cols.append(f'u_{idx}')
    user_feature.rename(columns=dict(zip(old_cols, new_cols)), inplace=True)
    
    # replace NaNs in user_feature with 0
    user_feature = user_feature.fillna(0)
    
    return raw_transaction_list, user_feature
    
    

In [42]:
vanilla_transaction_list, user_feature_table = create_user_feature(num_transactions = 5000)

In [43]:
vanilla_transaction_list.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [44]:
user_feature_table.head()

Unnamed: 0,userId,u_1,u_2,u_3,u_4,u_5,u_6,u_7,u_8,u_9,...,u_11,u_12,u_13,u_14,u_15,u_16,u_17,u_18,u_19,u_20
0,1,85,29,42,83,47,26,90,45,55,...,17,68,22,7,40,22,1,0,0,0
1,2,3,0,0,7,0,1,11,10,10,...,1,17,1,1,4,0,0,4,3,0
2,3,11,4,5,9,4,5,14,2,7,...,8,16,5,0,15,1,0,0,0,0
3,4,29,6,10,104,19,58,25,27,38,...,4,120,7,10,12,16,4,1,2,0
4,5,8,6,9,15,7,11,9,12,9,...,1,25,3,2,2,5,0,3,0,0


In [45]:
user_feature_table.shape

(32, 21)

In [46]:
def preprocess_string(text):
    ''' Preprocess text for tf-idf
    
    Transforms the text into lowercase and removes symbols
    and punctuations
    Removes stopwords using NLTK library
    Lemmatizes words using SnowballStemmer (NLTK Library)
    
    Input
    --------
    text (string) :  string from the Movielens synopsis dataset 
    
    
    Output
    --------
    new_text (string)  : preprocessed text for further tf-idf processing
    
    '''
    import string
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer # get from VP later
    from nltk.tokenize import word_tokenize
    
    
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer(language='english')
    
    text = text.lower()
    
    text = ''.join([char for char in text if char not in string.punctuation])
    
    new_text = ""
    words = word_tokenize(text)
    for word in words:
        if word not in stop_words and len(word) > 2:
            new_text = new_text + " " + stemmer.stem(word)
    
    return new_text


In [47]:
def create_item_feature(num_features = 300):
    '''
    Return item_feature matrix based on TF-IDF of Movie Synopsis
    
    Takes in the list of movies that has been rated in the MovieLens 100k
    dataset and fetches the respective synopsis for TF-IDF computation
    
       
    Input
    ---------
    num_features : number of features to be used for the TF-IDF extraction
                 : default value 300 (~sqrt[100k rows])
    
    
    Output
    ---------
    item_feature (pd.DataFrame): feature_vector from TF-IDF extracted
                            from movie synopses the TheMovieDB dataset
    
    
    
    '''
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    transaction_list = pd.read_csv('ratings.csv', usecols=['movieId'])
    
    # filter the unique movie IDs
    seen_movies = pd.DataFrame(transaction_list['movieId'].unique(), columns={'movieId'})
    
    # the synopsis is based on the "The Movie DB" Id system
    # links.csv has a mapping between MovieLens ID and The MovieDB Id
    movie_id_links = pd.read_csv('links.csv', usecols =['movieId','tmdbId'])
    movie_id_links = movie_id_links.dropna()
    movie_id_links.head()
    
    # get mapping between MovieLens IDs and TMDB IDs
    seen_movies = seen_movies.merge(movie_id_links, on='movieId', how='inner')
    
    # Read MetaData CSV file with movie plots/synopsis
    metadata = pd.read_csv('movies_metadata.csv', usecols=['id','overview'])
    metadata = metadata.rename(columns={'id':'tmdbId'})

    # drop movies with invalid tmbdId (e.g., date string instead of integer)
    ids1 = pd.to_numeric(metadata['tmdbId'], errors='coerce').isna()
    metadata = metadata.drop(metadata[ids1].index)

    # drop movies with NaN synopsis
    metadata = metadata.dropna()
    metadata['tmdbId'] = metadata['tmdbId'].astype(float)
    metadata = metadata.drop_duplicates(subset=['tmdbId'])

        
    # get only synopsis for movies in the transaction list
    synopsis_set = seen_movies.merge(metadata, on='tmdbId', how='inner')
    
    # preprocess synopsis strings
    synopsis_set['overview'] = synopsis_set['overview'].apply(preprocess_string)
    
    # TF-IDF processing
    tfidfvectorizer = TfidfVectorizer(analyzer='word', token_pattern = '[a-z]+\w*', stop_words='english', max_features=num_features)
    tfidf_vector = tfidfvectorizer.fit_transform(synopsis_set['overview'])
    tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=synopsis_set['movieId'], columns=tfidfvectorizer.get_feature_names_out())
    
    # normalization per column (word)
    tfidf_df = tfidf_df.apply(lambda x: (x - x.min())/(x.max() - x.min()))
    tfidf_df = tfidf_df.reset_index()
    
    # rename cols
    old_cols = tfidf_df.columns
    new_cols = []
    new_cols.append(old_cols[0])
    for idx, col in enumerate(old_cols[1:], 1):
        new_cols.append(f'i_{idx}')
    tfidf_df.rename(columns=dict(zip(old_cols, new_cols)), inplace=True)
    
    return tfidf_df
    
    

In [48]:
item_feature_table = create_item_feature(num_features = 300)

In [49]:
item_feature_table.head()

Unnamed: 0,movieId,i_1,i_2,i_3,i_4,i_5,i_6,i_7,i_8,i_9,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
item_feature_table.shape

(9508, 301)

In [51]:
def get_augmented_table():
    '''
    Replace integrate user_features and item_features
    to the transaction_list
    
    Input
    ------
    none
    
    
    Output
    -------
    augmented_transaction_table  : transaction_list concatenated with user_features
                                   from genres and item_features from movie synopsis
    
    
    '''
    import pandas as pd
    transaction_list, user_feature = create_user_feature(num_transactions = 5000)
    item_feature = create_item_feature()
    augmented_tt = transaction_list.merge(user_feature, on='userId', how='left')
    augmented_tt_2 = augmented_tt.merge(item_feature, on='movieId', how='left')
    
    return augmented_tt_2
    

In [52]:
augmented_transaction_table = get_augmented_table()

In [53]:
augmented_transaction_table.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_1,u_2,u_3,u_4,u_5,u_6,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
0,1,1,4.0,964982703,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,964981247,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,964982224,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,964983815,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,964982931,85,29,42,83,47,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
augmented_transaction_table.shape

(5000, 324)

In [55]:
augmented_transaction_table['userId'].nunique()

32