In [6]:
def _preprocess_resources():
    ''' Build once, then reuse, the helper objects needed by preprocess_string

    Loading the stop-word corpus and constructing the stemmer on every call
    is wasteful when the function is applied row by row, so the objects are
    created on first use and cached on this function.

    Output
    --------
    (stop_words, stemmer, strip_punctuation) :
        stop_words (frozenset)   : NLTK English stop words (set for O(1) lookup)
        stemmer (SnowballStemmer): English Snowball stemmer
        strip_punctuation (dict) : str.translate table deleting every
                                   character of string.punctuation
    '''
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        import string
        from nltk.corpus import stopwords
        from nltk.stem.snowball import SnowballStemmer # get from VP later

        cached = (
            frozenset(stopwords.words('english')),
            SnowballStemmer(language='english'),
            str.maketrans('', '', string.punctuation),
        )
        _preprocess_resources._cached = cached
    return cached


def preprocess_string(text):
    ''' Preprocess text for tf-idf

    Transforms the text into lowercase and removes ASCII punctuation
    (every character in string.punctuation)
    Removes stopwords using NLTK library and drops tokens of
    two characters or fewer
    Stems words using SnowballStemmer (NLTK Library)

    Input
    --------
    text (string) :  string from the Movielens synopsis dataset
                     None / NaN is treated as an empty synopsis


    Output
    --------
    new_text (string)  : preprocessed text for further tf-idf processing;
                         stemmed tokens joined by single spaces, with no
                         leading or trailing space ("" when nothing is kept)

    '''
    from nltk.tokenize import word_tokenize

    # Missing synopses show up as None or float NaN in pandas columns
    if text is None or (isinstance(text, float) and text != text):
        return ""

    stop_words, stemmer, strip_punctuation = _preprocess_resources()

    # NOTE(review): punctuation is removed before the stop-word test, so a
    # contraction such as "don't" is compared as "dont" and will not match an
    # apostrophe-bearing stop-word entry - confirm whether that matters.
    text = text.lower().translate(strip_punctuation)

    kept = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if len(word) > 2 and word not in stop_words
    ]
    return " ".join(kept)


In [7]:
def create_item_feature(num_features=300,
                        ratings_path='ratings.csv',
                        links_path='links.csv',
                        metadata_path='movies_metadata.csv'):
    '''
    Return item_feature matrix based on TF-IDF of Movie Synopsis

    Takes in the list of movies that has been rated in the MovieLens 100k
    dataset and fetches the respective synopsis for TF-IDF computation


    Input
    ---------
    num_features  : number of features to be used for the TF-IDF extraction
                  : default value 300 (~sqrt[100k rows])
    ratings_path  : CSV with a 'movieId' column (one row per rating)
    links_path    : CSV mapping 'movieId' (MovieLens) to 'tmdbId' (The Movie DB)
    metadata_path : CSV with 'id' (The Movie DB id) and 'overview' (synopsis)


    Output
    ---------
    item_feature (pd.DataFrame): feature_vector from TF-IDF extracted
                            from movie synopses of the TheMovieDB dataset.
                            First column is 'movieId', followed by
                            'i_1' ... 'i_k' (k <= num_features), each
                            min-max scaled to [0, 1] per column.

    '''

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    ratings = pd.read_csv(ratings_path, usecols=['movieId'])

    # filter the unique movie IDs
    # (column labels are given via a dict key: a set is unordered and is
    # rejected by recent pandas versions)
    seen_movies = pd.DataFrame({'movieId': ratings['movieId'].unique()})

    # the synopsis is based on the "The Movie DB" Id system
    # links.csv has a mapping between MovieLens ID and The MovieDB Id
    movie_id_links = pd.read_csv(links_path, usecols=['movieId', 'tmdbId']).dropna()

    # get mapping between MovieLens IDs and TMDB IDs
    seen_movies = seen_movies.merge(movie_id_links, on='movieId', how='inner')

    # Read MetaData CSV file with movie plots/synopsis
    # 'id' is read as text because some rows hold non-numeric junk
    # (e.g., a date string); this avoids mixed-type column inference
    metadata = (
        pd.read_csv(metadata_path, usecols=['id', 'overview'], dtype={'id': str})
        .rename(columns={'id': 'tmdbId'})
    )

    # invalid tmdbIds become NaN, then rows with a NaN id or a NaN synopsis
    # are dropped together
    metadata['tmdbId'] = pd.to_numeric(metadata['tmdbId'], errors='coerce').astype(float)
    metadata = metadata.dropna().drop_duplicates(subset=['tmdbId'])

    # get only synopsis for movies in the transaction list
    synopsis_set = seen_movies.merge(metadata, on='tmdbId', how='inner')

    # preprocess synopsis strings
    synopsis_set['overview'] = synopsis_set['overview'].apply(preprocess_string)

    # TF-IDF processing (raw string: '\w' is a regex class, not a str escape)
    tfidfvectorizer = TfidfVectorizer(analyzer='word',
                                      token_pattern=r'[a-z]+\w*',
                                      stop_words='english',
                                      max_features=num_features)
    tfidf_vector = tfidfvectorizer.fit_transform(synopsis_set['overview'])
    tfidf_df = pd.DataFrame(tfidf_vector.toarray(),
                            index=synopsis_set['movieId'],
                            columns=tfidfvectorizer.get_feature_names_out())

    # normalization per column (word); a constant column has zero range and
    # would yield 0/0 = NaN, so its divisor is replaced by 1 (column -> 0.0)
    col_min = tfidf_df.min()
    col_range = (tfidf_df.max() - col_min).replace(0, 1)
    tfidf_df = (tfidf_df - col_min) / col_range

    # rename feature cols to i_1 ... i_k, then expose movieId as a column
    tfidf_df.columns = [f'i_{idx}' for idx in range(1, tfidf_df.shape[1] + 1)]

    return tfidf_df.reset_index()
    
    

In [8]:
# Build the item-feature table, keeping at most 300 TF-IDF terms per movie
item_feature_table = create_item_feature(num_features = 300)

In [9]:
# Preview the first rows: 'movieId' followed by the scaled i_* feature columns
item_feature_table.head()

Unnamed: 0,movieId,i_1,i_2,i_3,i_4,i_5,i_6,i_7,i_8,i_9,...,i_291,i_292,i_293,i_294,i_295,i_296,i_297,i_298,i_299,i_300
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (rows, columns): one row per movie with a synopsis; 'movieId' plus the i_* features
item_feature_table.shape

(9508, 301)