This file contains the finished functions related to Project_607

In [4]:
def setup(dataset, tfidf=False, vec_size=5):
    '''
    setup prepares the NLP for the movie dataset by creating a cosine similarity matrix
    and a movie index for reference
    *****
    Parameters
    ----------
    dataset : dataframe,
        Movie data in a pandas dataframe. The "descriptions" and "titles" columns are read.
        The dataframe is modified in place: a "vector" column is added.
    tfidf : boolean,
        Switch that enables the use of TFIDF vectorization. Default to False
    vec_size : int,
        Size of the output Doc2Vec vectors (only used when tfidf is False)

    Returns
    -------
    dataset : dataframe,
        modified dataframe, including vector column
    cossim : np.array (tfidf) or gensim MatrixSimilarity (Doc2Vec),
        cosine similarity matrix / similarity index for the entire dataset
    movie_index : pandas Series,
        lower-cased titles mapped to the dataset index; a title that occurs several
        times keeps one entry per occurrence
    duplicates : list,
        lower-cased titles that occur more than once

    Notes
    -----
    The Doc2Vec branch also saves the trained model in the current working directory
    as "Greene_project607_d2v_model_v<N>", using the first N whose name is not taken.

    '''

    if tfidf:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import linear_kernel

        # vectorize/embed movie descriptions
        tfidf_vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = tfidf_vectorizer.fit_transform(dataset.descriptions)

        # cosine similarity matrix
        # (TfidfVectorizer L2-normalises rows by default, so the dot product is the cosine)
        cossim = linear_kernel(tfidf_matrix, tfidf_matrix)

        # one dense row per movie; a plain list is assigned to the dataframe by position
        vectors = list(tfidf_matrix.toarray())

    else:
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument
        from gensim.similarities import MatrixSimilarity
        from gensim.parsing.preprocessing import preprocess_documents

        # extract descriptions from dataset
        documents = preprocess_documents(dataset.descriptions.values)
        tagged_docs = [TaggedDocument(d, [i]) for i, d in enumerate(documents)]

        # generate model, retrying with a lighter configuration if construction fails
        try:
            model = Doc2Vec(vector_size=vec_size, window=3, min_count=3, workers=8, max_vocab_size=10_000_000)
        except Exception:
            model = Doc2Vec(vector_size=vec_size, window=2, min_count=3, workers=4, max_vocab_size=10_000_000)

        # train the model
        model.build_vocab(tagged_docs)
        model.train(tagged_docs, total_examples=model.corpus_count, epochs=50)

        # release training-only state where the installed gensim still offers this call
        # NOTE(review): believed to be absent from newer gensim releases - confirm
        if hasattr(model, "delete_temporary_training_data"):
            model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

        # save model under the first free versioned name
        prefix = "Greene_project607_d2v_model_v"
        existing = set(os.listdir())
        counter = 1
        while prefix + str(counter) in existing:
            counter += 1
        model.save(prefix + str(counter))

        # vectorize/embed movie descriptions
        # the Series carries the dataset's own index so the column assignment below
        # cannot be misaligned when the dataset index is not 0..n-1
        vectors = pd.Series([model.infer_vector(doc) for doc in documents], index=dataset.index)

        # cosine similarity matrix
        cossim = MatrixSimilarity(vectors, num_features=vec_size)

    # save vector onto dataframe
    dataset["vector"] = vectors

    # create movie index (lower-cased title -> dataset index)
    titles = dataset.titles.str.lower()
    movie_index = pd.Series(dataset.index, index=titles).drop_duplicates()

    # generate list of duplicate movies, compared on the same lower-cased titles
    # that movie_index uses as keys
    duplicates = sorted(titles[titles.duplicated(keep=False)].dropna().unique())

    return dataset, cossim, movie_index, duplicates

In [2]:
def weighted_scoring(df, min_voters=1000):
    '''
    weighted scoring receives a DataFrame and calculates the relative weighted score of each movie

    Each score is pulled towards the mean score of the well-voted movies:
        (votes * score + min_voters * cluster_score) / (votes + min_voters)

    Parameters
    ----------
    df : DataFrame,
        dataframe containing the movie data to score; the "scores" and "votes" columns are read.
        A "Weighted_score" column is added to this dataframe in place.
    min_voters : int,
        vote threshold. Movies with more votes than this define the reference (cluster) score,
        and it is also the weight given to that reference score. Default 1000

    Returns
    -------
    df : DataFrame,
        modified dataframe including the weighted scores in a new column "Weighted_score",
        sorted by that column in descending order

    '''

    scores = df["scores"].to_numpy(dtype=float)
    votes = df["votes"].to_numpy(dtype=float)

    # reference score: mean score of the movies above the vote threshold
    cluster_score = df.loc[df.votes > min_voters, "scores"].mean()
    if np.isnan(cluster_score):
        # no movie clears the threshold: use the mean of all scores instead of
        # letting NaN propagate into every weighted score
        cluster_score = df["scores"].mean()

    common_denominator = votes + min_voters
    df["Weighted_score"] = (votes * scores) / common_denominator + (cluster_score * min_voters) / common_denominator

    return df.sort_values(by="Weighted_score", ascending=False)

In [3]:
def recommend_byActor(name, n_movies=5, short=False):
    '''
    recommend_byActor takes the name of a given actor and returns movies with the actor, ranked amongst
    themselves
    ****
    Relies on the module-level dataframe "data" and on weighted_scoring.

    Parameters
    ----------
    name : str,
        full name of the actor, matched case-insensitively against the comma-separated
        entries of the "stars" column
    n_movies : int,
        maximum number of movies to return. Default 5
    short : bool,
        if True only Rank, Film, Year and Description are returned. Default False

    Returns
    -------
    df : DataFrame,
        movie title and other information, best weighted score first; empty when the
        actor is not found

    '''
    # compare case-insensitively: str.title() would turn e.g. "DiCaprio" into "Dicaprio"
    wanted = name.strip().casefold()

    def split_cast(stars):
        # a missing "stars" entry (e.g. NaN) is treated as an empty cast list
        if not isinstance(stars, str):
            return []
        return [member.strip() for member in stars.split(",")]

    def features_actor(stars):
        return any(member.casefold() == wanted for member in split_cast(stars))

    # generate dataframe of movies with the actor
    in_cast = [features_actor(stars) for stars in data.stars]
    df = data.loc[in_cast].reset_index(drop=True)

    # obtain sorted, weighted scores
    df = weighted_scoring(df)

    # drop extraneous columns
    df = df.drop(["uids", "scores", "votes"], axis=1)

    # remove the searched name from the stars list
    # the actor is known to be in the returned movies, only return the names of other starring actors
    df["stars"] = [
        ", ".join(member for member in split_cast(stars) if member.casefold() != wanted)
        for stars in df["stars"]
    ]

    # rank follows the weighted-score order produced above
    df = df.reset_index(drop=True)
    df["Rank"] = range(1, len(df) + 1)

    mapper = {"titles": "Film", "genres": "Genre", "ratings": "Rating",
              "lengths": "Runtime", "directors": "Director", "stars": "Also starring",
              "descriptions": "Description", "year": "Year", "Weighted_score": "Score"}

    # relabel dataframe for output
    df = df.rename(columns=mapper)

    if short:
        df = df[["Rank", "Film", "Year", "Description"]]
    else:
        df = df[["Rank", "Film", "Year", "Genre", "Rating", "Runtime", "Director", "Also starring", "Description", "Score"]]

    return df.head(max(n_movies, 0))

In [4]:
def recommend_byTitle(name, cossim, data, n_movies=5, tfidf=False, short=False):
    '''
    recommend_byTitle takes the name of a movie and returns the top n_movies
    with the most similar descriptions from the IMDb database

    Relies on the module-level "movie_index" (lower-cased title -> dataset index)
    and on weighted_scoring.

    Parameters
    ----------
    name : string
        title of the movie (case-insensitive).
    cossim : array-like or similarity index
        similarity structure for the dataset. With tfidf=True it is indexed by the
        movie's position; otherwise it is queried with the movie's "vector" entry.
    data : DataFrame
        movie dataframe, including the "vector" column when tfidf is False.
        NOTE(review): positions and index labels are used interchangeably below, so
        this presumably expects a default 0..n-1 index - confirm.
    n_movies : int, optional
        Number of films to return. The default is 5.
    tfidf : bool, optional
        must match the vectorization that produced cossim. The default is False.
    short : bool, optional
        switch to return quantity of information. The default is False, returning a dataframe
        containing expanded information

    Returns
    -------
    df : DataFrame
        dataframe containing movie information, ranked by weighted score. By default
        it has 10 columns of data; 4 when short is True. When the title is unknown a
        message string is returned instead.

    '''

    name = name.lower()

    # if the title is unavailable
    if name not in movie_index.keys():
        return f"{name.title()} is not in the database."

    # a title that occurs more than once yields a Series rather than a scalar;
    # in that case use the last entry listed
    tmp = movie_index[name]
    if hasattr(tmp, "iloc"):
        print(f"{name.title()} is a duplicate.")
        tmp = tmp.iloc[-1]

    if tfidf:
        idx = tmp
    else:
        idx = data.iloc[tmp].vector

    # cosine similarity, most similar first
    similarities = sorted(enumerate(cossim[idx]), key=lambda pair: pair[1], reverse=True)

    # similar movies by description, excluding the queried movie itself
    simlist = [position for position, _ in similarities if position != tmp]

    # retrieve indices to extract relevant data from primary dataset
    indices = simlist[0:n_movies]

    # work on a copy so the scoring column is not written into a slice of data
    df = data.loc[indices].copy()

    # weighted scoring for internal rankings
    df = weighted_scoring(df)

    df = df.drop(["uids", "scores", "votes"], axis=1).reset_index(drop=True)
    df["Rank"] = range(1, len(df) + 1)

    # relabel dataframe for output
    mapper = {"titles": "Film", "genres": "Genre", "ratings": "Rating",
              "lengths": "Runtime", "directors": "Director", "stars": "Also starring",
              "descriptions": "Description", "year": "Year", "Weighted_score": "Score"}

    df = df.rename(columns=mapper)

    # truncated dataframe
    if short:
        df = df[["Rank", "Film", "Year", "Description"]]
    else:
        df = df[["Rank", "Film", "Year", "Genre", "Rating", "Runtime", "Director", "Also starring", "Description", "Score"]]

    return df