In [1]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import pickle
pd.options.mode.chained_assignment = None

# **Tf-Idf**

In [2]:
def computeTfIdf(atlas, n_docs=9125):
    """Turn nested term statistics into a long-format tf-idf table.

    Parameters
    ----------
    atlas : dict
        Mapping ``movieId -> {word -> {"term_freq": ..., "doc_freq": ...}}``.
    n_docs : int, optional
        Number of documents in the corpus, used as the numerator of the
        inverse document frequency. Defaults to 9125, the value that was
        previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per (movie, word) pair, sorted by movieId then word, with
        columns ``movieId`` (int64), ``word`` and ``tf-idf`` (float32), where
        ``tf-idf = term_freq * ln(n_docs / doc_freq)``.
    """
    # Flatten the two-level dict so every (movieId, word) pair becomes one row.
    data = {
        (movie_id, word): stats
        for movie_id, words in atlas.items()
        for word, stats in words.items()
    }

    # With nothing to flatten the frame would have no "term_freq" column and
    # the computation below would raise KeyError; return an empty, typed table.
    if not data:
        return pd.DataFrame({
            "movieId": pd.Series(dtype="int64"),
            "word": pd.Series(dtype="object"),
            "tf-idf": pd.Series(dtype="float32"),
        })

    df = pd.DataFrame.from_dict(data, orient='index').sort_index()
    df.index.names = ["movieId", "word"]

    df["tf-idf"] = np.multiply(df["term_freq"], np.log(n_docs / df["doc_freq"]))

    # Keep only the score; the raw frequencies are no longer needed downstream.
    tfidf = df.reset_index().drop(["term_freq", "doc_freq"], axis=1)
    tfidf["movieId"] = tfidf["movieId"].astype("int64")
    tfidf["tf-idf"] = tfidf["tf-idf"].astype("float32")

    return tfidf



In [3]:
def Tf_Idf():
    """Build the tf-idf table for the words in every movie title.

    Uses the module-level Elasticsearch client ``es`` to read the "movies"
    index: first a match_all search to map each Elasticsearch ``_id`` to its
    ``movieId``, then a multi term vectors request on the ``title`` field to
    collect ``term_freq`` and ``doc_freq`` for each title term.

    Returns
    -------
    pandas.DataFrame
        Whatever ``computeTfIdf`` returns for the collected statistics.
    """
    res = es.search(index="movies", body={"query": {"match_all":{} }}, size = 10000)

    # Map Elasticsearch _id -> movieId. Iterate over the hits that were actually
    # returned: the search asks for at most 10000 documents, while the reported
    # total can be larger, so indexing the hit list by the total count can run
    # past its end.
    ids = {hit['_id']: hit['_source']['movieId'] for hit in res['hits']['hits']}

    # atlas maps movieId -> {word: {'term_freq': ..., 'doc_freq': ...}}
    atlas = dict()

    # Every doc is a distinct movie, so it has its own _id in Elasticsearch.
    # term_statistics=True is what makes doc_freq appear next to term_freq.
    term_vectors = es.mtermvectors(
        index="movies",
        body=dict(
            ids=list(ids.keys()),
            parameters=dict(term_statistics=True, field_statistics=True, fields=["title"]),
        ),
    )
    for movie in term_vectors['docs']:
        # movieId of the movie we are currently at
        movieId = ids.get(movie['_id'], None)
        # A document may come back without term vectors for the title; treat it
        # as having no words instead of failing on a missing key.
        terms = movie.get('term_vectors', {}).get('title', {}).get('terms', {})
        # Fresh dict per movie holding the term and document frequency of each word
        atlas[movieId] = {
            word: {'term_freq': stats['term_freq'], 'doc_freq': stats['doc_freq']}
            for word, stats in terms.items()
        }

    atlas_tfidf = computeTfIdf(atlas)
    return atlas_tfidf

In [4]:
def createDF(movies_dummy_genre,tfidf_df,movie_rating_df_user):
  """Write one training and one prediction feature table per user to disk.

  Parameters
  ----------
  movies_dummy_genre : pandas.DataFrame
      Per-movie features keyed by a ``movieId`` column.
  tfidf_df : pandas.DataFrame
      Long-format table with columns ``movieId``, ``word`` and ``tf-idf``.
  movie_rating_df_user : pandas.DataFrame
      Ratings with columns ``userId``, ``movieId`` and ``rating``.

  For every distinct userId two DataFrames indexed by movieId are pickled:
  the rows the user has rated go to ``./Data/train/user_<id>_train_data.p``
  and the rows without a rating to ``./Data/predi/user_<id>_pred_data.p``.
  The output directories are created if they do not exist. Returns None.
  """
  import os  # local import: only needed to prepare the output directories

  train_path = "./Data/train/user_{}_train_data.p"
  pred_path = "./Data/predi/user_{}_pred_data.p"
  for path in (train_path, pred_path):
    os.makedirs(os.path.dirname(path), exist_ok=True)

  # Wide matrix: one row per movie, one column per word. float16 keeps it small.
  tfidf_df_piv = tfidf_df.pivot(index='movieId', columns='word', values='tf-idf')
  tfidf_df_piv = tfidf_df_piv.astype(np.float16)

  # Words absent from a title are filled with that word's mean tf-idf,
  # then the per-movie features are attached.
  mean_of_wrds = tfidf_df_piv.mean()
  movie_features = tfidf_df_piv.fillna(mean_of_wrds).merge(movies_dummy_genre, on='movieId', how='left')

  for userId in list(movie_rating_df_user['userId'].unique()):
    print("\ncurrently on user : ", userId)
    user_ratings = movie_rating_df_user[movie_rating_df_user['userId'] == userId]
    # Outer join keeps every movie; movies this user did not rate get a NaN rating.
    user_data = movie_features.merge(user_ratings, on='movieId', how='outer')

    unrated = user_data['rating'].isnull()
    # drop/set_index return new frames, so the filtered slices are never mutated.
    user_pred_data = user_data[unrated].drop('userId', axis=1).set_index('movieId')
    user_train_data = user_data[~unrated].drop('userId', axis=1).set_index('movieId')

    # Context managers close each file even if pickling fails.
    with open(train_path.format(userId), "wb") as train_file:
      pickle.dump(user_train_data, train_file)
    with open(pred_path.format(userId), "wb") as pred_file:
      pickle.dump(user_pred_data, pred_file)
  return

In [5]:
# Elasticsearch client built with the library defaults; Tf_Idf() reads this global.
es = Elasticsearch()
print('\nPlease wait .....\n')

# Raw inputs: user ratings and movie metadata.
rating_df = pd.read_csv('./data/ratings.csv')
movies_df = pd.read_csv('./data/movies.csv')

# One-hot encode the genres column, one 'Genre_<name>' indicator column per genre.
temp = movies_df['genres'].str.get_dummies().add_prefix('Genre_')

# movies_dummy_genre keeps movieId plus the genre indicators;
# the free-text title and genres columns are dropped after the join.
movies_dummy_genre = (
    movies_df
    .merge(temp, left_index=True, right_index=True)
    .drop(columns=['title', 'genres'])
)

# Per-movie tf-idf scores of the title words, fetched from Elasticsearch.
tfidf_df = Tf_Idf()

# Only these three rating columns are needed to build the per-user tables.
movie_rating_df_user = rating_df[['userId', 'movieId', 'rating']]

# All per-movie information is available now: write the per-user train/prediction data.
createDF(movies_dummy_genre, tfidf_df, movie_rating_df_user)


Please wait .....


currently on user :  1

currently on user :  2

currently on user :  3

currently on user :  4

currently on user :  5

currently on user :  6

currently on user :  7

currently on user :  8

currently on user :  9

currently on user :  10

currently on user :  11

currently on user :  12

currently on user :  13

currently on user :  14

currently on user :  15

currently on user :  16

currently on user :  17

currently on user :  18

currently on user :  19

currently on user :  20

currently on user :  21

currently on user :  22

currently on user :  23

currently on user :  24

currently on user :  25

currently on user :  26

currently on user :  27

currently on user :  28

currently on user :  29

currently on user :  30

currently on user :  31

currently on user :  32

currently on user :  33

currently on user :  34

currently on user :  35

currently on user :  36

currently on user :  37

currently on user :  38

currently on user :  39

currently on 


currently on user :  320

currently on user :  321

currently on user :  322

currently on user :  323

currently on user :  324

currently on user :  325

currently on user :  326

currently on user :  327

currently on user :  328

currently on user :  329

currently on user :  330

currently on user :  331

currently on user :  332

currently on user :  333

currently on user :  334

currently on user :  335

currently on user :  336

currently on user :  337

currently on user :  338

currently on user :  339

currently on user :  340

currently on user :  341

currently on user :  342

currently on user :  343

currently on user :  344

currently on user :  345

currently on user :  346

currently on user :  347

currently on user :  348

currently on user :  349

currently on user :  350

currently on user :  351

currently on user :  352

currently on user :  353

currently on user :  354

currently on user :  355

currently on user :  356

currently on user :  357

currently o


currently on user :  636

currently on user :  637

currently on user :  638

currently on user :  639

currently on user :  640

currently on user :  641

currently on user :  642

currently on user :  643

currently on user :  644

currently on user :  645

currently on user :  646

currently on user :  647

currently on user :  648

currently on user :  649

currently on user :  650

currently on user :  651

currently on user :  652

currently on user :  653

currently on user :  654

currently on user :  655

currently on user :  656

currently on user :  657

currently on user :  658

currently on user :  659

currently on user :  660

currently on user :  661

currently on user :  662

currently on user :  663

currently on user :  664

currently on user :  665

currently on user :  666

currently on user :  667

currently on user :  668

currently on user :  669

currently on user :  670

currently on user :  671
