# Let's Create the Data

In [4]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import pickle

def computeTfIdf(atlas):
    ''' This function computes the tf-idf vectors for every word 
    and returns atlas_tfidf using the dictionary 
    atlas we created in  getTfIdf()''' 
    atlas_df = pd.DataFrame.from_dict({(i,j): atlas[i][j] 
                           for i in atlas.keys() 
                           for j in atlas[i].keys()},
                       orient='index').sort_index()
    
    #doc_count = 9125 movies
    # tf-idf = term_freq * idf
    # idf = log(doc_count / doc_freq)
    atlas_df["idf"] = np.log( 9125 / atlas_df["doc_freq"])
    atlas_df["tf-idf"] = np.multiply(atlas_df["term_freq"], atlas_df["idf"])

    atlas_df.index.names = ["movieId", "word"]
    
    atlas_tfidf = atlas_df.reset_index()
    atlas_tfidf.drop(["term_freq","doc_freq","idf"],axis=1, inplace=True)
    atlas_tfidf.movieId = atlas_tfidf.movieId.astype("int64")
    #Bu I have a memory problem when trying to One-Hot-Encode all words in above dataframe
    #So let's try. Creating a different dataframe per user
    #That only has the movie-word-tfidf info of the movies each user has seen

    return atlas_tfidf


def getTfIdf():
    ''' This function returns atlas_tfidf which has all movieId 
    and corresponding tfidf with their words ''' 
    # Dictionary with (es_id : movieId) pairs, where es_id is the _id from Elasticsearch
    ids = dict()
    #atlas is a dict with keys = movieId and values = { words : { 'doc_freq' : doc_freq , 'term_freq' : term_freq} }
    atlas = dict()
    # and word_params_dict = { words : { 'doc_freq' : doc_freq , 'term_freq' : term_freq} }
    word_params_dict = dict()

    res = es.search(index="movies", body={"query": {"match_all":{} }}, size = 10000)
    total_hits = res['hits']['total']['value']
    #here I create a dictionary with key-value pairs = (_id, movieId), where _id is from Elasticsearch
    for hit in range(total_hits):
        movieId = res['hits']['hits'][hit]['_source']['movieId']
        es_id = res['hits']['hits'][hit]['_id']
        ids[es_id] = movieId

    # every doc is a distinct movie. Thus, it has it's own _id from Elasticsearch
    # Out of every doc I need to get term_freq and doc_freq
    for movie in es.mtermvectors(index="movies",body=dict(ids=list(ids.keys()),parameters=dict(term_statistics=True,field_statistics=True,fields=["title"])))['docs']:                                                                                                                                                            
        title = movie['term_vectors'] 
        #get the movieId of the movie we are currently at
        movieId = ids.get( movie['_id'], None ) 
        #get the words in the title of the movie we are curently at
        words = title['title']['terms'].keys()
        # for each word get the parameters and save them on a dictionary
        for word in words:
            #for each word get the term and document frequency
            term_freq = title['title']['terms'][word]['term_freq']
            doc_freq = title['title']['terms'][word]['doc_freq']
            word_params_dict[word] = {'term_freq': term_freq, 'doc_freq' : doc_freq}
        # save said dictionary on the correct key of the atlas dictionary
        atlas[movieId] = word_params_dict
        #reset the dictionary
        word_params_dict = dict()
    atlas_tfidf = computeTfIdf(atlas)
    return atlas_tfidf


def createDataFrames(movies_dummy_genre, tfidf_df, movie_rating_df_pu):
    '''This function creates the training and prediction data for each user and saves the to their 
    corresponding pickle files'''
    print("\nCreating the data...")
    training_data = dict()
    prediction_data = dict()
    #create it
    tfidf_df_pivoted = tfidf_df.pivot(index='movieId', columns='word', values='tf-idf')
    tfidf_df_pivoted = tfidf_df_pivoted.astype(np.float16)
    #compute row avg
    mean_of_words = tfidf_df_pivoted.mean()
    tfidf_df_pivoted_noNaN = tfidf_df_pivoted.fillna(mean_of_words)
    #add genre data
    tfidf_df_pivoted_noNaN = tfidf_df_pivoted_noNaN.merge(movies_dummy_genre, on='movieId', how='left')

    for userId in list(movie_rating_df_pu['userId'].unique()):
        print("\ncurrently on user : ", userId)
        user_data = tfidf_df_pivoted_noNaN.merge(movie_rating_df_pu[movie_rating_df_pu['userId'] == userId],on='movieId', how = 'outer')

        # I do not drop any words for now because it would mess up the model structure and I have to make sure that
        # training and testing and prediction data all have the same columns
        #create prediction data
        user_prediction_data = user_data[user_data['rating'].isnull()]
        user_prediction_data.drop('userId',axis=1,inplace=True)
        #create training data
        user_training_data = user_data[~user_data['rating'].isnull()]
        user_training_data.drop('userId',axis=1,inplace=True)
        #set the index
        user_training_data.set_index('movieId', inplace=True)
        user_prediction_data.set_index('movieId', inplace=True)
        #dump the data on the correct folder
        
        pickle.dump(user_training_data, open("./data/training/user_{}_training_data.p".format(userId), "wb"))
        pickle.dump(user_prediction_data, open("./data/prediction/user_{}_prediction_data.p".format(userId), "wb"))
    return 



###################################################################################################################################
# start Elasticsearch server
es = Elasticsearch()
print('\nPlease wait while we do some pre-processing.....\n')
rating_df = pd.read_csv('./data/ratings.csv')  
movies_df = pd.read_csv('./data/movies.csv')
#one hot encoding movie genres
temp = movies_df['genres'].str.get_dummies()
temp.columns = ['Genre_' + str(col) for col in temp.columns ]
# movies_dummy_genre has dummy variables for all genres and also has movieId, title
movies_dummy_genre = movies_df.merge(temp, left_index=True, right_index=True)
movies_dummy_genre.drop(['title','genres'],axis=1,inplace=True)
tfidf_df = getTfIdf()

#pu = per user
movie_rating_df_pu = rating_df[['userId','movieId','rating']]
# At this point I have all the info per movie I need so let's create the training dataframe per user
createDataFrames(movies_dummy_genre, tfidf_df, movie_rating_df_pu)



Please wait while we do some pre-processing.....


Creating the data...

currently on user :  1

currently on user :  2

currently on user :  3

currently on user :  4

currently on user :  5

currently on user :  6

currently on user :  7

currently on user :  8

currently on user :  9

currently on user :  10

currently on user :  11

currently on user :  12

currently on user :  13

currently on user :  14

currently on user :  15

currently on user :  16

currently on user :  17

currently on user :  18

currently on user :  19

currently on user :  20

currently on user :  21

currently on user :  22

currently on user :  23

currently on user :  24

currently on user :  25

currently on user :  26

currently on user :  27

currently on user :  28

currently on user :  29

currently on user :  30

currently on user :  31

currently on user :  32

currently on user :  33

currently on user :  34

currently on user :  35

currently on user :  36

currently on user :  37

currently 


currently on user :  318

currently on user :  319

currently on user :  320

currently on user :  321

currently on user :  322

currently on user :  323

currently on user :  324

currently on user :  325

currently on user :  326

currently on user :  327

currently on user :  328

currently on user :  329

currently on user :  330

currently on user :  331

currently on user :  332

currently on user :  333

currently on user :  334

currently on user :  335

currently on user :  336

currently on user :  337

currently on user :  338

currently on user :  339

currently on user :  340

currently on user :  341

currently on user :  342

currently on user :  343

currently on user :  344

currently on user :  345

currently on user :  346

currently on user :  347

currently on user :  348

currently on user :  349

currently on user :  350

currently on user :  351

currently on user :  352

currently on user :  353

currently on user :  354

currently on user :  355

currently o


currently on user :  634

currently on user :  635

currently on user :  636

currently on user :  637

currently on user :  638

currently on user :  639

currently on user :  640

currently on user :  641

currently on user :  642

currently on user :  643

currently on user :  644

currently on user :  645

currently on user :  646

currently on user :  647

currently on user :  648

currently on user :  649

currently on user :  650

currently on user :  651

currently on user :  652

currently on user :  653

currently on user :  654

currently on user :  655

currently on user :  656

currently on user :  657

currently on user :  658

currently on user :  659

currently on user :  660

currently on user :  661

currently on user :  662

currently on user :  663

currently on user :  664

currently on user :  665

currently on user :  666

currently on user :  667

currently on user :  668

currently on user :  669

currently on user :  670

currently on user :  671


# Now let's Train the Models

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder

In [6]:
user_training_data = pickle.load( open("./data/training/user_1_training_data.p", "rb")) 
user_prediction_data = pickle.load( open("./data/prediction/user_1_prediction_data.p", "rb")) 

In [7]:
for userId in list(movie_rating_df_pu['userId'].unique()):
    print("\ncurrently on user : ", userId)
    user_training_data = pickle.load( open("./data/training/user_{}_training_data.p".format(userId), "rb")) 
    user_prediction_data = pickle.load( open("./data/prediction/user_{}_prediction_data.p".format(userId), "rb")) 
    X = user_training_data.loc[:, user_training_data.columns != 'rating']
    y = user_training_data['rating']
    #encoding y_train ( because I had some errors : ValueError: Unknown label type: 'continuous' )
    lab_enc = LabelEncoder()
    X_train = X
    y_train = y.ravel()
    y_train = lab_enc.fit_transform(y_train)

    #Create a RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100, random_state = 42)
    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(user_prediction_data.loc[:, user_prediction_data.columns != 'rating'])
    user_prediction_data['rating'] = y_pred
    user_predicted_data = user_prediction_data.append(user_training_data).reset_index()
    user_predicted_data = user_predicted_data[['movieId', 'rating']]
    pickle.dump(user_predicted_data, open("./data/predicted_result/user_{}_predicted_data.p".format(userId), "wb"))


currently on user :  1

currently on user :  2

currently on user :  3

currently on user :  4

currently on user :  5

currently on user :  6

currently on user :  7

currently on user :  8

currently on user :  9

currently on user :  10

currently on user :  11

currently on user :  12

currently on user :  13

currently on user :  14

currently on user :  15

currently on user :  16

currently on user :  17

currently on user :  18

currently on user :  19

currently on user :  20

currently on user :  21

currently on user :  22

currently on user :  23

currently on user :  24

currently on user :  25

currently on user :  26

currently on user :  27

currently on user :  28

currently on user :  29

currently on user :  30

currently on user :  31

currently on user :  32

currently on user :  33

currently on user :  34

currently on user :  35

currently on user :  36

currently on user :  37

currently on user :  38

currently on user :  39

currently on user :  40

currentl


currently on user :  321

currently on user :  322

currently on user :  323

currently on user :  324

currently on user :  325

currently on user :  326

currently on user :  327

currently on user :  328

currently on user :  329

currently on user :  330

currently on user :  331

currently on user :  332

currently on user :  333

currently on user :  334

currently on user :  335

currently on user :  336

currently on user :  337

currently on user :  338

currently on user :  339

currently on user :  340

currently on user :  341

currently on user :  342

currently on user :  343

currently on user :  344

currently on user :  345

currently on user :  346

currently on user :  347

currently on user :  348

currently on user :  349

currently on user :  350

currently on user :  351

currently on user :  352

currently on user :  353

currently on user :  354

currently on user :  355

currently on user :  356

currently on user :  357

currently on user :  358

currently o


currently on user :  637

currently on user :  638

currently on user :  639

currently on user :  640

currently on user :  641

currently on user :  642

currently on user :  643

currently on user :  644

currently on user :  645

currently on user :  646

currently on user :  647

currently on user :  648

currently on user :  649

currently on user :  650

currently on user :  651

currently on user :  652

currently on user :  653

currently on user :  654

currently on user :  655

currently on user :  656

currently on user :  657

currently on user :  658

currently on user :  659

currently on user :  660

currently on user :  661

currently on user :  662

currently on user :  663

currently on user :  664

currently on user :  665

currently on user :  666

currently on user :  667

currently on user :  668

currently on user :  669

currently on user :  670

currently on user :  671


### Below is just a test for a single file ( was user 1 when run)

In [61]:
X = user_training_data.loc[:, user_training_data.columns != 'rating']
y = user_training_data['rating']
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20) # 70% training and 30% test
#encoding y_train ( because I had some errors : ValueError: Unknown label type: 'continuous' )
lab_enc = LabelEncoder()
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.fit_transform(y_test)
 
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100, random_state = 42)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
if metrics.accuracy_score(y_test, y_pred) < 0.51 :
    print("Horrible")

Accuracy: 0.75
