In [263]:
import csv,json
from elasticsearch import helpers, Elasticsearch
import pandas as pd
# Module-level Elasticsearch client shared by load_csv_to_es and search_query;
# it targets a node on localhost, port 9200.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


In [264]:
def load_csv_to_es(file_name,index_name):
    """Bulk-index every row of a CSV file into an Elasticsearch index.

    Each CSV row is read as a dict keyed by the header row and handed to
    ``helpers.bulk`` as one document.

    Args:
        file_name: Path of the UTF-8 encoded CSV file to load.
        index_name: Name of the target Elasticsearch index.
    """
    # newline="" hands newline handling to the csv module, as its documentation
    # requires; otherwise quoted fields containing "\r\n" are silently rewritten.
    with open(file_name,encoding="utf8",newline="") as f:
        reader = csv.DictReader(f)
        # The reader is consumed lazily, so the bulk call must stay inside the
        # `with` block while the file is still open.
        helpers.bulk(es, reader, index=index_name, doc_type='csv')

In [265]:
# Load the CSV files into Elasticsearch (calls are commented out; uncomment to index)
#load_csv_to_es('source/movies.csv','movies')
#load_csv_to_es('source/ratings.csv','ratings')

In [266]:
def search_query(index,query,searchColumn,showColumn=None,size=999):
    """Run a ``match`` query against one field of an Elasticsearch index.

    Args:
        index: Name of the index to search.
        query: Text to match.
        searchColumn: Document field the ``match`` query is applied to.
        showColumn: Optional field name; when given, that field of every hit's
            ``_source`` is printed, one per line.
        size: Maximum number of hits requested from Elasticsearch. Defaults to
            999, the limit that used to be hard-coded; pass a larger value
            when a query may legitimately match more documents.

    Returns:
        The list of hit dicts found under ``["hits"]["hits"]`` in the response.
    """
    query_body = {"query": {"match": {searchColumn: query}}}
    response = es.search(index=index, body=query_body, size=size)
    hits = response["hits"]["hits"]

    if showColumn is not None:
        for hit in hits:
            print(hit['_source'][showColumn])

    return hits

In [267]:
# Search the 'movies' index for titles matching 'toy'. Passing showColumn makes
# search_query print each hit's movieId. The hits are kept in the notebook-level
# `result`, which later cells pass to the scoring functions.
result=search_query('movies','toy','title',showColumn='movieId')

1
4929
5843
3114
78499
106022


In [268]:
def getUserCluster(userId):
    """Return the userIds that share a cluster with ``userId``.

    The mapping is read from 'UsersClusters.csv', which must provide the
    columns 'userId' and 'cluster'.

    Args:
        userId: User id to look up, compared against the 'userId' column.

    Returns:
        A list with every userId in the same cluster (including ``userId``
        itself), or None when the user does not appear in the file.
    """
    cluster_map = pd.read_csv('UsersClusters.csv')

    # Look the user's cluster up directly. The previous version enumerated
    # range(max(cluster)), which skipped the highest-numbered cluster, so
    # users in it were never found.
    own_clusters = cluster_map.loc[cluster_map['userId'] == userId, 'cluster']
    if own_clusters.empty:
        return None

    # If a user is listed more than once, prefer the lowest cluster label,
    # which is the cluster the old in-order scan would have reached first.
    cluster_id = own_clusters.min()
    return cluster_map.loc[cluster_map['cluster'] == cluster_id, 'userId'].tolist()
            

In [269]:
def getBM25score(movies):
    """Map each hit's movieId to its relevance score, scaled to a maximum of 1.

    Args:
        movies: Iterable of Elasticsearch hit dicts, each providing
            ``['_source']['movieId']`` and ``['_score']``.

    Returns:
        A dict ``{movieId: score / max_score}`` ordered by descending score.
        An empty input gives an empty dict; if the largest score is 0 the
        scores are returned unscaled instead of dividing by zero.
    """
    scores = {hit['_source']['movieId']: hit['_score'] for hit in movies}

    # normalising (skipped when there is nothing to scale by)
    maxScore = max(scores.values(), default=0)
    if maxScore:
        scores = {movieId: score / maxScore for movieId, score in scores.items()}

    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [270]:
def getAverageRating(movies):
    """Map each hit's movieId to its mean rating, scaled to a maximum of 1.

    For every movie the 'ratings' index is queried through ``search_query``
    (with its default hit limit) and the 'rating' values of the returned
    documents are averaged.

    Args:
        movies: Sequence of Elasticsearch hit dicts providing
            ``['_source']['movieId']``.

    Returns:
        A dict ``{movieId: average / max_average}`` ordered by descending
        value. Movies without any rating hit get 0.0 instead of raising
        ZeroDivisionError, and an empty input gives an empty dict.
    """
    averages = {}
    for movie in movies:
        movieId = movie['_source']['movieId']
        rating_hits = search_query('ratings', movieId, 'movieId', showColumn=None)
        if rating_hits:
            total = sum(float(hit['_source']['rating']) for hit in rating_hits)
            averages[movieId] = total / len(rating_hits)
        else:
            # No ratings indexed for this movie: treat it as unrated.
            averages[movieId] = 0.0

    # normalising (skipped when every average is 0 or there are no movies)
    maxRating = max(averages.values(), default=0)
    if maxRating:
        averages = {movieId: value / maxRating for movieId, value in averages.items()}

    return dict(sorted(averages.items(), key=lambda item: item[1], reverse=True))

In [271]:
def getKmeansAverageMovieScore(result,user):
    """Score each hit by the mean rating given by the users in ``user``'s cluster.

    The cluster comes from ``getUserCluster``; ratings are read from
    'source/ratings.csv' (columns 'userId', 'movieId', 'rating').

    Args:
        result: Iterable of Elasticsearch hit dicts providing
            ``['_source']['movieId']`` (convertible with ``int``).
        user: User id whose cluster is used.

    Returns:
        A dict ``{movieId: mean_rating / max_mean}`` ordered by descending
        value. Movies nobody in the cluster rated score 0.0. If no movie has
        a cluster rating (or the user has no cluster) every score is 0.0
        rather than a ZeroDivisionError/TypeError; empty input gives ``{}``.
    """
    cluster = getUserCluster(user)
    if cluster is None:
        # Unknown user: no cluster mates, so nothing can be predicted.
        cluster = []

    ratings = pd.read_csv('source/ratings.csv')

    # One pass over the ratings instead of a full scan per (movie, user) pair.
    # Only the first rating of a user for a movie counts, as before.
    cluster_ratings = (
        ratings[ratings['userId'].isin(cluster)]
        .drop_duplicates(subset=['userId', 'movieId'], keep='first')
    )
    mean_by_movie = cluster_ratings.groupby('movieId')['rating'].mean().to_dict()

    scores = {}
    for hit in result:
        movieId = hit['_source']['movieId']
        scores[movieId] = float(mean_by_movie.get(int(movieId), 0.0))

    # normalising (skipped when there is nothing to scale by)
    maxScore = max(scores.values(), default=0)
    if maxScore:
        scores = {movieId: score / maxScore for movieId, score in scores.items()}

    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [272]:
# Normalised cluster-average scores of the 'toy' search hits for user 1.
getKmeansAverageMovieScore(result,1)

{'1': 1.0,
 '78499': 0.8421052631578947,
 '3114': 0.7719298245614035,
 '4929': 0.631578947368421,
 '5843': 0.0,
 '106022': 0.0}

In [273]:
def getUserRating(movies,userId):
    """Map each hit's movieId to the rating ``userId`` gave it, scaled to a maximum of 1.

    For every movie the 'ratings' index is queried through ``search_query``
    (with its default hit limit) and the returned documents are scanned for
    one whose 'userId' equals ``str(userId)``.

    Args:
        movies: Sequence of Elasticsearch hit dicts providing
            ``['_source']['movieId']``.
        userId: Id of the user whose own ratings are wanted.

    Returns:
        A dict ``{movieId: rating / max_rating}`` ordered by descending value,
        with 0 for movies the user has not rated. When the user rated none of
        the movies (or ``movies`` is empty) the all-zero dict is returned
        as is, in input order.
    """
    wanted_user = str(userId)
    user_ratings = {}
    for movie in movies:
        movieId = movie['_source']['movieId']
        rating_hits = search_query('ratings', movieId, 'movieId', showColumn=None)

        # Default to "not rated"; overwritten by a matching rating (the last
        # matching hit wins if there are several).
        user_ratings[movieId] = 0
        for hit in rating_hits:
            if hit['_source']['userId'] == wanted_user:
                user_ratings[movieId] = float(hit['_source']['rating'])

    # normalising; default=0 keeps an empty input from raising ValueError
    maxScore = max(user_ratings.values(), default=0)
    if maxScore == 0:
        return user_ratings

    user_ratings = {movieId: value / maxScore for movieId, value in user_ratings.items()}
    return dict(sorted(user_ratings.items(), key=lambda item: item[1], reverse=True))

In [303]:
def getUserDeterminedOrPredicted(result,userId):
    """Combine the user's own ratings with cluster-based predictions.

    For every movie scored by ``getUserRating`` the user's own value is kept
    unless it is 0, in which case the value from
    ``getKmeansAverageMovieScore`` is used instead.

    Args:
        result: Elasticsearch hit dicts, passed through to both scorers.
        userId: Id of the user the scores are computed for.

    Returns:
        A dict ``{movieId: score}`` in the key order of ``getUserRating``.
    """
    own_scores = getUserRating(result, userId)
    cluster_scores = getKmeansAverageMovieScore(result, userId)
    # The prediction is only looked up for movies the user has not rated.
    return {
        movieId: cluster_scores[movieId] if own == 0 else own
        for movieId, own in own_scores.items()
    }

In [306]:
def getTotalScore(result,userId,verbose=True):
    """Combine relevance, user and average-rating scores into one ranking.

    Per movie the three normalised scores (``getBM25score``,
    ``getUserDeterminedOrPredicted``, ``getAverageRating``) are summed and
    divided by 3, or by 2 when the user score is 0 so that a missing user
    signal does not drag the total down.

    Args:
        result: Elasticsearch hit dicts to score.
        userId: Id of the user the ranking is personalised for.
        verbose: When True (the default, matching the previous behaviour) the
            individual component scores are printed for every movie; pass
            False to get the ranking without console output.

    Returns:
        A dict ``{movieId: total_score}`` ordered by descending total score.
    """
    bm25 = getBM25score(result)
    userRating = getUserDeterminedOrPredicted(result, userId)
    avrgRating = getAverageRating(result)

    totalScore = {}
    for movieId, bm25_score in bm25.items():
        user_score = userRating[movieId]
        average_score = avrgRating[movieId]

        if verbose:
            print('movieId:',movieId)
            print('bm25-->',bm25_score)
            print('user Rating-->',user_score)
            print('average Rating-->',average_score)

        # A user score of 0 means "no signal", so average over the other two.
        components = 2 if user_score == 0 else 3
        totalScore[movieId] = (bm25_score + user_score + average_score) / components

    return dict(sorted(totalScore.items(), key=lambda item: item[1], reverse=True))

In [307]:
# Combined ranking of the 'toy' search hits for user 1. getTotalScore prints the
# per-movie component scores; the bare `total_score` expression then displays
# the final dict.
total_score=getTotalScore(result,userId=1)
total_score

movieId: 1
bm25--> 1.0
user Rating--> 1.0
average Rating--> 0.9511328929611479
movieId: 4929
bm25--> 1.0
user Rating--> 0.631578947368421
average Rating--> 0.6631578947368422
movieId: 5843
bm25--> 1.0
user Rating--> 0.0
average Rating--> 0.9824561403508772
movieId: 3114
bm25--> 0.9019450273021845
user Rating--> 0.7719298245614035
average Rating--> 0.9441403508771931
movieId: 78499
bm25--> 0.9019450273021845
user Rating--> 0.8421052631578947
average Rating--> 1.0
movieId: 106022
bm25--> 0.8214024999162064
user Rating--> 0.0
average Rating--> 0.9824561403508772


{'5843': 0.9912280701754386,
 '1': 0.9837109643203826,
 '78499': 0.9146834301533597,
 '106022': 0.9019293201335419,
 '3114': 0.8726717342469271,
 '4929': 0.7649122807017544}

In [277]:
def createIdNameDict(movies):
    """Build a ``{movieId: title}`` lookup from Elasticsearch hits.

    Args:
        movies: Iterable of hit dicts providing ``['_source']['movieId']``
            and ``['_source']['title']``.

    Returns:
        A dict mapping every movieId to its title; a later hit with the same
        movieId replaces an earlier one.
    """
    return {hit['_source']['movieId']: hit['_source']['title'] for hit in movies}
    

In [278]:
def createIdTitleDict(scores,movies=None):
    """Print one "<title> : <score>" line per movie in ``scores``.

    Despite its name, this function prints and returns nothing.

    Args:
        scores: Dict ``{movieId: score}``; lines are printed in its key order.
        movies: Elasticsearch hit dicts used to resolve movieIds to titles.
            When omitted, the notebook-level variable ``result`` is used, as
            before; passing the hits explicitly avoids relying on that
            hidden state.
    """
    if movies is None:
        # Backward-compatible fallback to the notebook-level search result.
        movies = result
    titles = createIdNameDict(movies)
    for movieId in scores:
        print(titles[movieId],':\t ',scores[movieId])

In [300]:
# Print the final ranking with movie titles instead of movieIds.
createIdTitleDict(total_score)

Toy Soldiers (1991) :	  0.9912280701754386
Toy Story (1995) :	  0.9837109643203826
Toy Story 3 (2010) :	  0.9146834301533597
Toy Story of Terror (2013) :	  0.9019293201335419
Toy Story 2 (1999) :	  0.8726717342469271
Toy, The (1982) :	  0.7649122807017544
