In [1]:
import csv,json
from elasticsearch import helpers, Elasticsearch
# Module-level Elasticsearch client used by the helper functions in this notebook;
# it targets a node on localhost, port 9200.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


Για να μην εμφανίζει το Elasticsearch σφάλματα λόγω χαμηλού ελεύθερου χώρου στον δίσκο, εκτελώ στο bash τις παρακάτω εντολές:
curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_cluster/settings -d '{ "transient": { "cluster.routing.allocation.disk.threshold_enabled": false } }'
και
curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'


In [2]:
def load_csv_to_es(file_name,index_name):
    """Bulk-index every row of a CSV file into an Elasticsearch index.

    The first CSV row supplies the field names (``csv.DictReader``), so each
    following row becomes one document whose values are all strings.

    Args:
        file_name: Path of the UTF-8 encoded CSV file to read.
        index_name: Name of the index that receives the documents.
    """
    # newline='' is what the csv module requires of its input file: without it
    # a "\r\n" inside a quoted field is rewritten to "\n" before csv sees it.
    with open(file_name, encoding="utf8", newline='') as csv_file:
        rows = csv.DictReader(csv_file)
        # The reader is lazy, so the bulk call must finish while the file is open.
        # NOTE(review): doc_type is kept unchanged for compatibility with the
        # existing index; mapping types are deprecated in newer Elasticsearch
        # releases - confirm against the cluster version before removing it.
        helpers.bulk(es, rows, index=index_name, doc_type='csv')

In [3]:
# Load the CSV files into Elasticsearch (one-off step; uncomment the calls below to run it).
#load_csv_to_es('source/movies.csv','movies')
#load_csv_to_es('source/ratings.csv','ratings')

Στη συνέχεια θα γίνει αναζήτηση για την είσοδο που θα δώσει ο χρήστης.

In [4]:
def get_columns(index):
    """Print the field names found in the mapping of ``index``.

    The names are read from the ``properties`` section of the mapping returned
    by the cluster and printed as a single list; nothing is returned.
    """
    properties = es.indices.get_mapping(index)[index]['mappings']['properties']
    field_names = list(properties)
    print("     ",field_names)
# Show which fields the 'ratings' index exposes.
get_columns('ratings')

      ['movieId', 'rating', 'timestamp', 'userId']


In [5]:
def search_query(index,query,searchColumn,showColumn=None,size=999):
    """Run a ``match`` query against one field and return the raw hits.

    Args:
        index: Name of the index to search.
        query: Text to match.
        searchColumn: Field the ``match`` query is applied to.
        showColumn: If given, the value of this ``_source`` field is printed
            for every hit; a hit that lacks the field raises ``KeyError``.
        size: Maximum number of hits requested from Elasticsearch. Defaults to
            999, so matches beyond that count are not returned.

    Returns:
        The list found under ``["hits"]["hits"]`` of the search response.
    """
    query_body = {
        "query": {
            "match": {
                searchColumn: query
            }
        }
    }
    response = es.search(index=index, body=query_body, size=size)
    hits = response["hits"]["hits"]
    if showColumn is not None:
        for hit in hits:
            print("     ",hit['_source'][showColumn])

    return hits

In [6]:
# Search the movie titles for 'toy', printing each matching title; the hits are
# kept in `result` because the scoring cells below take them as input.
result=search_query('movies','toy','title',showColumn='title')

      Toy Story (1995)
      Toy, The (1982)
      Toy Soldiers (1991)
      Toy Story 2 (1999)
      Toy Story 3 (2010)
      Toy Story of Terror (2013)


In [7]:
def getAverageRating(movies):
    """Compute each movie's mean rating, scaled by the best mean found.

    Ratings are fetched with ``search_query`` on the 'ratings' index, so a
    mean only covers the hits that call returns.

    Args:
        movies: Search hits, each carrying ``['_source']['movieId']``.

    Returns:
        A dict ``{movieId: normalised average}`` ordered from highest to
        lowest. Values are divided by the largest average; when that maximum
        is 0 (or ``movies`` is empty) the values are returned unscaled. A
        movie with no rating hits gets 0.0.
    """
    print("      ",len(movies),"movies found")
    print("     ","Average rating of every movie")
    averages = {}
    for movie in movies:
        print("     ","============================================================")
        movieId = movie['_source']['movieId']
        rating_hits = search_query('ratings',movieId,'movieId',showColumn=None)
        if rating_hits:
            total = sum(float(hit['_source']['rating']) for hit in rating_hits)
            average = total / len(rating_hits)
        else:
            # No ratings at all: avoid dividing by zero and rank the movie last.
            average = 0.0
        print("     ",'movieId:',movieId,'-->',average)
        averages[movieId] = average

    # Normalise by the best average; default=0 keeps an empty input from raising.
    maxRating = max(averages.values(), default=0)
    if maxRating:
        averages = {movieId: value / maxRating for movieId, value in averages.items()}
    return dict(sorted(averages.items(), key=lambda item: item[1], reverse=True))
        
        

In [8]:
# Normalised average rating for every movie in `result`.
getAverageRating(result)

       6 movies found
      Average rating of every movie
      movieId: 1 --> 3.8724696356275303
      movieId: 4929 --> 2.7
      movieId: 5843 --> 4.0
      movieId: 3114 --> 3.844
      movieId: 78499 --> 4.071428571428571
      movieId: 106022 --> 4.0


{'78499': 1.0,
 '5843': 0.9824561403508772,
 '106022': 0.9824561403508772,
 '1': 0.9511328929611479,
 '3114': 0.9441403508771931,
 '4929': 0.6631578947368422}

In [9]:
def getBM25score(movies):
    """Return each hit's relevance score, scaled by the best score.

    Args:
        movies: Search hits, each carrying ``['_source']['movieId']`` and
            ``['_score']``. NOTE(review): the function name implies these are
            BM25 scores; that depends on the index's similarity setting, which
            is not visible here.

    Returns:
        A dict ``{movieId: normalised score}`` ordered from highest to lowest.
        Scores are divided by the largest score; when that maximum is 0 (or
        ``movies`` is empty) the values are returned unscaled.
    """
    scores = {hit['_source']['movieId']: hit['_score'] for hit in movies}
    # default=0 keeps an empty hit list from raising; a zero maximum would
    # otherwise cause a division by zero.
    maxScore = max(scores.values(), default=0)
    if maxScore:
        scores = {movieId: score / maxScore for movieId, score in scores.items()}
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [10]:
# Normalised relevance score for every movie in `result`.
getBM25score(result)

{'1': 1.0,
 '4929': 1.0,
 '5843': 1.0,
 '3114': 0.9019450273021845,
 '78499': 0.9019450273021845,
 '106022': 0.8214024999162064}

In [11]:
def getUserRating(movies,userId):
    """Look up the rating one user gave to each of the given movies.

    For every movie the rating hits are fetched with ``search_query`` and
    filtered by user id, compared as a string. If the user appears more than
    once for a movie, the last matching hit is used.

    Returns:
        A dict ``{movieId: rating}`` holding the rating as a float, or 0 when
        the user has no rating hit for that movie.
    """
    wanted_user = str(userId)
    ratings_by_movie = {}
    for movie in movies:
        print("     ","============================================================")
        movieId = movie['_source']['movieId']
        rating_hits = search_query('ratings',movieId,'movieId',showColumn=None)
        user_ratings = [
            float(hit['_source']['rating'])
            for hit in rating_hits
            if hit['_source']['userId'] == wanted_user
        ]
        ratings_by_movie[movieId] = user_ratings[-1] if user_ratings else 0

    return ratings_by_movie

In [12]:
# Kept disabled: uncommenting the next line deletes the whole 'ratings' index.
#es.indices.delete(index='ratings')
# Ratings that user 7 gave to the movies in `result` (0 where none was found).
getUserRating(result,7)



{'1': 3.0, '4929': 0, '5843': 0, '3114': 0, '78499': 0, '106022': 0}