In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import requests
import math
import numpy as np
import csv

In [2]:
def avgRating(row):
    """Return the mean of the ratings present in a one-row DataFrame.

    Entries equal to -1 mark films the user has not rated and are skipped.
    The number of films is taken from ``row`` itself, so the function does
    not depend on the notebook-level ``numberOfFilms`` having been defined.

    Args:
        row: single-row DataFrame of ratings, one column per film.

    Returns:
        Average of the rated entries, rounded to 3 decimal places.

    Raises:
        ZeroDivisionError: if the row contains no rated film.
    """
    ratings = [value for value in row.iloc[0].tolist() if value != -1]
    if not ratings:
        # Same exception type the bare division used to raise, with a
        # message that says what actually went wrong.
        raise ZeroDivisionError("cannot average a row with no rated films")
    return round(sum(ratings) / len(ratings), 3)

In [3]:
def findNotRated(variant):
    """Build the list of film positions the given user has not rated.

    Reads row ``variant - 1`` of the notebook-level ``data`` frame and treats
    a value of -1 as "not rated".

    Args:
        variant: 1-based user number.

    Returns:
        List of 0-based column positions, in ascending order.
    """
    userRow = variant - 1
    return [film for film in range(numberOfFilms) if data.iat[userRow, film] == -1]

In [4]:
def calculateSim(user1, user2):
    """Compute the sim value (cosine measure) of two users over co-rated films.

    Only films that both users have rated (value != -1) contribute. The
    number of films is taken from the rows themselves rather than from the
    notebook-level ``numberOfFilms``.

    Args:
        user1: single-row DataFrame with the first user's ratings.
        user2: single-row DataFrame with the second user's ratings.

    Returns:
        The similarity rounded to 3 decimal places, or 0.0 when the users
        have no film in common (or the denominator rounds to zero), instead
        of raising ZeroDivisionError.
    """
    shared = [
        (u, v)
        for u, v in zip(user1.iloc[0].tolist(), user2.iloc[0].tolist())
        if u != -1 and v != -1
    ]
    uvSum = sum(u * v for u, v in shared)
    u2Sum = sum(u ** 2 for u, _ in shared)
    v2Sum = sum(v ** 2 for _, v in shared)

    # The denominator is rounded to 3 decimals before dividing, exactly as
    # before, so existing results stay reproducible.
    denominator = round(math.sqrt(u2Sum) * math.sqrt(v2Sum), 3)
    if denominator == 0:
        return 0.0
    return round(uvSum / denominator, 3)

In [5]:
def findSims(data, myUser):
    """Build the list of sim values between ``myUser`` and every user.

    Users are addressed by the labels 'User 1' .. 'User N', where N is the
    number of rows of ``data``; entry ``i - 1`` of the result belongs to
    'User i'. The entry for ``myUser`` itself is 0, so the user is never
    picked as their own neighbour.

    The function now relies only on its arguments: the self-entry is found
    by comparing labels with ``myUser`` (not with the global ``variant``),
    and N comes from ``data`` (not from the global ``numberOfUsers``).

    Args:
        data: ratings DataFrame indexed by user label.
        myUser: label of the target user, e.g. 'User 24'.

    Returns:
        List of similarities, one per user.
    """
    # The target row does not change between iterations, so look it up once.
    targetRow = data.loc[data.index == myUser]

    sims = []
    for i in range(1, data.shape[0] + 1):
        label = 'User ' + str(i)
        if label == myUser:
            sims.append(0)
        else:
            sims.append(calculateSim(targetRow, data.loc[data.index == label]))
    return sims

In [6]:
def makeRate(filmIndex, k, maxSims, sims):
    """Predict the target user's rating of a film from similar users' ratings.

    The prediction is the target user's mean rating plus the
    similarity-weighted average deviation of each neighbour's rating from
    that neighbour's own mean. Neighbours who have not rated the film
    (value -1) are skipped. Reads the notebook-level ``data`` and ``myUser``.

    Args:
        filmIndex: 0-based column position of the film in ``data``.
        k: maximum number of neighbours to use.
        maxSims: 0-based row positions of the most similar users.
        sims: similarity of every user to the target user, indexed by row
            position.

    Returns:
        The predicted rating as a float rounded to 3 decimal places. When no
        neighbour has rated the film (or all their similarities are 0) the
        target user's mean rating is returned instead of raising
        ZeroDivisionError.
    """
    ru = avgRating(data.loc[data.index == myUser])
    sumUp, sumDown = 0, 0

    # Slicing tolerates fewer than k neighbours.
    for neighbour in maxSims[:k]:
        rating = data.iat[neighbour, filmIndex]
        if rating == -1:
            continue
        # Row position p corresponds to the label 'User <p + 1>'.
        neighbourMean = avgRating(data.loc[data.index == 'User ' + str(neighbour + 1)])
        sumUp += sims[neighbour] * (rating - neighbourMean)
        sumDown += abs(sims[neighbour])

    if sumDown == 0:
        # Nothing to adjust with: fall back to the user's own average.
        return float(ru)

    return float(format(round(ru + round(sumUp / sumDown, 3), 3), '.3f'))

In [7]:
def onWeekend(maxSims, film):
    """Weekend-viewing coefficient of a film among the nearest users.

    Counts the users in ``maxSims`` whose entry in the notebook-level
    ``context_day`` table for this film is " Sat" or " Sun" (the values carry
    a leading space) and converts that share into a score: share * 100 / 3.

    The share is taken over ``len(maxSims)`` rather than a hard-coded 4, so
    the result stays correct if the neighbourhood size changes; for four
    neighbours the value is unchanged.

    Args:
        maxSims: 0-based row positions of the nearest users.
        film: 0-based column position of the film.

    Returns:
        The score rounded to 3 decimal places; 0.0 for an empty ``maxSims``.
    """
    neighbours = len(maxSims)
    if neighbours == 0:
        return 0.0
    weekendViews = sum(
        1 for user in maxSims if context_day.iat[user, film] in (" Sat", " Sun")
    )
    return round(weekendViews / neighbours * 100 / 3, 3)

In [8]:
def atHome(maxSims, film):
    """At-home-viewing coefficient of a film among the nearest users.

    Counts the users in ``maxSims`` whose entry in the notebook-level
    ``context_place`` table for this film is " h" (the value carries a
    leading space) and converts that share into a score: share * 100 / 3.

    The share is taken over ``len(maxSims)`` rather than a hard-coded 4, so
    the result stays correct if the neighbourhood size changes; for four
    neighbours the value is unchanged.

    Args:
        maxSims: 0-based row positions of the nearest users.
        film: 0-based column position of the film.

    Returns:
        The score rounded to 3 decimal places; 0.0 for an empty ``maxSims``.
    """
    neighbours = len(maxSims)
    if neighbours == 0:
        return 0.0
    homeViews = sum(1 for user in maxSims if context_place.iat[user, film] == " h")
    return round(homeViews / neighbours * 100 / 3, 3)

In [9]:
def recommendation(data):
    """Pick the unrated film with the highest combined score.

    For every film that user ``variant`` has not rated, three components are
    summed with equal weight (each is scaled by 100 / 3):
      * the predicted rating from ``makeRate`` divided by 5
        (presumably the top of the rating scale - TODO confirm),
      * the weekend-viewing coefficient from ``onWeekend``,
      * the at-home-viewing coefficient from ``atHome``.
    The k = 4 users with the largest sim values serve as neighbours.

    Reads the notebook-level ``variant`` and ``myUser``; ``data`` is passed
    on to ``findSims``.

    Args:
        data: ratings DataFrame indexed by user label.

    Returns:
        1-based number of the best film (the first one on ties), or None
        when the user has no unrated films.
    """
    k = 4
    notRated = findNotRated(variant)
    sims = findSims(data, myUser)
    # argsort is ascending, so the last k positions are the k largest sims.
    maxSims = np.array(sims).argsort()[-k:]

    rated = {}
    for film in notRated:
        ratingShare = round(makeRate(film, k, maxSims, sims) / 5 * 100 / 3, 3)
        rated[film] = ratingShare + onWeekend(maxSims, film) + atHome(maxSims, film)

    if not rated:
        return None

    # One pass over the scores; max() keeps the first key on ties, matching
    # the previous scan in insertion order.
    return max(rated, key=rated.get) + 1

In [10]:
# Variant 24 was the one assigned.
variant = 24
myUser = 'User ' + str(variant)

# Ratings table plus the two viewing-context tables (day and place).
data = pd.read_csv('data.csv', index_col=0).copy()
context_day = pd.read_csv('context_day.csv', index_col=0)
context_place = pd.read_csv('context_place.csv', index_col=0)

# Rows are users, columns are films.
numberOfUsers, numberOfFilms = data.shape

# In this task the film is chosen among those my variant has not watched,
# based on the data of the users closest by sim. Rating, place and time of
# viewing count equally towards the choice.
res2 = recommendation(data)

In [11]:
# Translate the 1-based film number chosen by the recommender into a title.
films = pd.read_csv(
    'movie_names.csv',
    names=['Movie', 'Name'],
    index_col=0,
)
# 'Movie' becomes the index, so position 0 is the 'Name' column; strip the
# padding around the title.
movie_name = films.iloc[res2 - 1, 0].strip()
print("Film: " + movie_name)

Film: Forrest Gump


In [12]:
# URL of the Wikidata web API that the entity search request is sent to.
API_ENDPOINT = "https://www.wikidata.org/w/api.php"

In [13]:
# Query-string parameters for the entity search: look up the recommended
# film's title (`movie_name`) with the `wbsearchentities` action, in English,
# and ask for a JSON response.
params = {
    'action' : 'wbsearchentities',
    'format' : 'json',
    'language' : 'en',
    'search': movie_name
}

In [14]:
# Search Wikidata for the film title and keep the id of the first hit.
# A timeout keeps the cell from hanging on a stalled connection.
res = requests.get(API_ENDPOINT, params=params, timeout=30)
res.raise_for_status()  # fail on an HTTP error instead of on a confusing JSON/key error

matches = res.json().get('search', [])
if not matches:
    raise ValueError("Wikidata returned no entity for " + repr(params['search']))

myfilm = matches[0]['id']
print(myfilm)

Q134773


In [15]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# The entity id is spliced into the query text, so make sure it really is a
# plain Wikidata item id ("Q" followed by digits) before using it.
if not (myfilm.startswith("Q") and myfilm[1:].isdigit()):
    raise ValueError("unexpected Wikidata entity id: " + repr(myfilm))

# Select the cast members (P161) of the film (instance of Q11424) found above
# and drop everyone who has a date of death (P570). Both FILTERs use `myfilm`;
# previously the outer one was pinned to wd:Q134773, so the query ignored the
# actual recommendation. The '#' line inside the string is a SPARQL comment
# that is sent with the query ("exclude the actors who have died").
sparql_query = """

SELECT ?actorLabel
WHERE {
  ?film wdt:P31 wd:Q11424.
  ?film wdt:P161 ?actor.
  FILTER(?film = wd:""" + myfilm + """)

  # из всех актёров фильма исключаем тех, кто умер
  MINUS {
    ?film wdt:P31 wd:Q11424.
    ?film wdt:P161 ?actor.
    ?actor p:P570/psv:P570 ?death_date_point.
    FILTER(?film = wd:""" + myfilm + """)
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".}
 }

"""

sparql.setQuery(sparql_query)

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [16]:
# Flatten the SPARQL JSON bindings into a table. `pd.json_normalize` is the
# supported top-level entry point; the `pd.io.json.json_normalize` alias is
# deprecated and emits a warning.
results_df = pd.json_normalize(results['results']['bindings'])
results_df[['actorLabel.value']].head()

  results_df = pd.io.json.json_normalize(results['results']['bindings'])


Unnamed: 0,actorLabel.value
0,Lonnie Hamilton
1,Teresa Denton
2,Tom Hanks
3,Hanna R. Hall
4,Kurt Russell
