In [2]:
# codigo do filme a ser classificado
movie = 'tt6644200'
name = 'A Quiet Place'

In [4]:
import re

In [5]:
# tag HTML que contem cada uma das reviews
padrao = re.compile(r'<div class="text show-more__control">(.+?)</div>', re.DOTALL)

# regex que acha cada palavra no documento
regex = r"[-'a-zA-ZÀ-ÖØ-öø-ÿ]+"

In [7]:
# leitura das stopwords
Stopwords = set([]) 
sw = open("stopwords.txt",'r')
for s in sw.readlines():
    Stopwords.add(s.strip().lower())
sw.close()

In [8]:
import csv

In [9]:
# leitura do arquivo csv de classificacoes
f = open('ratings-short.csv', 'r')
reader = csv.reader(f)
lines = list(reader)
f.close()
lines = lines[1:]

In [10]:
Document = dict([])
Title = dict([])
Rating = dict([])
Vocabulary = set([])

In [12]:
import requests

In [11]:
# funcao que 
def leFilme(movieCode, movieTitle, movieRating):
    url = 'https://www.imdb.com/title/' + movieCode + '/reviews'
    print(url)
    res = requests.get(url)
    if res.status_code == requests.codes.ok:
        conteudo = res.text
        reviews = padrao.findall(conteudo)
        if len(reviews) > 0:
            txt = '\n'.join(reviews).replace('&#39;', '\'').replace('<br/>', ' ').replace('&quot;', '\"').replace('&amp;', '&')
            txt = re.sub(r'[^\x00-\x7f]',r'', txt)
    words = re.findall(regex, txt)
    if len(words) > 0:
        Document[movieCode] = list()
        for w in words:
            if w not in Stopwords and len(w)>=3:
                Document[movieCode].append(w.lower())
        if len(Document[movieCode]) > 0:
            Vocabulary.update(Document[movieCode])
            Title[movieCode] = movieTitle
            Rating[movieCode] = movieRating
        else:
            del Document[movieCode]

In [13]:
# leitura das reviews
for line in lines:
    leFilme(line[0], line[3], line[1])
leFilme(movie, name, '0')

https://www.imdb.com/title/tt1019452/reviews
https://www.imdb.com/title/tt0103873/reviews
https://www.imdb.com/title/tt0106179/reviews
https://www.imdb.com/title/tt1082599/reviews
https://www.imdb.com/title/tt1119644/reviews
https://www.imdb.com/title/tt0113409/reviews
https://www.imdb.com/title/tt1179933/reviews
https://www.imdb.com/title/tt1182345/reviews
https://www.imdb.com/title/tt1183374/reviews
https://www.imdb.com/title/tt6644200/reviews


In [14]:
# estatisticas
D = len(Document)
V = len(Vocabulary)
print("{} documentos".format(D))
print("{} palavras".format(V))

10 documentos
8041 palavras


In [16]:
import numpy

In [17]:
# calculando a matriz de frequencias das palavras nas reviews
M = numpy.zeros((V, D))
documents  = list(Document.keys())
vocabulary = list(Vocabulary)
for j in range(0, D):
    d = documents[j]       
    print("{0:.0f}%".format(((j/D)*100)), end=' ')
    for i in range(0, V):
        w = vocabulary[i]       
        M[i,j] = float(Document[d].count(w))

0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 

In [18]:
# calculando a matriz TFIDF
idf = numpy.count_nonzero(M, axis=1)
idf = [D / x for x in idf]
idf = numpy.log(idf)
Sd = numpy.sum(M, axis=0)
TFIDF = numpy.copy(M)
for j in range(0, D): 
    TFIDF[:,j] = [x / Sd[j] for x in TFIDF[:,j]]
    TFIDF[:,j] = numpy.multiply(TFIDF[:,j], idf)

In [20]:
from scipy.spatial import distance

In [21]:
# distancia entre documentos
dist_f = {'euclidean': distance.euclidean, 
          'chebyshev': distance.chebyshev, 
          'cosine': distance.cosine}
dist = {'euclidean': numpy.ones(D-1) * numpy.nan, 
          'chebyshev': numpy.ones(D-1) * numpy.nan, 
          'cosine': numpy.ones(D-1) * numpy.nan,
          'tf-idf': numpy.ones(D-1) * numpy.nan}

In [22]:
for i in range(0, D-1): 
    for df in dist_f.keys():
        dist[df][i] = dist_f[df](M[:,i], M[:,-1])
    # similaridade de cosseno entre documentos por TF-IDF
    dist['tf-idf'][i] = distance.cosine(TFIDF[:,i], TFIDF[:,-1])

In [23]:
# similaridade entre documentos (1 - distancia normalizada)
similarity = dict()
for df in dist.keys():
    similarity[df] = 1 - (dist[df] - numpy.nanmin(dist[df])) / (numpy.nanmax(dist[df]) - numpy.nanmin(dist[df]))

In [24]:
# exibicao dos resultados
for df in dist.keys():
    idx = numpy.argsort(similarity[df])
    txt = '\nTitulos mais similares por distancia ' + df + ' e suas notas:'
    for i in range(-1, -6, -1):
        txt += '\n{1} - {0} ({2:.2f})'.format(Title[documents[idx[i]]], Rating[documents[idx[i]]].rjust(2), similarity[df][idx[i]])
    
    print(txt)


Titulos mais similares por distancia euclidean e suas notas:
 4 - Pet (1.00)
 8 - 10 Cloverfield Lane (0.90)
 6 - Stag Night (0.85)
 5 - Braindead (0.84)
 1 - A Serious Man (0.57)

Titulos mais similares por distancia chebyshev e suas notas:
 5 - Braindead (1.00)
 4 - Pet (0.98)
 6 - Stag Night (0.84)
 1 - A Serious Man (0.76)
 8 - 10 Cloverfield Lane (0.73)

Titulos mais similares por distancia cosine e suas notas:
 8 - 10 Cloverfield Lane (1.00)
 4 - Pet (0.93)
 1 - A Serious Man (0.86)
 6 - Stag Night (0.86)
 4 - In the Mouth of Madness (0.69)

Titulos mais similares por distancia tf-idf e suas notas:
 8 - 10 Cloverfield Lane (1.00)
 7 - Moon (0.16)
 1 - A Serious Man (0.15)
 9 - Fringe (0.11)
 4 - In the Mouth of Madness (0.10)
