## Trabalho Final

In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

from unicodedata import normalize

nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pickle
import tensorflow as tf
import gensim
from gensim.models import Word2Vec
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics



In [3]:
def trata_caracteres(txt):
    txt=[normalize('NFKD',x).encode('ASCII','ignore').decode('ASCII') for x in txt]
    txt=[re.sub('[^a-z|\ ]','',str.lower(x)) for x in txt]
    return txt

In [4]:
stop_words = set(stopwords.words('english'))
def remove_stop_words(txt):
    texto = CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
    texto.fit(txt)
    return texto

In [5]:
def tratar_texto(txt,metodo=''):
    txt = trata_caracteres(txt)
    try:
        texto = remove_stop_words(txt)
        lista_txt = texto.get_feature_names()
    except:
        lista_txt = txt
    if metodo=='lemma':
        lem = WordNetLemmatizer()
        for part_of_speech in ['a', 's', 'r', 'n', 'v']:
            tms = [lem.lemmatize(a,part_of_speech) for a in lista_txt]
    if metodo == 'stem':
        ps = PorterStemmer()
        tms = [ps.stem(a) for a in lista_txt]
    if metodo == '':
        tms = txt
    return " ".join(tms)

In [6]:
def processamento_text(df):
    return df.apply(lambda x: tratar_texto([x],'stem'))

In [7]:
def compute_DTM(df):
    vect = CountVectorizer()
    vect.fit(df)
    o = vect.transform(df)
    return pd.DataFrame(o.A, columns=vect.get_feature_names())

In [8]:
def compute_DTM_ajustado(df,queries):
    vect = CountVectorizer()
    vect.fit(df)
    o = vect.transform(queries)
    return pd.DataFrame(o.A, columns=vect.get_feature_names())

In [9]:
def remove_ano(txt):
    txt=re.sub(r'(\([^a-z]*\))','',str(txt))
    return txt

In [10]:
data = pd.read_csv('movies.csv',index_col=0)

In [11]:
query = ['toy story', 'the lion king','alladin','beauty and the best','cinderella','little mermaid','hercules']

In [12]:
data['Ntitle'] = processamento_text(data.title)

In [13]:
X = compute_DTM(data.Ntitle)

In [14]:
N = len(query)

In [15]:
xij = compute_DTM_ajustado(data.Ntitle,processamento_text(pd.DataFrame(query)[0]))

In [16]:
m = X.mean(axis=0)
c = 2
B = c*(1-m)
a = c*m

In [17]:
a_til = a + xij.sum(axis=0)

In [18]:
B_til = B + N - xij.sum(axis=0)

In [19]:
q = np.log(a_til) - np.log(a) - np.log(B_til) + np.log(B)

In [20]:
nc = np.log(a + B) - np.log(a + B + N) + np.log(B_til) - np.log(B)

In [21]:
s = nc + X.multiply(q)

In [22]:
ndata = pd.DataFrame(data.title.apply(remove_ano)).reset_index()
score = pd.DataFrame(s.sum(axis=1),columns=['score'])      

In [23]:
result = ndata.join(score)    

In [24]:
result.sort_values(by='score',ascending=False)

Unnamed: 0,movieId,title,score
1997,2081,"Little Mermaid, The",9.898317
360,364,"Lion King, The",9.141145
9398,27619,"Lion King 1½, The",9.141145
7960,8643,"Cinderella Story, A",8.860324
0,1,Toy Story,8.503923
3027,3114,Toy Story 2,8.503923
15401,78499,Toy Story 3,8.503923
18646,92793,Rusalochka (The Little Mermaid),8.394211
2041,2125,Ever After: A Cinderella Story,7.355762
18252,91266,Another Cinderella Story,7.355276


In [103]:
movie_review = pd.read_csv('movie_review1.csv',index_col=0)

In [104]:
movie_review.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,1
1,"for starters , it was created by alan moore ( ...",1
2,to say moore and campbell thoroughly researche...,1
3,"the book ( or "" graphic novel , "" if you will ...",1
4,"in other words , don't dismiss this film becau...",1


In [105]:
review = processamento_text(movie_review.text)

In [106]:
movie_review['text'] = review

In [107]:
text = pd.DataFrame(movie_review)
dim_vec = max(text.text.apply(len))
min_count = 10
window = 4
num_workers = multiprocessing.cpu_count()
seed = tf.set_random_seed(42)

In [108]:
cbow_model = Word2Vec(movie_review.text,
                    min_count = min_count, 
                    size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 1) #sg = 0 -> CBOW e sg = 1 -> skipgram

In [109]:
def meanVector(model,phrase):
    vocab = model.wv.vocab
    phrase = " ".join(phrase)
    phrase = [x for x in word_tokenize(phrase) if x in vocab]
    #Quando não houver palavra o vector recebe 0 para todas as posições
    if phrase == []:
        vetor = [0.0]*dim_vec 
    else: 
        #Caso contrário, calculando a matriz da frase
        vetor = np.mean([model[word] for word in phrase],axis=0)
    return vetor

In [110]:
def createFeatures(base, modelo): 
    features = [meanVector(modelo,base['text'][i])for i in range(len(base))]
    return features

In [111]:
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [113]:
df = createFeatures(movie_review,cbow_model)


In [115]:
classificacao = np.array(movie_review.tag)

In [126]:
classificacao

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [130]:
X_train, X_test, y_train, y_test = train_test_split(df[0:len(df)],classificacao[0:len(classificacao)], test_size=0.3,random_state=109)

In [123]:
clf = svm.SVC(kernel='linear') 


In [131]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [132]:
y_pred = clf.predict(X_test)

In [134]:
from sklearn.metrics import classification_report

In [135]:
print('------------------------------------------------------')
print('Classificação')
print(classification_report(y_test, y_pred))
print('------------------------------------------------------')
print("Matriz de Confusão")
print(pd.crosstab(y_test,y_pred,rownames=['Real'],colnames=['Predito'],margins=False))
print('------------------------------------------------------')
print('Acurácia')
print(np.mean(y_pred == y_test))
print('------------------------------------------------------')

------------------------------------------------------
Classificação
             precision    recall  f1-score   support

          0       0.60      0.05      0.10      9578
          1       0.51      0.96      0.67      9838

avg / total       0.55      0.52      0.39     19416

------------------------------------------------------
Matriz de Confusão
Predito    0     1
Real              
0        523  9055
1        355  9483
------------------------------------------------------
Acurácia
0.515348166460651
------------------------------------------------------
