In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import stanfordnlp
import string
import random
import pickle
import re
import time
import matplotlib.pyplot as plt

In [2]:
# Folder holding one movie-listing CSV per genre.
path = r'C:\Users\lolfa\BMT\TrabalhoFinal\1_movies_per_genre'
# Folder holding one raw-review CSV per movie.
path_film = r'C:\Users\lolfa\BMT\TrabalhoFinal\2_reviews_per_movie_raw'
# File that receives the pickled tagging results.
path_resultado = 'resultado_novo.dat'

In [3]:
# Genres covered by the analysis; each one is read from "<genre>.csv" under `path`.
genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi']

# Movie listing of each genre, keyed by genre name.
genres_csv = {}
for genre in genres:
    genres_csv[genre] = pd.read_csv(path + '\\' + genre + '.csv')

# Review file name ("<name> <year>.csv") of every listed movie, grouped by genre.
film_per_genre = {}
for genre in genres:
    listing = genres_csv[genre]
    film_per_genre[genre] = [
        name + ' ' + str(year) + '.csv'
        for name, year in zip(listing['name'], listing['year'])
    ]

In [4]:
# Auxiliares

def clean_text(text):
    """Strip the HTML line-break and list tags left in a raw review.

    Args:
        text: Review text as a string.

    Returns:
        The text with every ``<br/>``, ``<li>``, ``</li>``, ``<ul>`` and
        ``</ul>`` tag removed. Nothing is inserted in place of a tag.
    """
    # Match whole tags only: removing the bare substring 'ul' would also eat
    # the letters inside ordinary words ("would" -> "wod").
    for tag in ('<br/>', '<li>', '</li>', '<ul>', '</ul>'):
        text = text.replace(tag, '')
    return text

# Lazily-built stop-word set shared by every process_text call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def _is_punctuation(token):
    """Tell whether every character of `token` is in string.punctuation."""
    return all(char in string.punctuation for char in token)


def process_text(text):
    """Tokenise and POS-tag a review, then drop stop words/punctuation and stem.

    Relies on the module-level `nlp` pipeline and `stemmer` object, which must
    be defined before the first call.

    Args:
        text: Review text to process.

    Returns:
        A list of ``(stem, upos)`` tuples, one per kept word, in the order the
        pipeline produced them.
    """
    # Tokenisation and POS tagging with StanfordNLP.
    doc = nlp(text)

    # The stop-word set never changes, so it is built once and reused instead
    # of being re-read from the corpus on every call.
    stop_words = _english_stop_words()

    # A plain `word in string.punctuation` is a substring test and lets
    # multi-character tokens such as "..." or "!!" through; checking every
    # character filters those too.
    return [
        (stemmer.stem(word.text), word.upos)
        for sent in doc.sentences
        for word in sent.words
        if word.text.lower() not in stop_words and not _is_punctuation(word.text)
    ]

In [5]:
# Per genre: integer ratings and cleaned review texts, kept index-aligned.
csv_per_film = {genre: {'rates': [], 'reviews': []} for genre in genres}

# Counters for what was left out, so dropped data is visible after the run.
arquivos_ignorados = 0
linhas_ignoradas = 0

for genre in genres:
    for film in film_per_genre[genre]:
        # A listed movie may have no usable review file; skip only that file.
        # OSError covers a missing file, ValueError covers pandas parse/decoding
        # errors, KeyError covers a file without the expected columns.
        try:
            film_reviews = pd.read_csv(path_film + '\\' + film)
            ratings = film_reviews['rating']
            reviews = film_reviews['review']
        except (OSError, ValueError, KeyError):
            arquivos_ignorados += 1
            continue

        for rating, review in zip(ratings, reviews):
            # Reviews without a score are marked with the literal 'Null'.
            if rating == 'Null':
                continue
            # Skip only the malformed row (non-numeric rating, non-text review)
            # instead of discarding every remaining review of the film.
            try:
                rate = int(rating)
                cleaned = clean_text(review)
            except (ValueError, TypeError, AttributeError):
                linhas_ignoradas += 1
                continue
            # Append both values together so the two lists stay aligned.
            csv_per_film[genre]['rates'].append(rate)
            csv_per_film[genre]['reviews'].append(cleaned)

print(f"Arquivos ignorados: {arquivos_ignorados}; linhas ignoradas: {linhas_ignoradas}")

dfFilm_per_genre = {genre: pd.DataFrame(csv_per_film[genre]) for genre in genres}

In [6]:
# Initialise the stemmer and the StanfordNLP pipeline (tokenisation + POS tagging).
stemmer = PorterStemmer()
nlp = stanfordnlp.Pipeline(processors='tokenize,pos', lang='en')

# Maximum number of reviews tagged per genre.
SAMPLE_SIZE = 10000
# Fixed seed so the same reviews are drawn on every run.
SAMPLE_SEED = 42
# Dedicated generator: seeding it leaves the global `random` state untouched.
sampler = random.Random(SAMPLE_SEED)

# Per genre: ratings and tagged reviews of the sampled subset, index-aligned.
pos_tag_per_genre = {genre: {'rates': [], 'tag_text': []} for genre in genres}

tempo_inicial = time.perf_counter()
for genre in genres:
    rates = csv_per_film[genre]['rates']
    reviews = csv_per_film[genre]['reviews']
    n_reviews = len(reviews)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the number of reviews available.
    sample = sampler.sample(range(n_reviews), min(SAMPLE_SIZE, n_reviews))
    for i in sample:
        pos_tag_per_genre[genre]['rates'].append(rates[i])
        pos_tag_per_genre[genre]['tag_text'].append(process_text(reviews[i]))

tempo_final = time.perf_counter()
tempo_execucao = tempo_final - tempo_inicial
print(f"O tempo de execução foi de {tempo_execucao:.6f} segundos")

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\lolfa\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\lolfa\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\lolfa\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
O tempo de execução foi de 8057.011548 segundos


In [7]:
# Save the tagged results to `path_resultado` for future use.
with open(path_resultado, mode='wb') as arquivo:
    pickle.Pickler(arquivo).dump(pos_tag_per_genre)