In [114]:
import os 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from sklearn.preprocessing import MinMaxScaler
import re 
import random 

In [269]:
from nltk.tokenize import sent_tokenize,word_tokenize, RegexpTokenizer 
from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
labMT,labMTvector,labMTwordList = emotionFileReader(stopval=0.0,lang='english',returnVector=True)
import random 

## DATA LOADING 

In [237]:
#the reviews for each movie are stored in a separate csv file 
#for efficiency reasons, we collect files together in a dictionary where the movie name is the key and the content of the file(reviews) is the value
movie_files_path = os.getcwd()+ '\\dataset_review' + '\\2_reviews_per_movie_raw\\'
reviews_dict = {}
for file in os.listdir(movie_files_path):
    movie_name = file.replace('.csv', '')
    movie_name = movie_name.replace('_',':')
    movie_name = movie_name[:-5]
    #read csv document for each movie 
    file_content = pd.read_csv(movie_files_path + file, encoding = "ISO-8859-1").convert_dtypes().to_dict()
    reviews_dict[movie_name] = file_content

In [238]:
#finally the dictionary is processed into a dataframe 
movie_reviews_raw = pd.DataFrame.from_dict(reviews_dict,orient='index').convert_dtypes()

In [239]:
#we rank the reviewers based on the number of reviews left and keep the ones with 5 or more movies reviewed 
popular_reviewers = pd.read_pickle(r'new_num_review_user.pkl')

In [240]:
popular_reviewers_list = popular_reviewers[popular_reviewers['revs_num']>=5]['username'].values

In [241]:
movie_reviews_raw['original_title'] = movie_reviews_raw.index
movie_reviews_raw = movie_reviews_raw[['username','rating','helpful','total','date','title','review','original_title']].reset_index().copy()

In [242]:
movie_reviews = movie_reviews_raw[['original_title','username','rating','title', 'review']].copy()

In [243]:
# tmdb_5000 dataset contains more attributes which can be used as features for the recommender systems 
tmdb_5000 =pd.read_csv("C:\\Users\\mihae\\projects\\MoviesProjectCompTools\\dataset_5000\\tmdb_5000_movies.csv")['original_title']

In [244]:
#we keep only the movies that appear in both tmdb_5000_movies and the movie review dataset 
movie_reviews = movie_reviews[movie_reviews['original_title'].isin(list(tmdb_5000))].reset_index()

In [245]:
del tmdb_5000

In [246]:
del reviews_dict

### FUNCTIONS 

In [247]:
#remove <br/> from text 
def process_text(dict_review):
    
    new_dict = {}
    for k,text in dict_review.items():
        new_dict[k] = re.sub("<br/>","",text)
    return new_dict 

In [248]:
#replace key digits with actual usernames 
def replace_username(dict_username, dict_review):

    new_dict = dict((dict_username[key], value) for (key, value) in dict_review.items())
    return new_dict 

In [249]:
#remove null entries in dictionary 
def remove_nulls_from_dict(dict_):

    return {user: value for user,value in dict_.items() if value!='Null'}

In [254]:
#only keep popular reviewers 
def keep_popular_reviewers(dict_t, popular_list):

    new_dict = {x:v for x,v in dict_t.items() if x in popular_list } # if v in popular_list}

    return new_dict

In [261]:
#reduce dataset 
def reduce_dict(dict_):
    if len(dict_)>=50:
        return dict(random.sample(dict_.items(), 50))
    else:
        return dict_

In [284]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer() 
#using a regex tokenizer because I am cool like that 
tokenizer = RegexpTokenizer(r'\w+')

def lemmatize_text(some_dict):

    dialogue_lemmatized = {ch:lemmatizer.lemmatize(w.lower()) for ch,w in some_dict.items()}
    
    return dialogue_lemmatized 

def clean_text(dialogue_lemmatized):
    
    #remove funny characters 
    dialogue_clean = {ch:re.sub(r'[^A-Za-z0-9]+', ' ',text)  for ch,text in dialogue_lemmatized.items() }
    
    return dialogue_clean 

#tokenize reviews into word tokens 
def tokenize_dialogue(dialogue_clean):
    
    dialogue_tokenized = {ch: word_tokenize(text) for ch, text in dialogue_clean.items()}

    return dialogue_tokenized

#keep only the ones who are in the labmt dictionary 
def labmt_tokens(dialogue_tokenized):

    dialogue_tokenized_v2 = {k:v for k,v in dialogue_tokenized.items() if v}
    labmt_tokens = {ch :[float(labMT[token][1]) for token in text if token in list(labMT.keys())] for ch, text in dialogue_tokenized_v2.items()}
    
    return labmt_tokens

In [288]:
#obtain average labmt sentiment for each user
def compute_sentiment_per_user(user_dict):

    return {user:sum(list_values)/len(list_values) if len(list_values)>0 else 0 for user,list_values in user_dict.items()}

In [84]:
def compute_avg_sent(dict_tokens):

    avg_sent_per_user = {ch:sum(token_sent)/len(token_sent) for ch,token_sent in dict_tokens.items()}
    
    return sum(list(avg_sent_per_user.values()))/len(avg_sent_per_user)

In [46]:
def remove_empty_entries(dict):

    return  {ch: value  for ch,value in dict.items() if len(value)>0}

### Data Processing
The steps below are meant to process the different columns of the dataframe into a useable form so we can proceed and compute the sentiment score for each reviewer. 

In [250]:
movie_reviews['review_final'] = movie_reviews.apply(lambda x: process_text(x.review),axis=1)
movie_reviews['review_final'] = movie_reviews.apply(lambda x: replace_username(x.username,x.review_final), axis=1)
movie_reviews['rating_final'] = movie_reviews.apply(lambda x: replace_username(x.username,x.rating), axis=1)
movie_reviews['title_final'] = movie_reviews.apply(lambda x: replace_username(x.username,x.title), axis=1)

In [251]:
movie_reviews = movie_reviews[['original_title','rating_final','title_final','review_final']]

In [252]:
movie_reviews['rating_final'] = movie_reviews.apply(lambda x: remove_nulls_from_dict(x.rating_final), axis=1)

In [253]:
movie_reviews['review_final'] = movie_reviews.apply(lambda x: remove_nulls_from_dict(x.review_final), axis=1)

In [133]:
del tmdb_5000 , reviews_dict 

In [256]:
movie_reviews['rating_final'] = movie_reviews.apply(lambda x: keep_popular_reviewers(x.rating_final,popular_reviewers_list), axis=1)

In [259]:
movie_reviews['no_ratings'] = movie_reviews.apply(lambda x: len(x.rating_final), axis=1)

In [262]:
movie_reviews['rating_final'] = movie_reviews.apply(lambda x: reduce_dict(x.rating_final), axis=1)

since Python 3.9 and will be removed in a subsequent version.
  return dict(random.sample(dict_.items(), 50))


In [264]:
movie_reviews[['original_title','rating_final']].to_pickle('ratings_only.pkl')

In [277]:
movie_reviews['review_final'] = movie_reviews.apply(lambda x: keep_popular_reviewers(x.review_final,list(x.rating_final.keys())), axis=1)

In [281]:
movie_reviews['review_final'] = movie_reviews.apply(lambda x: reduce_dict(x.review_final), axis=1)

since Python 3.9 and will be removed in a subsequent version.
  return dict(random.sample(dict_.items(), 50))


### Sentiment Analysis 

_The labMT dictionary (https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0026752) contains a collection of 10.222 words which were evaluated by users on Mechanical Turk. The words are the keys in the labMT dictionary. 
The keys are ordered in the dictionary in an descending order according to the average happiness ranking.
Each word (key) has associated a list with 7 elements where the first element in the list represents the ranking followed by the average happiness ranking (as evaluated by 50 users), the standard deviation of happiness, twitter rank, google books rank, new york times rank and finally the music lyrics rank_. 


In [282]:
movie_reviews['reviews_lemmatized'] = movie_reviews.apply(lambda x: lemmatize_text(x.review_final),axis=1)

In [283]:
movie_reviews['reviews_lemmatized_clean'] =  movie_reviews.apply(lambda x: clean_text(x.reviews_lemmatized),axis=1)

In [285]:
movie_reviews['tokenized_reviews'] = movie_reviews.apply(lambda x: tokenize_dialogue(x.reviews_lemmatized_clean),axis=1)

In [286]:
movie_reviews['labmt_tokens'] = movie_reviews.apply(lambda x: labmt_tokens(x.tokenized_reviews),axis=1)  

In [289]:
movie_reviews['user_sentiment'] = movie_reviews.apply(lambda x: compute_sentiment_per_user(x.labmt_tokens),axis=1)  

In [291]:
movie_reviews[['original_title','rating_final','user_sentiment']].to_pickle('rating_and_sentiment.pkl')

In [292]:
movie_reviews.to_pickle('labmt_sentiment.pkl')