# Task 1

In [1]:
import pandas as pd
from statistics import mean, stdev
import contractions
import nltk

nltk.download("stopwords")
nltk.download("punkt")

script = pd.read_csv("data/lotr_scripts.csv", encoding='utf-8') # load the data
concat = {} # stores the concatenated scripts for each character

script_lengths = {} # individual script lengths grouped by character
stopwords_list = set(nltk.corpus.stopwords.words("english"))

raw_scripts = {}

for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    if not row["char"] in concat:
        concat[row["char"]] = []
        script_lengths[row["char"]] = []
        raw_scripts[row["char"]] = ""
    
    raw_scripts[row["char"]] += str(row["dialog"]).lower() + " "
    dialogue = contractions.fix(str(row["dialog"])) # can't -> can not etc.
    words = nltk.word_tokenize(dialogue) # tokenize
    words = [word.lower() for word in words if word.isalpha()] # to lowercase if the "word" consists of alphabet letters
    
    script_lengths[row["char"]].append(len(words)) # length of the individual script
    concat[row["char"]].extend(words) # add to the concatenated list for this character

concat = dict(sorted(concat.items(), key=lambda i: -len(i[1]))) # sort by number of tokens in descending order
details = {} # dictionary containing the derived characteristics for every character

for character in concat:
    if not character in details:
        details[character] = {}

    details[character]["token_num"] = len(concat[character]) # total number of tokens
    details[character]["vocab_size"] = len(set(concat[character])) # vocabulary size = number of unique tokens
    details[character]["avg_script_length"] = mean(script_lengths[character]) # mean of script lengths
    details[character]["stopword_proportion"] = 0
    
    if len(script_lengths[character]) == 1:
        details[character]["sd_script_length"] = float('inf') # if only one script available for this character, SD is "infinite"
    else:
        details[character]["sd_script_length"] = stdev(script_lengths[character])
    
    for token in concat[character]:
        if token in stopwords_list:
            details[character]["stopword_proportion"] += 1 # if contained in stopword list, increment number of stopwords per char
    
    details[character]["stopword_proportion"] /= len(concat[character]) # divide by the total number of tokens

pd.DataFrame.from_dict(details, orient="index") # display table

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/syomasa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/syomasa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,token_num,vocab_size,avg_script_length,stopword_proportion,sd_script_length
GANDALF,3138,880,15.382353,0.528999,17.865421
SAM,2164,559,10.018519,0.577634,11.186355
FRODO,1826,494,8.115556,0.591457,8.211761
ARAGORN,1394,531,7.535135,0.497131,7.933837
GOLLUM,1311,362,9.857143,0.504195,9.257383
...,...,...,...,...,...
OLD MAN,1,1,1.000000,0.000000,inf
FRODO VOICE,1,1,1.000000,0.000000,inf
MRS BRACEGIRDLE,1,1,1.000000,0.000000,inf
PROUDFOOT HOBBIT,1,1,1.000000,0.000000,inf


# Testing

In [3]:
# this follows the personality detection method created by
# author jkwieser at https://github.com/jkwieser/personality-detection-text
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import pandas as pd
import re
import numpy as np

# Loading gloVe pretrained models
cEXT = pickle.load( open( "data/models/cEXT.p", "rb"))
cNEU = pickle.load( open( "data/models/cNEU.p", "rb"))
cAGR = pickle.load( open( "data/models/cAGR.p", "rb"))
cCON = pickle.load( open( "data/models/cCON.p", "rb"))
cOPN = pickle.load( open( "data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load( open( "data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load( open( "data/models/vectorizer_30.p", "rb"))

# Using the pretrained models to generate the big 5 predictions
def predict_personality(text):
    scentences = re.split("(?<=[.!?]) +", text)
    text_vector_31 = vectorizer_31.transform(scentences)
    text_vector_30 = vectorizer_30.transform(scentences)
    EXT = cEXT.predict_proba(text_vector_31)[0][1]
    NEU = cNEU.predict_proba(text_vector_30)[0][1]
    AGR = cAGR.predict_proba(text_vector_31)[0][1]
    CON = cCON.predict_proba(text_vector_31)[0][1]
    OPN = cOPN.predict_proba(text_vector_31)[0][1]
    return EXT, NEU, AGR, CON, OPN

# Change this to analyse another character
character = 'Gollum'
character = character.upper()

# creating a dataframe just out of the char and dialog sections
movie_df = pd.DataFrame(script,
           columns=['char', 'dialog'])

# getting all the dialogue of one character
dialogue = (movie_df.loc[movie_df.char == character])

all_text = ""
for dialog in dialogue.dialog:
    all_text = all_text + dialog

all_text = all_text.replace("\xa0", "")

if(len(all_text) > 2):   
    predictions = predict_personality(all_text)
    print("predicted personality:", predictions)
    df = pd.DataFrame(dict(r=predictions, theta=['Extraversion','Neuroticism','Agreeableness', 'Conscientiousness', 'Openness']))
    fig = px.line_polar(df, r='r', theta='theta', line_close=True)
    fig.show()
else:
    print("Didn't find dialogue of the character,  check character input!")

predicted personality: (0.5400782768802677, 0.52, 0.5007047819659296, 0.44071084082553935, 0.6815849265850413)



Trying to unpickle estimator LogisticRegression from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator DecisionTreeClassifier from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator RandomForestClassifier from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator CountVectorizer from version 0.22.1 when using version 1.1.2. This might lead to