# Task 1

In [1]:
import pandas as pd
from statistics import mean, stdev
import contractions
import nltk

from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

nltk.download("stopwords")
nltk.download("punkt")

#pd.set_option('display.max_rows', None) # uncomment to display all rows at once

script = pd.read_csv("data/lotr_scripts.csv", encoding='utf-8') # load the data
concat = {} # stores the concatenated scripts for each character

script_lengths = {} # individual script lengths grouped by character
stopwords_list = set(nltk.corpus.stopwords.words("english"))

raw_scripts = {}

for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in concat:
        concat[character] = []
        script_lengths[character] = []
        raw_scripts[character] = ""
    
    raw_scripts[character] += str(row["dialog"]).lower() + " "
    dialogue = contractions.fix(str(row["dialog"])) # can't -> can not etc.
    words = nltk.word_tokenize(dialogue) # tokenize
    words = [word.lower() for word in words if word.isalpha()] # to lowercase if the "word" consists of alphabet letters
    
    script_lengths[character].append(len(words)) # length of the individual script
    concat[character].extend(words) # add to the concatenated list for this character

concat = dict(sorted(concat.items(), key=lambda i: -len(i[1]))) # sort by number of tokens in descending order
details = {} # dictionary containing the derived characteristics for every character

for character in concat:
    if not character in details:
        details[character] = {}

    details[character]["token_num"] = len(concat[character]) # total number of tokens
    details[character]["vocab_size"] = len(set(concat[character])) # vocabulary size = number of unique tokens
    details[character]["avg_script_length"] = mean(script_lengths[character]) # mean of script lengths
    details[character]["stopword_proportion"] = 0
    
    if len(script_lengths[character]) == 1:
        details[character]["sd_script_length"] = float('inf') # if only one script available for this character, SD is "infinite"
    else:
        details[character]["sd_script_length"] = stdev(script_lengths[character])
    
    for token in concat[character]:
        if token in stopwords_list:
            details[character]["stopword_proportion"] += 1 # if contained in stopword list, increment number of stopwords per char
    
    details[character]["stopword_proportion"] /= len(concat[character]) # divide by the total number of tokens

pd.DataFrame.from_dict(details, orient="index") # display table

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/syomasa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/syomasa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,token_num,vocab_size,avg_script_length,stopword_proportion,sd_script_length
GANDALF,3159,883,15.409756,0.528332,17.825898
SAM,2185,563,10.069124,0.577117,11.185300
FRODO,1828,494,8.088496,0.591357,8.203585
ARAGORN,1406,534,7.518717,0.497866,7.892655
GOLLUM,1312,362,9.791045,0.503811,9.254200
...,...,...,...,...,...
OLD MAN,1,1,1.000000,0.000000,inf
FRODO VOICE,1,1,1.000000,0.000000,inf
MRS BRACEGIRDLE,1,1,1.000000,0.000000,inf
PROUDFOOT HOBBIT,1,1,1.000000,0.000000,inf


# Task 2

This code implements the MLP model presented in https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing.

This section **does not require running** unless you want to re-train model on your own computer

In [7]:
# adapted and modified from
# https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/blob/master/code%20and%20data/mlp_simple.py


df = pd.read_csv(r"data/essays.csv", names=['author_id', 'essay', 'Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='cp1252')

x = df['essay'][1:]
x = x.str.lower() # to lowercase

def trainAndTest(max_iter=200):
    classifiers = []
    vectorizers = []
    
    choices = {
            0: ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
            1: ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
            2: ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
            3: ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
            4: ('Openness', ('relu', 'invscaling', 'lbfgs'))
        } # specs for every trait classifier (activation function etc.)

    for trait in range(5):
        y = df[choices[trait][0]][1:] # select essays associated with this trait
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11) # random split into train & test sets

        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer()
        vectorizers.append(vectorizer)
        
        xx_train = vectorizers[trait].fit_transform(x_train)
        xx_test = vectorizers[trait].transform(x_test)

        # specify and train a classifier for every trait
        classifiers.append(MLPClassifier(activation=choices[trait][1][0], alpha=0.0001, hidden_layer_sizes=(60),
                                   learning_rate=choices[trait][1][1], max_iter=max_iter, solver=choices[trait][1][2]))
        classifiers[trait].fit(xx_train, y_train)

        predictions = classifiers[trait].predict(xx_test) # predict for test set
        score = accuracy_score(y_test, predictions)
        print("Validation accuracy (on Essays set) for {}: {}".format(choices[trait][0], score))

    print("Training and validation done!")
    return classifiers, vectorizers

In [9]:
traitNames = ['Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
traitProbs = {} # a dict containing the probabilites of every trait for each character
iterCount = 20 # for report results, 20 was used
max_iter = 40 # maximum allowed iterations for MLPClassifier

for character in concat:
    traitProbs[character] = [0] * 5

for _ in range(iterCount):
    classifiers, vectorizers = trainAndTest(max_iter)
    print()
    
    for trait in range(5):
        xx_test = vectorizers[trait].transform(raw_scripts.values())
        prob = classifiers[trait].predict_proba(xx_test)[:,1] # predict probability for every trait

        for i, character in enumerate(concat):
            traitProbs[character][trait] += prob[i] 

traitProbs = {key: [x / iterCount for x in value] for key, value in traitProbs.items()}
traitProbsDf = pd.DataFrame.from_dict(traitProbs, orient="index") # display table containing the probabilites of every trait for each character
traitProbsDf.columns = traitNames
traitProbsDf.to_csv(f"tool1_results/traitProbsDf_max_iter_{max_iter}.csv") # Save results for later comparison
display(traitProbsDf)

Validation accuracy (on Essays set) for Extraversion: 0.5364667747163695



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5575364667747164
Validation accuracy (on Essays set) for Agreeableness: 0.546191247974068



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5348460291734197



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6110210696920584
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5623987034035657



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5542949756888168



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5494327390599676



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6029173419773096
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5526742301458671
Validation accuracy (on Essays set) for Agreeableness: 0.5607779578606159



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5380875202593193



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6142625607779578
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.546191247974068



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5591572123176661



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.546191247974068



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5283630470016207



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6094003241491086
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5526742301458671
Validation accuracy (on Essays set) for Neuroticism: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5364667747163695



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6029173419773096
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5316045380875203



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5607779578606159



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5494327390599676



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5251215559157212



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.5996758508914101
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5656401944894651
Validation accuracy (on Essays set) for Neuroticism: 0.5526742301458671



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5559157212317666



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.526742301458671



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6158833063209076
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5623987034035657
Validation accuracy (on Essays set) for Neuroticism: 0.5494327390599676



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5283630470016207



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6077795786061588
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5575364667747164
Validation accuracy (on Essays set) for Neuroticism: 0.5429497568881686



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.526742301458671



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6077795786061588
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5380875202593193
Validation accuracy (on Essays set) for Neuroticism: 0.5364667747163695



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5429497568881686



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5332252836304701



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6061588330632091
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.546191247974068
Validation accuracy (on Essays set) for Neuroticism: 0.5494327390599676



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5478119935170178



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5348460291734197



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.5964343598055105
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5559157212317666
Validation accuracy (on Essays set) for Neuroticism: 0.5478119935170178



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5299837925445705



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6175040518638574
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5380875202593193



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5526742301458671



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5542949756888168



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5348460291734197



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.5964343598055105
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5445705024311183



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5575364667747164



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5575364667747164



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5348460291734197



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6077795786061588
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5494327390599676
Validation accuracy (on Essays set) for Neuroticism: 0.5413290113452188
Validation accuracy (on Essays set) for Agreeableness: 0.5413290113452188



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6223662884927067
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5494327390599676



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5445705024311183
Validation accuracy (on Essays set) for Agreeableness: 0.5478119935170178



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.520259319286872



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.5996758508914101
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5445705024311183



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Agreeableness: 0.5575364667747164



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.526742301458671



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6012965964343598
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5445705024311183



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5542949756888168
Validation accuracy (on Essays set) for Agreeableness: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5348460291734197



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.6110210696920584
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Extraversion: 0.5510534846029174



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5494327390599676
Validation accuracy (on Essays set) for Agreeableness: 0.5542949756888168



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.5235008103727715



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Openness: 0.586709886547812
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5445705024311183



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Neuroticism: 0.5559157212317666
Validation accuracy (on Essays set) for Agreeableness: 0.539708265802269



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Validation accuracy (on Essays set) for Conscientiousness: 0.520259319286872
Validation accuracy (on Essays set) for Openness: 0.5996758508914101
Training and validation done!




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html



Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,1.468977e-07,1.446855e-01,1.684118e-05,0.000122,0.015728
SAM,9.332323e-01,5.478562e-03,7.838016e-01,0.996751,0.999959
FRODO,9.560196e-03,1.464651e-05,4.222301e-01,0.999730,0.999957
ARAGORN,9.890531e-01,4.441965e-06,9.413979e-01,0.181178,0.999999
GOLLUM,1.319751e-07,6.877928e-05,3.124496e-02,0.999759,0.999921
...,...,...,...,...,...
OLD MAN,9.982566e-01,5.518036e-06,2.702228e-01,0.000414,0.943917
FRODO VOICE,9.999841e-01,1.000000e+00,1.000000e+00,0.752366,0.999998
MRS BRACEGIRDLE,7.016263e-01,5.232160e-05,4.116308e-01,0.000306,0.988559
PROUDFOOT HOBBIT,1.490224e-03,4.741846e-08,6.791024e-01,0.240990,0.896148


In [10]:
# load the tool1 tables and show the results
traitProbsDf_iter20 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_20.csv", index_col=0)
traitProbsDf_iter40 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_40.csv", index_col=0)
traitProbsDf_iter60 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_60.csv", index_col=0)
#traitProbsDf_iter80 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_80.csv", index_col=0)
#traitProbsDf_iter200 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_200.csv", index_col=0)

print("max_iter = 20")
display(traitProbsDf_iter20)
print("max_iter = 40")
display(traitProbsDf_iter40)
print("max_iter = 60")
display(traitProbsDf_iter60)
#print("max_iter = 200")
#display(traitProbsDf_iter200)
#print(traitProbsDf.columns)

max_iter = 20


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,0.009221,0.489919,0.048841,0.006524,0.174573
SAM,0.441190,0.276799,0.456102,0.855690,0.904509
FRODO,0.083711,0.103410,0.586574,0.917103,0.922891
ARAGORN,0.759941,0.086804,0.709668,0.643023,0.967837
GOLLUM,0.039225,0.232845,0.080232,0.884741,0.969868
...,...,...,...,...,...
OLD MAN,0.777783,0.102651,0.417932,0.014822,0.618823
FRODO VOICE,0.967972,0.997918,0.998262,0.462434,0.954121
MRS BRACEGIRDLE,0.394137,0.139078,0.383575,0.016729,0.812185
PROUDFOOT HOBBIT,0.584341,0.008917,0.751255,0.396232,0.858300


max_iter = 40


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,1.468977e-07,1.446855e-01,1.684118e-05,0.000122,0.015728
SAM,9.332323e-01,5.478562e-03,7.838016e-01,0.996751,0.999959
FRODO,9.560196e-03,1.464651e-05,4.222301e-01,0.999730,0.999957
ARAGORN,9.890531e-01,4.441965e-06,9.413979e-01,0.181178,0.999999
GOLLUM,1.319751e-07,6.877928e-05,3.124496e-02,0.999759,0.999921
...,...,...,...,...,...
OLD MAN,9.982566e-01,5.518036e-06,2.702228e-01,0.000414,0.943917
FRODO VOICE,9.999841e-01,1.000000e+00,1.000000e+00,0.752366,0.999998
MRS BRACEGIRDLE,7.016263e-01,5.232160e-05,4.116308e-01,0.000306,0.988559
PROUDFOOT HOBBIT,1.490224e-03,4.741846e-08,6.791024e-01,0.240990,0.896148


max_iter = 60


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,3.882568e-09,2.035767e-02,1.047097e-05,6.046523e-08,0.000261
SAM,9.614641e-01,5.876233e-03,7.335323e-01,9.999985e-01,1.000000
FRODO,4.434084e-03,1.288953e-06,3.445255e-01,9.999998e-01,1.000000
ARAGORN,9.950442e-01,7.132357e-07,9.355735e-01,2.179761e-02,1.000000
GOLLUM,3.533479e-08,9.415129e-05,1.676146e-02,9.999781e-01,1.000000
...,...,...,...,...,...
OLD MAN,9.945818e-01,9.264289e-07,2.027912e-01,2.618690e-06,0.999935
FRODO VOICE,9.999982e-01,1.000000e+00,1.000000e+00,8.523689e-01,1.000000
MRS BRACEGIRDLE,6.937663e-01,2.539650e-05,3.157234e-01,1.110914e-06,1.000000
PROUDFOOT HOBBIT,1.701681e-04,3.612469e-08,5.186739e-01,1.002953e-01,0.999505


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import numpy as np


def drawPCAGraph(dataframe):
    """ Function for drawing PCA image with 2 principal components for given dataframe"""
    num_char = 10 # number of characters to be included in the scatter plot
    x = StandardScaler().fit_transform(dataframe[0:num_char].values)

    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x) # peform PCA
    print("Variance explained: {} percent".format(100 * pca.explained_variance_ratio_.cumsum()[1]))

    df = pd.DataFrame(principalComponents, index=list(concat.keys())[0:num_char], columns=['PC1','PC2'])
    fig = px.scatter(df, x='PC1', y='PC2', text=df.index) # plot according to the principal components

    fig.update_layout(
        font=dict(
            family="Arial",
            size=15,  # Set the font size here
            color="RebeccaPurple"
        ),
        height=550
    )

    fig.update_traces(textposition="top center")

    config = {
        'toImageButtonOptions': {
            'format': 'png',
            'height': 550,
            'width': 1100,
            'scale': 8
        }
    }
    fig.show(config=config)
    
drawPCAGraph(traitProbsDf_iter20)
drawPCAGraph(traitProbsDf_iter40)
drawPCAGraph(traitProbsDf_iter60)
#drawPCAGraph(traitProbsDf_iter200)

Variance explained: 80.19127148119306 percent


Variance explained: 85.16245443322362 percent


Variance explained: 79.10722207837053 percent


# Task 3

In [None]:
similarities = {} # contains the pairwise cosine similarities for characters
num_char = 10

for character1 in list(concat.keys())[0:num_char]:
    if not character1 in similarities:
        similarities[character1] = {}

    for character2 in list(concat.keys())[0:num_char]: # I know, CosSim(A,B)=CosSim(B,A)...
        similarities[character1][character2] = cosine_similarity(np.array(traitProbs[character1]).reshape(1, -1), np.array(traitProbs[character2]).reshape(1, -1))[0][0]

pd.DataFrame.from_dict(similarities, orient="index")

# Task 4

Please see tasks_tool2.ipynb and tasks_tool3.ipynb.

# Task 5

In [None]:
probVectors = []
dialogue = []

for index, row in script.iterrows(): # go through the entries one by one for clarity
    line = str(row["dialog"]).lower()
    dialogue.append(line)

for trait in range(0, 5):
    xx_test = vectorizers[trait].transform(dialogue)
    probVectors.append(classifiers[trait].predict_proba(xx_test)[:,1]) # predict probability for every trait

traitVectors = []

for index in range(len(probVectors[0])):
    traitVector = [probVectors[0][index], probVectors[1][index], probVectors[2][index], probVectors[3][index], probVectors[4][index]]
    traitVectors.append(traitVector)

scores = pd.DataFrame(traitVectors)
scores.to_csv("traitVectors_tool1.csv") # save the scores of each script in a "databse" file
print("Done!")

# Task 6

In [None]:
dominatingTrait = {}

for character in concat:
    dominatingTrait[character] = traitProbs[character].index(max(traitProbs[character])) # dominant trait for every char

dominatingProportion = {}

lista = list(script.iterrows())
traits = {}

for i, score in enumerate(traitVectors):
    dominantTrait = score.index(max(score)) # dominant trait of the script
    character = lista[i][1]["char"].strip().replace("(", "") # the character of this script

    if not character in dominatingProportion:
        dominatingProportion[character] = {"Score 1": 0, "Score 2": 0, "Most frequent trait": 0}
        traits[character] = []

    traits[character].append(dominantTrait)
    
    if dominantTrait == dominatingTrait[character]: # it's a match!
        dominatingProportion[character]["Score 1"] += 1

totalCounts = {}
for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in totalCounts:
        totalCounts[character] = 0
        
    totalCounts[character] += 1

for character in totalCounts:
    dominatingProportion[character]["Score 1"] /= totalCounts[character] # for each character, divide the count by the total number of scripts for this char
    
    freqTrait = max(set(traits[character]), key=traits[character].count)
    dominatingProportion[character]["Score 2"] = traits[character].count(freqTrait) / len(traits[character])
    dominatingProportion[character]["Most frequent trait"] = traitNames[freqTrait]
    
df = pd.DataFrame.from_dict(dominatingProportion, orient="index")
df = df.reindex(concat.keys())
display(df)

# Task 7

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import plotly.graph_objects as pg
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import nltk

nltk.download('vader_lexicon')
sentimentCounts = {}
traitCounts = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]}

for index, row in script.iterrows(): # go through the entries one by one
    character = row["char"].strip().replace("(", "")
    dialog = str(row["dialog"])
    domTrait = traitVectors[index].index(max(traitVectors[index]))
    
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(dialog) # obtain the scores for this script
    polarity = ss["pos"] - ss["neg"] # total polarity = positive - negative
    
    if character not in sentimentCounts:
        sentimentCounts[character] = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]} # initialize dictionary
    
    if polarity > 0:
        sentimentCounts[character]["pos"][domTrait] += 1
        traitCounts["pos"][domTrait] += 1
    elif polarity < 0:
        sentimentCounts[character]["neg"][domTrait] += 1
        traitCounts["neg"][domTrait] += 1
    elif polarity == 0:
        sentimentCounts[character]["neu"][domTrait] += 1
        traitCounts["neu"][domTrait] += 1

traits = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
fig = make_subplots(rows=4, cols=3, start_cell="top-left", subplot_titles=list(concat.keys())[0:10])

for i, character in enumerate(list(concat.keys())[0:10]):
    if i == 0:
        leg = True
    else:
        leg = False

    fig.add_trace(go.Bar(
        name = 'Neutral',
        marker_color = 'cornflowerblue',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neu"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Positive',
        marker_color = 'mediumseagreen',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["pos"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Negative',
        marker_color = 'crimson',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neg"]), row=i // 3 + 1, col=i % 3 + 1)

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 1.2*1000,
        'width': 1000,
        'scale': 10
    }
}
    
fig.update_layout(barmode='stack')   
fig.show(config=config)

In [None]:
plot = pg.Figure(data=[go.Bar(
    name = 'Neutral',
    marker_color = 'cornflowerblue',
    x = traits,
    y = traitCounts["neu"]
   ),
                       go.Bar(
    name = 'Positive',
        marker_color = 'mediumseagreen',
        x = traits,
        y = traitCounts["pos"]
   ),
   go.Bar(
   name = 'Negative',
        marker_color = 'crimson',
        x = traits,
        y = traitCounts["neg"]
   )
])

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 500,
        'width': 890,
        'scale': 10
    }
}

plot.update_layout(barmode='stack')
plot.show(config=config)