# Task 1

In [19]:
import pandas as pd
from statistics import mean, stdev
import contractions
import nltk

from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

nltk.download("stopwords")
nltk.download("punkt")

#pd.set_option('display.max_rows', None) # uncomment to display all rows at once

script = pd.read_csv("data/lotr_scripts.csv", encoding='utf-8') # load the data
concat = {} # stores the concatenated scripts for each character

script_lengths = {} # individual script lengths grouped by character
stopwords_list = set(nltk.corpus.stopwords.words("english"))

raw_scripts = {}

for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in concat:
        concat[character] = []
        script_lengths[character] = []
        raw_scripts[character] = ""
    
    raw_scripts[character] += str(row["dialog"]).lower() + " "
    dialogue = contractions.fix(str(row["dialog"])) # can't -> can not etc.
    words = nltk.word_tokenize(dialogue) # tokenize
    words = [word.lower() for word in words if word.isalpha()] # to lowercase if the "word" consists of alphabet letters
    
    script_lengths[character].append(len(words)) # length of the individual script
    concat[character].extend(words) # add to the concatenated list for this character

concat = dict(sorted(concat.items(), key=lambda i: -len(i[1]))) # sort by number of tokens in descending order
details = {} # dictionary containing the derived characteristics for every character

for character in concat:
    if not character in details:
        details[character] = {}

    details[character]["token_num"] = len(concat[character]) # total number of tokens
    details[character]["vocab_size"] = len(set(concat[character])) # vocabulary size = number of unique tokens
    details[character]["avg_script_length"] = mean(script_lengths[character]) # mean of script lengths
    details[character]["stopword_proportion"] = 0
    
    if len(script_lengths[character]) == 1:
        details[character]["sd_script_length"] = float('inf') # if only one script available for this character, SD is "infinite"
    else:
        details[character]["sd_script_length"] = stdev(script_lengths[character])
    
    for token in concat[character]:
        if token in stopwords_list:
            details[character]["stopword_proportion"] += 1 # if contained in stopword list, increment number of stopwords per char
    
    details[character]["stopword_proportion"] /= len(concat[character]) # divide by the total number of tokens

pd.DataFrame.from_dict(details, orient="index") # display table

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pake10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pake10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,token_num,vocab_size,avg_script_length,stopword_proportion,sd_script_length
GANDALF,3159,883,15.409756,0.528332,17.825898
SAM,2185,563,10.069124,0.577117,11.185300
FRODO,1828,494,8.088496,0.591357,8.203585
ARAGORN,1406,534,7.518717,0.497866,7.892655
GOLLUM,1312,362,9.791045,0.503811,9.254200
...,...,...,...,...,...
OLD MAN,1,1,1.000000,0.000000,inf
FRODO VOICE,1,1,1.000000,0.000000,inf
MRS BRACEGIRDLE,1,1,1.000000,0.000000,inf
PROUDFOOT HOBBIT,1,1,1.000000,0.000000,inf


# Task 2

This code implements the MLP model presented in https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing.

Some experimentation was performed on different values for maximum iteration count when training, however, max_iter = 20 is used for the data in the report. Additionally, iterCount = 20 was used for the report but since running it takes quite some time, iterCount = 1 has been set below.

In [20]:
# adapted and modified from
# https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/blob/master/code%20and%20data/mlp_simple.py


df = pd.read_csv(r"data/essays.csv", names=['author_id', 'essay', 'Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='cp1252')

x = df['essay'][1:]
x = x.str.lower() # to lowercase

def trainAndTest(max_iter=200):
    classifiers = []
    vectorizers = []
    
    choices = {
            0: ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
            1: ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
            2: ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
            3: ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
            4: ('Openness', ('relu', 'invscaling', 'lbfgs'))
        } # specs for every trait classifier (activation function etc.)

    for trait in range(5):
        y = df[choices[trait][0]][1:] # select essays associated with this trait
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11) # random split into train & test sets

        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer()
        vectorizers.append(vectorizer)
        
        xx_train = vectorizers[trait].fit_transform(x_train)
        xx_test = vectorizers[trait].transform(x_test)

        # specify and train a classifier for every trait
        classifiers.append(MLPClassifier(activation=choices[trait][1][0], alpha=0.0001, hidden_layer_sizes=(60),
                                   learning_rate=choices[trait][1][1], max_iter=max_iter, solver=choices[trait][1][2]))
        classifiers[trait].fit(xx_train, y_train)

        predictions = classifiers[trait].predict(xx_test) # predict for test set
        score = accuracy_score(y_test, predictions)
        print("Validation accuracy (on Essays set) for {}: {}".format(choices[trait][0], score))

    print("Training and validation done!")
    return classifiers, vectorizers

In [23]:
traitNames = ['Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
traitProbs = {} # a dict containing the probabilites of every trait for each character
iterCount = 1 # for report results, 20 was used
max_iter = 20 # maximum allowed iterations for MLPClassifier

for character in concat:
    traitProbs[character] = [0] * 5

for _ in range(iterCount):
    classifiers, vectorizers = trainAndTest(max_iter)
    print()
    
    for trait in range(5):
        xx_test = vectorizers[trait].transform(raw_scripts.values())
        prob = classifiers[trait].predict_proba(xx_test)[:,1] # predict probability for every trait

        for i, character in enumerate(concat):
            traitProbs[character][trait] += prob[i] 

traitProbs = {key: [x / iterCount for x in value] for key, value in traitProbs.items()}
traitProbsDf = pd.DataFrame.from_dict(traitProbs, orient="index") # display table containing the probabilites of every trait for each character
traitProbsDf.columns = traitNames
traitProbsDf.to_csv(f"tool1_results/traitProbsDf_max_iter_{max_iter}.csv") # Save results for later comparison
display(traitProbsDf)

Validation accuracy (on Essays set) for Extraversion: 0.5883306320907618
Validation accuracy (on Essays set) for Neuroticism: 0.5980551053484603
Validation accuracy (on Essays set) for Agreeableness: 0.5672609400324149
Validation accuracy (on Essays set) for Conscientiousness: 0.5494327390599676
Validation accuracy (on Essays set) for Openness: 0.6175040518638574
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5623987034035657
Validation accuracy (on Essays set) for Neuroticism: 0.593192868719611
Validation accuracy (on Essays set) for Agreeableness: 0.5656401944894651
Validation accuracy (on Essays set) for Conscientiousness: 0.5478119935170178
Validation accuracy (on Essays set) for Openness: 0.6175040518638574
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5688816855753647
Validation accuracy (on Essays set) for Neuroticism: 0.5883306320907618
Validation accuracy (on Essays set) for Agreeableness: 0.55267

Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,0.276701,0.847579,0.091099,0.026562,0.349064
SAM,0.670854,0.548407,0.539521,0.563860,0.824807
FRODO,0.378763,0.386645,0.515885,0.651671,0.875948
ARAGORN,0.722903,0.356838,0.524831,0.401590,0.929799
GOLLUM,0.146540,0.702587,0.060633,0.605640,0.971376
...,...,...,...,...,...
OLD MAN,0.877436,0.499375,0.504798,0.041813,0.551744
FRODO VOICE,0.981105,0.992087,0.994149,0.179404,0.874160
MRS BRACEGIRDLE,0.742223,0.598134,0.412390,0.039532,0.735342
PROUDFOOT HOBBIT,0.909902,0.075057,0.572718,0.126787,0.881677


In [24]:
# load the tool1 tables and show the results for different values of max_iter
traitProbsDf_iter20 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_20.csv", index_col=0)
traitProbsDf_iter40 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_40.csv", index_col=0)
traitProbsDf_iter60 = pd.read_csv(r"tool1_results/traitProbsDf_max_iter_60.csv", index_col=0)

print("max_iter = 20")
display(traitProbsDf_iter20)
print("max_iter = 40")
display(traitProbsDf_iter40)
print("max_iter = 60")
display(traitProbsDf_iter60)

max_iter = 20


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,0.276701,0.847579,0.091099,0.026562,0.349064
SAM,0.670854,0.548407,0.539521,0.563860,0.824807
FRODO,0.378763,0.386645,0.515885,0.651671,0.875948
ARAGORN,0.722903,0.356838,0.524831,0.401590,0.929799
GOLLUM,0.146540,0.702587,0.060633,0.605640,0.971376
...,...,...,...,...,...
OLD MAN,0.877436,0.499375,0.504798,0.041813,0.551744
FRODO VOICE,0.981105,0.992087,0.994149,0.179404,0.874160
MRS BRACEGIRDLE,0.742223,0.598134,0.412390,0.039532,0.735342
PROUDFOOT HOBBIT,0.909902,0.075057,0.572718,0.126787,0.881677


max_iter = 40


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,1.971740e-05,3.062770e-03,8.258034e-18,7.227831e-11,0.000477
SAM,4.340072e-02,2.710225e-03,9.927110e-01,9.974547e-01,1.000000
FRODO,1.298630e-05,4.241547e-06,6.546092e-01,9.999169e-01,1.000000
ARAGORN,3.300864e-01,6.058608e-07,9.999925e-01,1.504740e-01,1.000000
GOLLUM,8.418653e-10,8.584797e-05,5.495751e-05,9.999546e-01,1.000000
...,...,...,...,...,...
OLD MAN,9.923012e-01,4.839631e-07,1.568052e-04,6.146898e-09,0.999346
FRODO VOICE,9.999445e-01,1.000000e+00,1.000000e+00,2.617008e-02,1.000000
MRS BRACEGIRDLE,4.597582e-01,5.302799e-06,5.163947e-04,1.104238e-08,0.999998
PROUDFOOT HOBBIT,3.901365e-03,4.648303e-14,1.517241e-01,6.012615e-03,0.999970


max_iter = 60


Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,1.992169e-07,0.060315,2.973447e-05,0.049990,0.032237
SAM,9.406453e-01,0.001172,8.229310e-01,0.999871,0.999976
FRODO,2.793154e-03,0.000002,4.763892e-01,0.999992,0.999963
ARAGORN,9.691601e-01,0.000001,9.572687e-01,0.052314,0.999998
GOLLUM,1.689428e-07,0.000027,1.816520e-02,0.999986,0.999976
...,...,...,...,...,...
OLD MAN,9.976604e-01,0.000164,3.141343e-01,0.050079,0.995814
FRODO VOICE,9.999993e-01,1.000000,1.000000e+00,0.843120,1.000000
MRS BRACEGIRDLE,5.858552e-01,0.000850,4.450408e-01,0.050058,0.999872
PROUDFOOT HOBBIT,6.264358e-04,0.000087,6.624255e-01,0.183884,0.949539


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import numpy as np


def drawPCAGraph(dataframe):
    """ Function for drawing PCA image with 2 principal components for given dataframe"""
    num_char = 10 # number of characters to be included in the scatter plot
    x = StandardScaler().fit_transform(dataframe[0:num_char].values)

    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x) # peform PCA
    print("Variance explained: {} percent".format(100 * pca.explained_variance_ratio_.cumsum()[1]))

    df = pd.DataFrame(principalComponents, index=list(concat.keys())[0:num_char], columns=['PC1','PC2'])
    fig = px.scatter(df, x='PC1', y='PC2', text=df.index) # plot according to the principal components

    fig.update_layout(
        font=dict(
            family="Arial",
            size=15,  # Set the font size here
            color="RebeccaPurple"
        ),
        height=550
    )

    fig.update_traces(textposition="top center")

    config = {
        'toImageButtonOptions': {
            'format': 'png',
            'height': 550,
            'width': 1100,
            'scale': 8
        }
    }
    fig.show(config=config)
    
drawPCAGraph(traitProbsDf_iter20)
drawPCAGraph(traitProbsDf_iter40)
drawPCAGraph(traitProbsDf_iter60)

Variance explained: 78.01479332716407 percent


Variance explained: 71.05746026082383 percent


Variance explained: 83.16193834087275 percent


# Task 3

In [26]:
# For this task, max_iter=20 will be used
similarities = {} # contains the pairwise cosine similarities for characters
num_char = 10 # number of characters to be included

for character1 in list(concat.keys())[0:num_char]:
    if not character1 in similarities:
        similarities[character1] = {}

    for character2 in list(concat.keys())[0:num_char]: # I know, CosSim(A,B)=CosSim(B,A)...
        similarities[character1][character2] = cosine_similarity(np.array(list(traitProbsDf_iter20.T.to_dict()[character1].values())).reshape(1, -1), np.array(list(traitProbsDf_iter20.T.to_dict()[character2].values())).reshape(1, -1))[0][0]

pd.DataFrame.from_dict(similarities, orient="index")

Unnamed: 0,GANDALF,SAM,FRODO,ARAGORN,GOLLUM,THEODEN,PIPPIN,GIMLI,BILBO,MERRY
GANDALF,1.0,0.729438,0.630349,0.658853,0.765936,0.720793,0.703277,0.57099,0.656181,0.777237
SAM,0.729438,1.0,0.970548,0.980957,0.858812,0.934109,0.962838,0.94541,0.934887,0.955516
FRODO,0.630349,0.970548,1.0,0.951464,0.896245,0.961189,0.981066,0.961069,0.920793,0.92164
ARAGORN,0.658853,0.980957,0.951464,1.0,0.812455,0.90104,0.960166,0.919058,0.909333,0.971248
GOLLUM,0.765936,0.858812,0.896245,0.812455,1.0,0.97894,0.915065,0.872903,0.893085,0.871726
THEODEN,0.720793,0.934109,0.961189,0.90104,0.97894,1.0,0.960072,0.950143,0.952778,0.917301
PIPPIN,0.703277,0.962838,0.981066,0.960166,0.915065,0.960072,1.0,0.912606,0.892219,0.968985
GIMLI,0.57099,0.94541,0.961069,0.919058,0.872903,0.950143,0.912606,1.0,0.980293,0.862717
BILBO,0.656181,0.934887,0.920793,0.909333,0.893085,0.952778,0.892219,0.980293,1.0,0.881425
MERRY,0.777237,0.955516,0.92164,0.971248,0.871726,0.917301,0.968985,0.862717,0.881425,1.0


# Task 4

Please see tasks_tool2.ipynb and tasks_tool3.ipynb.

# Task 5

In [27]:
probVectors = []
dialogue = []

for index, row in script.iterrows(): # go through the entries one by one for clarity
    line = str(row["dialog"]).lower()
    dialogue.append(line)

for trait in range(0, 5):
    xx_test = vectorizers[trait].transform(dialogue)
    probVectors.append(classifiers[trait].predict_proba(xx_test)[:,1]) # predict probability for every trait

traitVectors = []

for index in range(len(probVectors[0])):
    traitVector = [probVectors[0][index], probVectors[1][index], probVectors[2][index], probVectors[3][index], probVectors[4][index]]
    traitVectors.append(traitVector)

scores = pd.DataFrame(traitVectors)
scores.to_csv("traitVectors_tool1.csv") # save the scores of each script in a "databse" file
print("Done!")

Done!


# Task 6

In [28]:
dominatingTrait = {}

for character in concat:
    dominatingTrait[character] = traitProbs[character].index(max(traitProbs[character])) # dominant trait for every char

dominatingProportion = {}

lista = list(script.iterrows())
traits = {}

for i, score in enumerate(traitVectors):
    dominantTrait = score.index(max(score)) # dominant trait of the script
    character = lista[i][1]["char"].strip().replace("(", "") # the character of this script

    if not character in dominatingProportion:
        dominatingProportion[character] = {"Score 1": 0, "Score 2": 0, "Most frequent trait": 0}
        traits[character] = []

    traits[character].append(dominantTrait)
    
    if dominantTrait == dominatingTrait[character]: # it's a match!
        dominatingProportion[character]["Score 1"] += 1

totalCounts = {}
for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in totalCounts:
        totalCounts[character] = 0
        
    totalCounts[character] += 1

for character in totalCounts:
    dominatingProportion[character]["Score 1"] /= totalCounts[character] # for each character, divide the count by the total number of scripts for this char
    
    freqTrait = max(set(traits[character]), key=traits[character].count)
    dominatingProportion[character]["Score 2"] = traits[character].count(freqTrait) / len(traits[character])
    dominatingProportion[character]["Most frequent trait"] = traitNames[freqTrait]
    
df = pd.DataFrame.from_dict(dominatingProportion, orient="index")
df = df.reindex(concat.keys())
display(df)

Unnamed: 0,Score 1,Score 2,Most frequent trait
GANDALF,0.053659,0.804878,Extraversion
SAM,0.069124,0.709677,Extraversion
FRODO,0.057522,0.699115,Extraversion
ARAGORN,0.032086,0.860963,Extraversion
GOLLUM,0.029851,0.761194,Extraversion
...,...,...,...
OLD MAN,0.000000,1.000000,Agreeableness
FRODO VOICE,0.000000,1.000000,Extraversion
MRS BRACEGIRDLE,1.000000,1.000000,Extraversion
PROUDFOOT HOBBIT,1.000000,1.000000,Extraversion


# Task 7

In [29]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import plotly.graph_objects as pg
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import nltk

nltk.download('vader_lexicon')
sentimentCounts = {}
traitCounts = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]}

for index, row in script.iterrows(): # go through the entries one by one
    character = row["char"].strip().replace("(", "")
    dialog = str(row["dialog"])
    domTrait = traitVectors[index].index(max(traitVectors[index]))
    
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(dialog) # obtain the scores for this script
    polarity = ss["pos"] - ss["neg"] # total polarity = positive - negative
    
    if character not in sentimentCounts:
        sentimentCounts[character] = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]} # initialize dictionary
    
    if polarity > 0:
        sentimentCounts[character]["pos"][domTrait] += 1
        traitCounts["pos"][domTrait] += 1
    elif polarity < 0:
        sentimentCounts[character]["neg"][domTrait] += 1
        traitCounts["neg"][domTrait] += 1
    elif polarity == 0:
        sentimentCounts[character]["neu"][domTrait] += 1
        traitCounts["neu"][domTrait] += 1

traits = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
fig = make_subplots(rows=4, cols=3, start_cell="top-left", subplot_titles=list(concat.keys())[0:10])

for i, character in enumerate(list(concat.keys())[0:10]):
    if i == 0:
        leg = True
    else:
        leg = False

    fig.add_trace(go.Bar(
        name = 'Neutral',
        marker_color = 'cornflowerblue',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neu"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Positive',
        marker_color = 'mediumseagreen',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["pos"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Negative',
        marker_color = 'crimson',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neg"]), row=i // 3 + 1, col=i % 3 + 1)

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 1.2*1000,
        'width': 1000,
        'scale': 10
    }
}
    
fig.update_layout(barmode='stack')   
fig.show(config=config)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pake10\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [30]:
plot = pg.Figure(data=[go.Bar(
    name = 'Neutral',
    marker_color = 'cornflowerblue',
    x = traits,
    y = traitCounts["neu"]
   ),
                       go.Bar(
    name = 'Positive',
        marker_color = 'mediumseagreen',
        x = traits,
        y = traitCounts["pos"]
   ),
   go.Bar(
   name = 'Negative',
        marker_color = 'crimson',
        x = traits,
        y = traitCounts["neg"]
   )
])

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 500,
        'width': 890,
        'scale': 10
    }
}

plot.update_layout(barmode='stack')
plot.show(config=config)