# Task 1

In [1]:
import pandas as pd
from statistics import mean, stdev
import contractions
import nltk

nltk.download("stopwords")
nltk.download("punkt")

#pd.set_option('display.max_rows', None) # uncomment to display all rows at once

script = pd.read_csv("data/lotr_scripts.csv", encoding='utf-8') # load the data
concat = {} # stores the concatenated scripts for each character

script_lengths = {} # individual script lengths grouped by character
stopwords_list = set(nltk.corpus.stopwords.words("english"))

raw_scripts = {}

for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in concat:
        concat[character] = []
        script_lengths[character] = []
        raw_scripts[character] = ""
    
    raw_scripts[character] += str(row["dialog"]).lower() + " "
    dialogue = contractions.fix(str(row["dialog"])) # can't -> can not etc.
    words = nltk.word_tokenize(dialogue) # tokenize
    words = [word.lower() for word in words if word.isalpha()] # to lowercase if the "word" consists of alphabet letters
    
    script_lengths[character].append(len(words)) # length of the individual script
    concat[character].extend(words) # add to the concatenated list for this character

concat = dict(sorted(concat.items(), key=lambda i: -len(i[1]))) # sort by number of tokens in descending order
details = {} # dictionary containing the derived characteristics for every character

for character in concat:
    if not character in details:
        details[character] = {}

    details[character]["token_num"] = len(concat[character]) # total number of tokens
    details[character]["vocab_size"] = len(set(concat[character])) # vocabulary size = number of unique tokens
    details[character]["avg_script_length"] = mean(script_lengths[character]) # mean of script lengths
    details[character]["stopword_proportion"] = 0
    
    if len(script_lengths[character]) == 1:
        details[character]["sd_script_length"] = float('inf') # if only one script available for this character, SD is "infinite"
    else:
        details[character]["sd_script_length"] = stdev(script_lengths[character])
    
    for token in concat[character]:
        if token in stopwords_list:
            details[character]["stopword_proportion"] += 1 # if contained in stopword list, increment number of stopwords per char
    
    details[character]["stopword_proportion"] /= len(concat[character]) # divide by the total number of tokens

pd.DataFrame.from_dict(details, orient="index") # display table

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/syomasa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/syomasa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,token_num,vocab_size,avg_script_length,stopword_proportion,sd_script_length
GANDALF,3159,883,15.409756,0.528332,17.825898
SAM,2185,563,10.069124,0.577117,11.185300
FRODO,1828,494,8.088496,0.591357,8.203585
ARAGORN,1406,534,7.518717,0.497866,7.892655
GOLLUM,1312,362,9.791045,0.503811,9.254200
...,...,...,...,...,...
OLD MAN,1,1,1.000000,0.000000,inf
FRODO VOICE,1,1,1.000000,0.000000,inf
MRS BRACEGIRDLE,1,1,1.000000,0.000000,inf
PROUDFOOT HOBBIT,1,1,1.000000,0.000000,inf


# Task 2

This code implements the MLP model presented in https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing.

In [7]:
# adapted and modified from
# https://github.com/desaichirayu/Personality-Attribution-using-Natural-Language-Processing/blob/master/code%20and%20data/mlp_simple.py
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

df = pd.read_csv(r"data/essays.csv", names=['author_id', 'essay', 'Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness'], encoding='cp1252')

x = df['essay'][1:]
x = x.str.lower() # to lowercase

def trainAndTest():
    classifiers = []
    vectorizers = []
    
    choices = {
            0: ('Extraversion', ('tanh', 'adaptive', 'lbfgs')),
            1: ('Neuroticism', ('tanh', 'adaptive', 'lbfgs')),
            2: ('Agreeableness', ('tanh', 'adaptive', 'lbfgs')),
            3: ('Conscientiousness', ('relu', 'invscaling', 'lbfgs')),
            4: ('Openness', ('relu', 'invscaling', 'lbfgs'))
        } # specs for every trait classifier (activation function etc.)

    for trait in range(5):
        y = df[choices[trait][0]][1:] # select essays associated with this trait
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11) # random split into train & test sets

        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer()
        vectorizers.append(vectorizer)
        
        xx_train = vectorizers[trait].fit_transform(x_train)
        xx_test = vectorizers[trait].transform(x_test)

        # specify and train a classifier for every trait
        classifiers.append(MLPClassifier(activation=choices[trait][1][0], alpha=0.0001, hidden_layer_sizes=(60),
                                   learning_rate=choices[trait][1][1], max_iter=200, solver=choices[trait][1][2]))
        classifiers[trait].fit(xx_train, y_train)

        predictions = classifiers[trait].predict(xx_test) # predict for test set
        score = accuracy_score(y_test, predictions)
        print("Validation accuracy (on Essays set) for {}: {}".format(choices[trait][0], score))

    print("Training and validation done!")
    return classifiers, vectorizers

In [8]:
traitNames = ['Extraversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
traitProbs = {} # a dict containing the probabilites of every trait for each character
iterCount = 20 # for report results, 20 was used

for character in concat:
    traitProbs[character] = [0] * 5

for _ in range(iterCount):
    classifiers, vectorizers = trainAndTest()
    print()
    
    for trait in range(5):
        xx_test = vectorizers[trait].transform(raw_scripts.values())
        prob = classifiers[trait].predict_proba(xx_test)[:,1] # predict probability for every trait

        for i, character in enumerate(concat):
            traitProbs[character][trait] += prob[i] 

traitProbs = {key: [x / iterCount for x in value] for key, value in traitProbs.items()}
traitProbsDf = pd.DataFrame.from_dict(traitProbs, orient="index") # display table containing the probabilites of every trait for each character
traitProbsDf.columns = traitNames
display(traitProbsDf)

Validation accuracy (on Essays set) for Extraversion: 0.5688816855753647
Validation accuracy (on Essays set) for Neuroticism: 0.526742301458671
Validation accuracy (on Essays set) for Agreeableness: 0.5445705024311183
Validation accuracy (on Essays set) for Conscientiousness: 0.5218800648298217
Validation accuracy (on Essays set) for Openness: 0.5948136142625607
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.539708265802269
Validation accuracy (on Essays set) for Neuroticism: 0.546191247974068
Validation accuracy (on Essays set) for Agreeableness: 0.5510534846029174
Validation accuracy (on Essays set) for Conscientiousness: 0.513776337115073
Validation accuracy (on Essays set) for Openness: 0.6126418152350082
Training and validation done!

Validation accuracy (on Essays set) for Extraversion: 0.5494327390599676
Validation accuracy (on Essays set) for Neuroticism: 0.546191247974068
Validation accuracy (on Essays set) for Agreeableness: 0.547811993

Unnamed: 0,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
GANDALF,7.953415e-07,0.026782,2.377689e-06,0.000011,1.244903e-07
SAM,9.313602e-01,0.001093,7.411535e-01,1.000000,1.000000e+00
FRODO,7.089671e-02,0.000004,2.829793e-01,1.000000,1.000000e+00
ARAGORN,9.775692e-01,0.000012,9.736260e-01,0.163170,1.000000e+00
GOLLUM,1.113395e-06,0.000071,1.571681e-02,1.000000,1.000000e+00
...,...,...,...,...,...
OLD MAN,9.959163e-01,0.000016,1.678231e-01,0.000074,9.999341e-01
FRODO VOICE,9.999997e-01,1.000000,1.000000e+00,0.805919,1.000000e+00
MRS BRACEGIRDLE,6.176272e-01,0.000078,3.065039e-01,0.000061,1.000000e+00
PROUDFOOT HOBBIT,4.664766e-02,0.000001,6.092505e-01,0.133303,9.510823e-01


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import numpy as np

num_char = 10 # number of characters to be included in the scatter plot
x = StandardScaler().fit_transform(traitProbsDf[0:num_char].values)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x) # peform PCA
print("Variance explained: {} percent".format(100 * pca.explained_variance_ratio_.cumsum()[1]))

df = pd.DataFrame(principalComponents, index=list(concat.keys())[0:num_char], columns=['PC1','PC2'])
fig = px.scatter(df, x='PC1', y='PC2', text=df.index) # plot according to the principal components

fig.update_layout(
    font=dict(
        family="Arial",
        size=15,  # Set the font size here
        color="RebeccaPurple"
    ),
    height=550
)

fig.update_traces(textposition="top center")

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 550,
        'width': 1100,
        'scale': 8
    }
}

fig.show(config=config)

Variance explained: 85.46298057477979 percent


# Task 3

In [10]:
similarities = {} # contains the pairwise cosine similarities for characters
num_char = 10

for character1 in list(concat.keys())[0:num_char]:
    if not character1 in similarities:
        similarities[character1] = {}

    for character2 in list(concat.keys())[0:num_char]: # I know, CosSim(A,B)=CosSim(B,A)...
        similarities[character1][character2] = cosine_similarity(np.array(traitProbs[character1]).reshape(1, -1), np.array(traitProbs[character2]).reshape(1, -1))[0][0]

pd.DataFrame.from_dict(similarities, orient="index")

Unnamed: 0,GANDALF,SAM,FRODO,ARAGORN,GOLLUM,THEODEN,PIPPIN,GIMLI,BILBO,MERRY
GANDALF,1.0,0.000857,0.000297,0.000115,0.000332,0.005626,0.000321,0.0037,0.000817,0.000116
SAM,0.000857,1.0,0.852623,0.883414,0.769493,0.765222,0.90849,0.80486,0.849259,0.719144
FRODO,0.000297,0.852623,1.0,0.610078,0.981497,0.979434,0.912225,0.981042,0.9699,0.853286
ARAGORN,0.000115,0.883414,0.610078,1.0,0.486775,0.480717,0.782023,0.527339,0.583445,0.694941
GOLLUM,0.000332,0.769493,0.981497,0.486775,1.0,0.999928,0.817162,0.99675,0.980397,0.83437
THEODEN,0.005626,0.765222,0.979434,0.480717,0.999928,1.0,0.811023,0.996768,0.980449,0.832394
PIPPIN,0.000321,0.90849,0.912225,0.782023,0.817162,0.811023,1.0,0.820075,0.817658,0.786666
GIMLI,0.0037,0.80486,0.981042,0.527339,0.99675,0.996768,0.820075,1.0,0.993065,0.831323
BILBO,0.000817,0.849259,0.9699,0.583445,0.980397,0.980449,0.817658,0.993065,1.0,0.818118
MERRY,0.000116,0.719144,0.853286,0.694941,0.83437,0.832394,0.786666,0.831323,0.818118,1.0


# Task 4

Please see tasks_tool2.ipynb and tasks_tool3.ipynb.

# Task 5

In [11]:
probVectors = []
dialogue = []

for index, row in script.iterrows(): # go through the entries one by one for clarity
    line = str(row["dialog"]).lower()
    dialogue.append(line)

for trait in range(0, 5):
    xx_test = vectorizers[trait].transform(dialogue)
    probVectors.append(classifiers[trait].predict_proba(xx_test)[:,1]) # predict probability for every trait

traitVectors = []

for index in range(len(probVectors[0])):
    traitVector = [probVectors[0][index], probVectors[1][index], probVectors[2][index], probVectors[3][index], probVectors[4][index]]
    traitVectors.append(traitVector)

scores = pd.DataFrame(traitVectors)
scores.to_csv("traitVectors_tool1.csv") # save the scores of each script in a "databse" file
print("Done!")

Done!


# Task 6

In [12]:
dominatingTrait = {}

for character in concat:
    dominatingTrait[character] = traitProbs[character].index(max(traitProbs[character])) # dominant trait for every char

dominatingProportion = {}

lista = list(script.iterrows())
traits = {}

for i, score in enumerate(traitVectors):
    dominantTrait = score.index(max(score)) # dominant trait of the script
    character = lista[i][1]["char"].strip().replace("(", "") # the character of this script

    if not character in dominatingProportion:
        dominatingProportion[character] = {"Score 1": 0, "Score 2": 0, "Most frequent trait": 0}
        traits[character] = []

    traits[character].append(dominantTrait)
    
    if dominantTrait == dominatingTrait[character]: # it's a match!
        dominatingProportion[character]["Score 1"] += 1

totalCounts = {}
for index, row in script.iterrows(): # go through the entries one by one for clarity (of course, could be implemented more compactly with nice pandas oneliners)
    character = row["char"].strip().replace("(", "")
    
    if not character in totalCounts:
        totalCounts[character] = 0
        
    totalCounts[character] += 1

for character in totalCounts:
    dominatingProportion[character]["Score 1"] /= totalCounts[character] # for each character, divide the count by the total number of scripts for this char
    
    freqTrait = max(set(traits[character]), key=traits[character].count)
    dominatingProportion[character]["Score 2"] = traits[character].count(freqTrait) / len(traits[character])
    dominatingProportion[character]["Most frequent trait"] = traitNames[freqTrait]
    
df = pd.DataFrame.from_dict(dominatingProportion, orient="index")
df = df.reindex(concat.keys())
display(df)

Unnamed: 0,Score 1,Score 2,Most frequent trait
GANDALF,0.029268,0.385366,Extraversion
SAM,0.368664,0.368664,Openness
FRODO,0.070796,0.393805,Openness
ARAGORN,0.267380,0.470588,Extraversion
GOLLUM,0.253731,0.365672,Extraversion
...,...,...,...
OLD MAN,0.000000,1.000000,Agreeableness
FRODO VOICE,0.000000,1.000000,Extraversion
MRS BRACEGIRDLE,0.000000,1.000000,Extraversion
PROUDFOOT HOBBIT,0.000000,1.000000,Extraversion


# Task 7

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import plotly.graph_objects as pg
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import nltk

nltk.download('vader_lexicon')
sentimentCounts = {}
traitCounts = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]}

for index, row in script.iterrows(): # go through the entries one by one
    character = row["char"].strip().replace("(", "")
    dialog = str(row["dialog"])
    domTrait = traitVectors[index].index(max(traitVectors[index]))
    
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(dialog) # obtain the scores for this script
    polarity = ss["pos"] - ss["neg"] # total polarity = positive - negative
    
    if character not in sentimentCounts:
        sentimentCounts[character] = {"pos": [0,0,0,0,0], "neg": [0,0,0,0,0], "neu": [0,0,0,0,0]} # initialize dictionary
    
    if polarity > 0:
        sentimentCounts[character]["pos"][domTrait] += 1
        traitCounts["pos"][domTrait] += 1
    elif polarity < 0:
        sentimentCounts[character]["neg"][domTrait] += 1
        traitCounts["neg"][domTrait] += 1
    elif polarity == 0:
        sentimentCounts[character]["neu"][domTrait] += 1
        traitCounts["neu"][domTrait] += 1

traits = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
fig = make_subplots(rows=4, cols=3, start_cell="top-left", subplot_titles=list(concat.keys())[0:10])

for i, character in enumerate(list(concat.keys())[0:10]):
    if i == 0:
        leg = True
    else:
        leg = False

    fig.add_trace(go.Bar(
        name = 'Neutral',
        marker_color = 'cornflowerblue',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neu"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Positive',
        marker_color = 'mediumseagreen',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["pos"]), row=i // 3 + 1, col=i % 3 + 1)

    fig.add_trace(go.Bar(
        name = 'Negative',
        marker_color = 'crimson',
        showlegend=leg,
        x = traits,
        y = sentimentCounts[character]["neg"]), row=i // 3 + 1, col=i % 3 + 1)

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 1.2*1000,
        'width': 1000,
        'scale': 10
    }
}
    
fig.update_layout(barmode='stack')   
fig.show(config=config)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pake10\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
plot = pg.Figure(data=[go.Bar(
    name = 'Neutral',
    marker_color = 'cornflowerblue',
    x = traits,
    y = traitCounts["neu"]
   ),
                       go.Bar(
    name = 'Positive',
        marker_color = 'mediumseagreen',
        x = traits,
        y = traitCounts["pos"]
   ),
   go.Bar(
   name = 'Negative',
        marker_color = 'crimson',
        x = traits,
        y = traitCounts["neg"]
   )
])

config = {
    'toImageButtonOptions': {
        'format': 'png',
        'height': 500,
        'width': 890,
        'scale': 10
    }
}

plot.update_layout(barmode='stack')
plot.show(config=config)