In [1]:
import json
import os

from collections import Counter
from data_processing import pers_labels

In [2]:
# Workspace root that every data path in this notebook is built from.
# Raises KeyError when the WORKSPACE_PATH environment variable is not set.
PWD = os.environ["WORKSPACE_PATH"]

## utility functions

In [3]:
# Folder holding the personality data files; keeps its trailing slash so that
# file names can be appended directly.
PERS_DATA_FOLDER = f"{PWD}/data/personality_data/"


def in_pers_data(data_path):
    """Return ``data_path`` prefixed with ``PERS_DATA_FOLDER``.

    Args:
        data_path: File name (or relative path) inside the personality data folder.

    Returns:
        The concatenated path string.
    """
    return f"{PERS_DATA_FOLDER}{data_path}"

In [4]:
def open_and_load_json(path, lines=False):
    """Load JSON (or JSON Lines) from ``path``, tolerating a missing file.

    Args:
        path: Location of the file to read.
        lines: When true, parse the file as JSON Lines (one JSON document per
            line) and return a list; otherwise parse the whole file as a single
            JSON document and merge it into a dict.

    Returns:
        A list of parsed documents when ``lines`` is true, else a dict. When
        ``path`` does not exist the empty container (``[]`` or ``{}``) is
        returned instead of raising.
    """
    obj = [] if lines else {}

    if os.path.exists(path):
        # Read-only mode: "r+" would additionally demand write permission.
        with open(path, "r") as fp:
            if lines:
                # Stream the file and skip blank lines, which json.loads rejects.
                obj += [json.loads(line) for line in fp if line.strip()]
            else:
                obj.update(json.load(fp))
    return obj

In [5]:
def save_json(path, obj, lines=False):
    """Write ``obj`` to ``path`` as JSON, or as JSON Lines when ``lines`` is true.

    Args:
        path: Destination file; created or truncated.
        obj: Object to serialise. With ``lines=True`` it must be an iterable
            whose items are serialised one per line.
        lines: Select JSON Lines output instead of a single JSON document.
    """
    with open(path, "w") as fp:
        if lines:
            # Terminate every document with a newline; writelines() adds none,
            # so without it all documents would run together on a single line.
            fp.writelines(json.dumps(item) + "\n" for item in obj)
        else:
            json.dump(obj, fp)

## percent of movies found

In [6]:
# Parse speakers.json from the cornell_movies data folder. The file is only
# read, so open it read-only ("r+" would additionally require write permission).
with open(f"{PWD}/data/cornell_movies/speakers.json", "r") as fp:
    speakers = json.load(fp)

In [7]:
# Distinct movie names (stripped and lower-cased) across all speakers, sorted.
movies = sorted(
    {
        speaker["meta"]["movie_name"].strip().lower()
        for speaker in speakers.values()
    }
)

# Mapping stored in movie_to_id.json; an empty dict when the file is missing.
movie_to_id_path = in_pers_data("movie_to_id.json")
movie_to_id = open_and_load_json(movie_to_id_path)

In [8]:
# Number of entries in movie_to_id relative to the number of distinct movie names.
len(movie_to_id) / len(movies)

0.7520259319286872

In [9]:
# Absolute number of entries in movie_to_id.
len(movie_to_id)

464

## percent of characters matched

In [10]:
# Resolve both file locations first, then load them; each load yields an
# empty dict when its file does not exist.
char_to_pers_path, movie_to_chars_path = map(
    in_pers_data, ("char_to_pers_votes.json", "movie_to_chars.json")
)
char_to_pers, movie_to_chars = map(
    open_and_load_json, (char_to_pers_path, movie_to_chars_path)
)

In [11]:
# Entries in char_to_pers relative to the total number of characters listed
# across all movies in movie_to_chars.
len(char_to_pers) / sum(len(chars) for chars in movie_to_chars.values())

0.3775914634146341

In [12]:
# Absolute number of entries in char_to_pers.
len(char_to_pers)

2477

In [13]:
# Total number of characters listed across all movies in movie_to_chars.
sum(len(chars) for chars in movie_to_chars.values())

6560

In [14]:
# Entries in char_to_pers per entry in movie_to_id. Computed from the loaded
# objects rather than from hard-coded counts so it stays in sync with the data.
len(char_to_pers) / len(movie_to_id)

5.338362068965517

## getting personality_data

In [15]:
# Personality table consumed by every distribution cell below, which iterate
# it row by row through `.iloc`. Built by the project-local pers_labels
# module; presumably a pandas DataFrame (TODO confirm).
char_pers = pers_labels.get_pers_df()

## mbpt distribution

In [16]:
# Tally the full MBPT label of every row; rows whose label is falsy (None)
# are counted under that label.
mbpt_dist = Counter(
    pers_labels.get_labels(row, pers_labels.MBPT) for row in char_pers.iloc
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'ESFJ': (122, 0.04925312878482035, 0.05367355917289925),
 'INFP': (127, 0.051271699636657245, 0.05587329520457545),
 'ESFP': (167, 0.06742026645135245, 0.07347118345798505),
 'ESTP': (224, 0.0904319741622931, 0.0985481742190937),
 'INTJ': (112, 0.04521598708114655, 0.04927408710954685),
 'ISTJ': (198, 0.07993540573274122, 0.08710954685437748),
 'ISTP': (145, 0.05853855470327009, 0.06379234491860977),
 'ENTP': (143, 0.05773112636253532, 0.06291245050593929),
 'INTP': (111, 0.04481227291077917, 0.048834139903211615),
 'ENFJ': (106, 0.04279370205894227, 0.04663440387153542),
 'ISFJ': (171, 0.06903512313282197, 0.075230972283326),
 'INFJ': (100, 0.04037141703673799, 0.04399472063352398),
 'ESTJ': (170, 0.06863140896245458, 0.07479102507699076),
 'ENFP': (107, 0.04319741622930965, 0.047074351077870655),
 'ISFP': (151, 0.060960839725474364, 0.0664320281566212),
 'ENTJ': (119, 0.04804198627371821, 0.052353717553893536),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

## big 5 distribution

In [17]:
# Tally the full Big 5 label of every row; rows whose label is falsy (None)
# are counted under that label.
big5_dist = Counter()

for row in char_pers.iloc:
    label = pers_labels.get_labels(row, pers_labels.BIG_5)
    big5_dist[label] += 1

# Both denominators come from big5_dist itself, so this cell no longer depends
# on whatever the MBPT counter of another cell currently holds.
total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
# For the falsy "no label" key the last ratio is not a share and can exceed 1.
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'SLUAN': (20, 0.008074283407347598, 0.020060180541624874),
 'RCUAI': (25, 0.010092854259184497, 0.025075225677031094),
 None: (1480, 0.5974969721437222, 1.4844533600802408),
 'SLUEN': (47, 0.018974566007266856, 0.04714142427281846),
 'RLOEI': (26, 0.010496568429551878, 0.026078234704112337),
 'RLOEN': (22, 0.008881711748082357, 0.022066198595787363),
 'RCUEN': (18, 0.0072668550666128385, 0.01805416248746239),
 'SCUAI': (46, 0.018570851836899476, 0.04613841524573721),
 'RLUAI': (28, 0.011303996770286637, 0.028084252758274825),
 'RCOEI': (47, 0.018974566007266856, 0.04714142427281846),
 'RLOAI': (34, 0.013726281792490917, 0.034102306920762285),
 'RLUAN': (33, 0.013322567622123537, 0.033099297893681046),
 'RCOAN': (54, 0.021800565199838515, 0.05416248746238716),
 'SLUAI': (28, 0.011303996770286637, 0.028084252758274825),
 'SCUEI': (42, 0.016955995155429955, 0.04212637913741224),
 'SCOEN': (24, 0.009689140088817117, 0.024072216649949848),
 'SCUAN': (31, 0.012515139281388777, 0.03109327983

## mbpt dimension distributions

### introversion vs. extraversion

In [18]:
# Tally position 0 of every MBPT label (the E/I letter); a falsy label (None)
# is counted as-is.
mbpt_dist = Counter(
    label[0] if label else label
    for label in (
        pers_labels.get_labels(row, pers_labels.MBPT) for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'E': (1158, 0.46750100928542593, 0.5094588649362076),
 'I': (1115, 0.4501412999596286, 0.4905411350637923),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### sensing vs. intuition

In [19]:
# Tally position 1 of every MBPT label (the S/N letter); a falsy label (None)
# is counted as-is.
mbpt_dist = Counter(
    label[1] if label else label
    for label in (
        pers_labels.get_labels(row, pers_labels.MBPT) for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'S': (1348, 0.5442067016552281, 0.5930488341399032),
 'N': (925, 0.3734356075898264, 0.4069511658600968),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### thinking vs. feeling

In [20]:
# Tally position 2 of every MBPT label (the T/F letter); a falsy label (None)
# is counted as-is.
mbpt_dist = Counter(
    label[2] if label else label
    for label in (
        pers_labels.get_labels(row, pers_labels.MBPT) for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'F': (1051, 0.42430359305611626, 0.462384513858337),
 'T': (1222, 0.4933387161889382, 0.537615486141663),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### judging vs. perceiving

In [21]:
# Tally position 3 of every MBPT label (the J/P letter); a falsy label (None)
# is counted as-is.
mbpt_dist = Counter(
    label[3] if label else label
    for label in (
        pers_labels.get_labels(row, pers_labels.MBPT) for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'J': (1098, 0.44327815906338314, 0.48306203255609326),
 'P': (1175, 0.4743641501816714, 0.5169379674439067),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

## big 5 dimension distribution

### social vs. reserved (extraversion)

In [22]:
# Tally position 0 of every Big 5 dimension label (the S/R letter); a falsy
# label (None) is counted as-is.
big5_dist = Counter(
    label[0] if label else label
    for label in (
        pers_labels.get_dim_labels(row, pers_labels.BIG_5) for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'S': (499, 0.20145337101332256, 0.5005015045135406),
 'R': (498, 0.2010496568429552, 0.49949849548645936),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

### limbic vs. calm (neuroticism)

In [23]:
# Tally position 1 of every Big 5 dimension label (the L/C letter); a falsy
# label (None) is counted as-is.
big5_dist = Counter(
    label[1] if label else label
    for label in (
        pers_labels.get_dim_labels(row, pers_labels.BIG_5) for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'L': (416, 0.16794509487283005, 0.4172517552657974),
 'C': (581, 0.2345579329834477, 0.5827482447342026),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

### organized vs. unstructured (conscientiousness)

In [24]:
# Tally position 2 of every Big 5 dimension label (the O/U letter); a falsy
# label (None) is counted as-is.
big5_dist = Counter(
    label[2] if label else label
    for label in (
        pers_labels.get_dim_labels(row, pers_labels.BIG_5) for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'U': (470, 0.18974566007266855, 0.47141424272818455),
 None: (1480, 0.5974969721437222, 1.4844533600802408),
 'O': (527, 0.2127573677836092, 0.5285857572718154)}

### agreeable vs. egocentric (agreeableness)

In [25]:
# Tally position 3 of every Big 5 dimension label (the A/E letter); a falsy
# label (None) is counted as-is.
big5_dist = Counter(
    label[3] if label else label
    for label in (
        pers_labels.get_dim_labels(row, pers_labels.BIG_5) for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'A': (504, 0.20347194186515946, 0.5055165496489469),
 None: (1480, 0.5974969721437222, 1.4844533600802408),
 'E': (493, 0.1990310859911183, 0.4944834503510532)}

### inquisitive vs. non-curious (openness to experience)

In [26]:
# Tally position 4 of every Big 5 dimension label (the I/N letter); a falsy
# label (None) is counted as-is.
big5_dist = Counter(
    label[4] if label else label
    for label in (
        pers_labels.get_dim_labels(row, pers_labels.BIG_5) for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'N': (468, 0.1889382317319338, 0.46940822467402205),
 'I': (529, 0.21356479612434395, 0.5305917753259779),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

## mbpt alternate dimension distributions

### introversion vs. extraversion

In [27]:
# Tally position 0 of every alternate MBPT dimension label (the E/I letter);
# a falsy label (None) is counted as-is.
mbpt_dist = Counter(
    label[0] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.MBPT)
        for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'E': (1156, 0.46669358094469116, 0.5085789705235372),
 'I': (1117, 0.45094872830036337, 0.4914210294764628),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### sensing vs. intuition

In [28]:
# Tally position 1 of every alternate MBPT dimension label (the S/N letter);
# a falsy label (None) is counted as-is.
mbpt_dist = Counter(
    label[1] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.MBPT)
        for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'S': (1356, 0.5474364150181671, 0.5965684117905852),
 'N': (917, 0.3702058942268874, 0.4034315882094149),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### thinking vs. feeling

In [29]:
# Tally position 2 of every alternate MBPT dimension label (the T/F letter);
# a falsy label (None) is counted as-is.
mbpt_dist = Counter(
    label[2] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.MBPT)
        for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'F': (1046, 0.4222850222042794, 0.4601847778266608),
 'T': (1227, 0.49535728704077514, 0.5398152221733392),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

### judging vs. perceiving

In [30]:
# Tally position 3 of every alternate MBPT dimension label (the J/P letter);
# a falsy label (None) is counted as-is.
mbpt_dist = Counter(
    label[3] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.MBPT)
        for row in char_pers.iloc
    )
)

total_count = sum(mbpt_dist.values())
valid_count = sum(count for key, count in mbpt_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in mbpt_dist.items()
}

{'J': (1095, 0.442067016552281, 0.48174219093708753),
 'P': (1178, 0.47557529269277354, 0.5182578090629124),
 None: (204, 0.08235769075494549, 0.08974923009238892)}

## big 5 alternate dimension distribution

### social vs. reserved (extraversion)

In [31]:
# Tally position 0 of every alternate Big 5 dimension label (the S/R letter);
# a falsy label (None) is counted as-is.
big5_dist = Counter(
    label[0] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.BIG_5)
        for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'S': (501, 0.20226079935405733, 0.5025075225677031),
 'R': (496, 0.20024222850222043, 0.4974924774322969),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

### limbic vs. calm (neuroticism)

In [32]:
# Tally position 1 of every alternate Big 5 dimension label (the L/C letter);
# a falsy label (None) is counted as-is.
big5_dist = Counter(
    label[1] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.BIG_5)
        for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'L': (419, 0.16915623738393218, 0.4202607823470411),
 'C': (578, 0.23334679047234558, 0.5797392176529589),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

### organized vs. unstructured (conscientiousness)

In [33]:
# Tally position 2 of every alternate Big 5 dimension label (the O/U letter);
# a falsy label (None) is counted as-is.
big5_dist = Counter(
    label[2] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.BIG_5)
        for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'U': (474, 0.19136051675413807, 0.4754262788365095),
 None: (1480, 0.5974969721437222, 1.4844533600802408),
 'O': (523, 0.2111425111021397, 0.5245737211634904)}

### agreeable vs. egocentric (agreeableness)

In [34]:
# Tally position 3 of every alternate Big 5 dimension label (the A/E letter);
# a falsy label (None) is counted as-is.
big5_dist = Counter(
    label[3] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.BIG_5)
        for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'A': (507, 0.20468308437626162, 0.5085255767301906),
 None: (1480, 0.5974969721437222, 1.4844533600802408),
 'E': (490, 0.19781994348001614, 0.49147442326980945)}

### inquisitive vs. non-curious (openness to experience)

In [35]:
# Tally position 4 of every alternate Big 5 dimension label (the I/N letter);
# a falsy label (None) is counted as-is.
big5_dist = Counter(
    label[4] if label else label
    for label in (
        pers_labels.get_alt_dim_labels(row, pers_labels.BIG_5)
        for row in char_pers.iloc
    )
)

total_count = sum(big5_dist.values())
valid_count = sum(count for key, count in big5_dist.items() if key)
# Per key: (count, count / all rows, count / rows with a truthy label).
{
    key: (count, count / total_count, count / valid_count)
    for key, count in big5_dist.items()
}

{'N': (470, 0.18974566007266855, 0.47141424272818455),
 'I': (527, 0.2127573677836092, 0.5285857572718154),
 None: (1480, 0.5974969721437222, 1.4844533600802408)}

: 