In [1]:
import os
import json
from collections import Counter
import pandas as pd
import numpy as np
import torch
from movieparser.scriptparser import ScriptParser
from movieparser.scriptloader import label2id
from IPython.display import display
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from tqdm.notebook import tqdm
from movieparser.parse_scripts_noindent import parse_lines
from movieparser.robust_parser import MovieParser

In [23]:
# Create the robust parser instance once; later cells call `movieparser.parse(script)`
# on each screenplay's list of lines.
# NOTE(review): this cell's recorded execution count (23) is higher than that of cells
# below which already use `movieparser` — confirm the notebook survives Restart & Run All.
movieparser = MovieParser()

In [4]:
# Tag every coreference screenplay with the robust parser and save one CSV per movie
# (columns: "text" = the raw line, "tag" = the parser's tag for that line).
coref_screenplay_dir = "data/SAIL_coref_screenplays/screenplays"
coref_parsed_dir = "data/SAIL_coref_screenplays/robust_parsed"

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs(coref_parsed_dir, exist_ok=True)

files = os.listdir(coref_screenplay_dir)

for file in files:
    print(file)
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(coref_screenplay_dir, file)) as script_file:
        script = script_file.read().splitlines(keepends=False)
    tags = movieparser.parse(script)
    # The CSV pairs each line with its tag, so the two sequences must align one-to-one.
    assert len(script) == len(tags), "{}: {} lines but {} tags".format(file, len(script), len(tags))
    df = pd.DataFrame({"text": script, "tag": tags})
    df.to_csv(os.path.join(coref_parsed_dir, file.replace(".txt", ".csv")), index=False)

quiet_place.txt
prestige.txt
zootopia.txt
dead_poets_society.txt
joker.txt
john_wick.txt
sicario.txt
avengers_endgame.txt


In [5]:
# Combine, per movie, the robust parser's tags (read back from the per-movie CSVs) with the
# tags produced by `parse_lines`, and write a single comparison table to parse.csv.
dfs = []

for file in files:
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join("data/SAIL_coref_screenplays/screenplays", file)) as script_file:
        script = script_file.read().splitlines(keepends=False)
    tags = parse_lines(script)
    # dtype=str + keep_default_na=False: screenplay lines such as "NA", "null" or "None"
    # must stay literal text instead of being converted to NaN (and then written out blank).
    df = pd.read_csv(
        os.path.join("data/SAIL_coref_screenplays/robust_parsed", file.replace(".txt", ".csv")),
        index_col=None,
        dtype=str,
        keep_default_na=False,
    )
    # Assigning a list raises ValueError if its length differs from the number of rows,
    # which guards against the CSV and the script getting out of sync.
    df["rule"] = tags
    # splitext keeps the full stem even when the movie name itself contains a dot.
    df["movie"] = os.path.splitext(file)[0]
    dfs.append(df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of repeating
# each movie's own row numbers.
df = pd.concat(dfs, ignore_index=True)
df = df[["movie", "text", "tag", "rule"]]
df.to_csv("data/SAIL_coref_screenplays/parse.csv", index=False)

In [15]:
# Load the two cross-validation result files (named for seqlen 100 and seqlen 10).
# Context managers close the file handles instead of leaving them to the garbage collector.
with open("results/cross_val/seqlen100_lomo-all_biTrue.json") as result_file:
    result100 = json.load(result_file)
with open("results/cross_val/seqlen10_lomo-all_biTrue.json") as result_file:
    result10 = json.load(result_file)

In [19]:
# Print the micro-F1 of both result sets side by side for epochs 1 through 10.
# NOTE(review): the "1000000" key is taken as-is from the result files — its meaning
# is not visible here; confirm against the code that wrote them.
for epoch in range(1, 11):
    epoch_key = "epoch_{}".format(epoch)
    f1_seqlen10 = result10[epoch_key]["1000000"]["micro"]["f1"]
    f1_seqlen100 = result100[epoch_key]["1000000"]["micro"]["f1"]
    print("epoch {:2d}: seqlen=10 F1={:.3f}, seqlen=100 F1={:.3f}".format(epoch, f1_seqlen10, f1_seqlen100))

epoch  1: seqlen=10 F1=0.959, seqlen=100 F1=0.809
epoch  2: seqlen=10 F1=0.967, seqlen=100 F1=0.915
epoch  3: seqlen=10 F1=0.967, seqlen=100 F1=0.945
epoch  4: seqlen=10 F1=0.965, seqlen=100 F1=0.957
epoch  5: seqlen=10 F1=0.965, seqlen=100 F1=0.961
epoch  6: seqlen=10 F1=0.966, seqlen=100 F1=0.964
epoch  7: seqlen=10 F1=0.962, seqlen=100 F1=0.967
epoch  8: seqlen=10 F1=0.966, seqlen=100 F1=0.966
epoch  9: seqlen=10 F1=0.965, seqlen=100 F1=0.968
epoch 10: seqlen=10 F1=0.966, seqlen=100 F1=0.968


In [20]:
# Load results/data.csv into a DataFrame. With index_col=None the file's leading unnamed
# column is kept as an ordinary column, which is why "Unnamed: 0" shows up in the
# displayed output alongside movie, line_no, text, label and error.
# NOTE(review): this rebinds `df`, which an earlier cell used for the combined parse table.
df = pd.read_csv("results/data.csv", index_col=None)

In [21]:
# Show the loaded frame as the cell output (pandas elides the middle rows in the repr).
df

Unnamed: 0,movie,line_no,text,label,error
0,44_inch_chest,1,out to wear ... whatever combination - it,D,Original
1,44_inch_chest,2,works! - You look superb! ... And your,D,Original
2,44_inch_chest,3,underw ear - immac ulate ! 100 % cot ton!,D,Original
3,44_inch_chest,4,Dazzlin'!... Not like my pinky grey-y,D,Original
4,44_inch_chest,5,"things! Nah, you've just got it - good at",D,Original
...,...,...,...,...,...
224255,xmen,298,47 Now if youll excuse me... I have a 33,D,NoEmptyLines+DialogueExpressions
224256,xmen,299,train to catch.,D,NoEmptyLines+DialogueExpressions
224257,xmen,300,73 EH 4,M,NoEmptyLines+DialogueExpressions
224258,xmen,301,88 train - night 74,S,NoEmptyLines+DialogueExpressions


In [24]:
# Tag every annotation screenplay with the robust parser and write the tags, one per line,
# to "<movie>_tags.txt"; print the line and tag counts per file as a sanity check.
annotation_screenplay_dir = "data/SAIL_annotation_screenplays/screenplays"
annotation_parsed_dir = "data/SAIL_annotation_screenplays/parsed-robust-screenplays"

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs(annotation_parsed_dir, exist_ok=True)

files = os.listdir(annotation_screenplay_dir)

for file in files:
    # Context managers close both handles; for the output file this also guarantees
    # the tags are flushed to disk before the next iteration.
    with open(os.path.join(annotation_screenplay_dir, file)) as script_file:
        script = script_file.read().splitlines()
    parsed_file = os.path.join(annotation_parsed_dir, file.replace(".txt", "_tags.txt"))
    tags = movieparser.parse(script)
    with open(parsed_file, "w") as tags_file:
        tags_file.write("\n".join(tags))
    print(file, len(script), len(tags))

pandorum.txt 7891 7891
stuntman_the.txt 7957 7957
willow.txt 6009 6009
extract.txt 6305 6305
broken_embraces.txt 13227 13227
men_in_black_3.txt 6041 6041
mirrors.txt 6594 6594
up_in_the_air.txt 6235 6235
gamer.txt 6473 6473
grosse_pointe_blank.txt 5117 5117
machete.txt 4736 4736
man_who_knew_too_much_the.txt 9591 9591
i_am_number_four.txt 6363 6363
dry_white_season_a.txt 8810 8810
nine.txt 5497 5497
shawshank_redemption_the.txt 6503 6503
bodyguard.txt 6940 6940
custody.txt 5499 5499
spartan.txt 7537 7537
xmen.txt 6461 6461
up.txt 7765 7765
flight.txt 8126 8126
american_psycho.txt 5374 5374
burn_after_reading.txt 6786 6786
changeling.txt 8118 8118
strange_days.txt 7616 7616
true_romance.txt 5629 5629
wolf_of_wall_street_the.txt 8014 8014
star_trek_01_the_motion_picture.txt 7421 7421
event_horizon.txt 6515 6515
lord_of_the_rings_fellowship_of.txt 6949 6949
suspect_zero.txt 6968 6968
kids.txt 3745 3745
bounty_hunter_the.txt 6410 6410
memento.txt 7911 7911
basic_instinct.txt 5895 5895
cand