In [2]:
import os
import re
import pandas as pd
import numpy as np
import random
from parse_scripts_noindent import parse_lines
from collections import Counter
from matplotlib import pyplot as plt

## Add error functions

In [1]:
def remove_blank_lines_and_indents(lines, tags):
    """Drop whitespace-only lines and strip the surviving ones.

    Args:
        lines: iterable of script lines (strings).
        tags: iterable of tags aligned with ``lines``; pairs are formed with
            ``zip``, so any surplus items in the longer input are ignored.

    Returns:
        A ``(newlines, newtags)`` tuple of two lists: the stripped non-blank
        lines and the tags that were paired with them.
    """
    # Keep only the (stripped line, tag) pairs whose line has visible content.
    kept = [(text.strip(), label) for text, label in zip(lines, tags) if text.strip()]
    newlines = [text for text, _ in kept]
    newtags = [label for _, label in kept]
    return newlines, newtags

## Normal Evaluation

In [3]:
# Load the three annotators' workbooks. sheet_name=None loads every sheet into
# a dict keyed by sheet name; the "M" column is read but not used in this cell.
print("reading annotator 1 file...")
annotator_1_df_dict = pd.read_excel("/workspace/mica-text-robust-script-parser/data/Annotator_1.xlsx", sheet_name=None, header=1, usecols=["line","S","N","C","D","E","T","M"])

print("reading annotator 2 file...")
annotator_2_df_dict = pd.read_excel("/workspace/mica-text-robust-script-parser/data/Annotator_2.xlsx", sheet_name=None, header=1, usecols=["line","S","N","C","D","E","T","M"])

print("reading annotator 3 file...")
annotator_3_df_dict = pd.read_excel("/workspace/mica-text-robust-script-parser/data/Annotator_3.xlsx", sheet_name=None, header=1, usecols=["line","S","N","C","D","E","T","M"])

# Each entry is later split into four whitespace-separated fields
# (movie, <ignored>, start, end). Read inside a context manager so the file
# handle is closed instead of being left to the garbage collector.
with open("/workspace/mica-text-robust-script-parser/data/SAIL_annotation_screenplays/line_indices.txt") as line_indices_file:
    line_numbers = line_indices_file.read().splitlines()

# The first sheet of the workbook is skipped; the remaining sheet names are the movies.
movies = list(annotator_1_df_dict.keys())[1:]
ann = {}

for movie in movies:
    ann[movie] = {}

    for i, df in enumerate([annotator_1_df_dict[movie], annotator_2_df_dict[movie], annotator_3_df_dict[movie]]):
        anns = []

        for _, row in df.iterrows():
            text = row["line"]

            # Non-string cells (e.g. NaN) and whitespace-only lines get the "O" tag.
            if not isinstance(text, str) or text.strip() == "":
                anns.append("O")
                continue

            # A tag column "matches" when its cell repeats the whole line.
            matched = [
                tag
                for tag in ["S","N","C","D","E","T"]
                if isinstance(row[tag], str) and text.strip() == row[tag].strip()
            ]

            # Only an unambiguous annotation (exactly one matching column) is
            # accepted; zero or several matches fall back to "O".
            anns.append(matched[0] if len(matched) == 1 else "O")

        ann[movie]["ann{}".format(i + 1)] = anns

# Majority vote over the three annotators; a three-way disagreement yields "O".
for movie in ann.keys():
    maj = []
    for a1, a2, a3 in zip(ann[movie]["ann1"], ann[movie]["ann2"], ann[movie]["ann3"]):
        if a1 == a2 or a1 == a3:
            maj.append(a1)
        elif a2 == a3:
            maj.append(a2)
        else:
            maj.append("O")
    ann[movie]["maj"] = maj

reading annotator 1 file...
reading annotator 2 file...
reading annotator 3 file...


In [4]:
# Number of extra context lines handed to the parser on each side of the
# annotated excerpt: 0, 10, ..., 100.
deltas = 10*np.arange(11)

S_perf = []
N_perf = []
C_perf = []
D_perf = []
# F1 is only recorded for these four tags; "E" and "T" are scored but not stored.
perf_by_tag = {"S": S_perf, "N": N_perf, "C": C_perf, "D": D_perf}

# Read every screenplay once up front: the file contents do not depend on
# delta, so re-reading them inside the delta loop was redundant I/O.
excerpts = []
for line in line_numbers:
    movie, _, start, end = line.split()
    with open("/workspace/mica-text-robust-script-parser/data/SAIL_annotation_screenplays/screenplays/{}.txt".format(movie)) as script_file:
        script = script_file.read().splitlines()
    excerpts.append((movie, script, int(start), int(end)))

for delta in deltas:
    for movie, script, start, end in excerpts:
        # Parse the excerpt widened by up to `delta` lines on each side ...
        script_to_parse = script[max(0, start - delta) : min(len(script), end + delta)]
        tags = parse_lines(script_to_parse)
        # ... then trim the context back off so the tags align with lines [start, end).
        ann[movie]["sys"] = tags[min(start, delta) : len(tags)-min(len(script) - end, delta)]

    for tag in ["S","N","C","D","E","T"]:
        tp, fp, fn = 0, 0, 0
        for movie in ann.keys():
            pairs = list(zip(ann[movie]["maj"], ann[movie]["sys"]))
            tp += sum(x == y == tag for x, y in pairs)
            fp += sum(x != y == tag for x, y in pairs)
            fn += sum(tag == x != y for x, y in pairs)

        # Guard the ratios: a tag that is never predicted (or never present)
        # would otherwise abort the whole sweep with ZeroDivisionError.
        p = tp/(tp + fp) if (tp + fp) > 0 else 0.0
        r = tp/(tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2*p*r/(p + r) if (p + r) > 0 else 0.0

        if tag in perf_by_tag:
            perf_by_tag[tag].append(f1)

print("delta = " + " ".join("{:4d}".format(delta) for delta in deltas))
print("S f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in S_perf))
print("N f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in N_perf))
print("C f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in C_perf))
print("D f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in D_perf))

delta =    0   10   20   30   40   50   60   70   80   90  100
S f1  = 97.8 97.8 97.8 97.8 97.8 97.8 97.8 97.8 97.8 97.8 97.8
N f1  = 89.2 89.7 89.6 89.6 89.6 89.6 89.6 89.6 89.9 89.8 89.8
C f1  = 94.6 95.1 95.7 96.6 96.7 96.8 96.8 96.8 97.4 97.8 97.8
D f1  = 90.3 91.1 91.6 92.2 92.4 92.5 92.5 92.5 93.4 93.5 93.5


## Robust Evaluation

In [32]:
# Number of extra context lines handed to the parser on each side of the
# annotated excerpt: 0, 10, ..., 100.
deltas = 10*np.arange(11)

S_perf = []
N_perf = []
C_perf = []
D_perf = []
# F1 is only recorded for these four tags; "E" and "T" are scored but not stored.
perf_by_tag = {"S": S_perf, "N": N_perf, "C": C_perf, "D": D_perf}

scene_keywords = ["INT", "EXT"]
transition_keywords = ["CUT", "FADE"]
with open("/workspace/mica-text-robust-script-parser/data/names.txt") as names_file:
    names = names_file.read().splitlines()
# Names that contain a scene / transition keyword as a case-insensitive substring.
scene_keyword_names = [name for name in names if any(keyword.strip().lower() in name.strip().lower() for keyword in scene_keywords)]
transition_keyword_names = [name for name in names if any(keyword.strip().lower() in name.strip().lower() for keyword in transition_keywords)]
prob_replace_name = 0.2
prob_scene_keyword_name = 0.5

# The probabilities below are defined here but not used by this cell.
prob_remove_scene_keywords = 0.5
prob_lowercase_scene_line = 0.5
prob_lowercase_character_line = 0.5

prob_create_watermark_lines = 0.5
prob_insert_numbers_or_asteriks = 0.5
prob_insert_dialogue_expressions = 0.5

# Read every screenplay once up front: the file contents do not depend on
# delta, so re-reading them inside the delta loop was redundant I/O.
raw_excerpts = []
for entry in line_numbers:
    movie, _, start, end = entry.split()
    with open("/workspace/mica-text-robust-script-parser/data/SAIL_annotation_screenplays/screenplays/{}.txt".format(movie)) as script_file:
        raw_excerpts.append((movie, script_file.read().splitlines(), int(start), int(end)))

# NOTE: the name replacement below draws from the global `random` state and no
# seed is set in this cell, so the reported numbers change from run to run.
for delta in deltas:
    for movie, raw_script, raw_start, raw_end in raw_excerpts:

        #####################################################################
        #### remove blank lines and indents
        #####################################################################

        newscript, newtags = [], []
        newstart, newend = 0, 0
        for i, raw_line in enumerate(raw_script):
            # Two independent checks (not if/elif) so that an empty excerpt
            # (start == end) still gets both boundaries.
            if i == raw_start:
                newstart = len(newscript)
            if i == raw_end:
                newend = len(newscript)

            text = raw_line.strip()
            if len(text) == 0:
                continue

            newscript.append(text)
            if raw_start <= i < raw_end:
                newtags.append(ann[movie]["maj"][i - raw_start])

        # A boundary at or beyond the last line is never visited by the loop
        # above; without this, newend stayed 0 for excerpts that run to the
        # end of the file and the system tags were sliced incorrectly.
        if raw_start >= len(raw_script):
            newstart = len(newscript)
        if raw_end >= len(raw_script):
            newend = len(newscript)

        #####################################################################
        #### include INT, EXT, CUT, FADE in character names
        #####################################################################

        for i in range(newstart, newend):
            # Only single-word, purely alphabetic character lines are candidates.
            if newtags[i - newstart] == "C" and re.match("^[a-zA-Z]+$", newscript[i]):
                replace_draw = random.random()
                if replace_draw < prob_replace_name:
                    kind_draw = random.random()
                    if kind_draw < prob_scene_keyword_name:
                        keyword_names = scene_keyword_names
                    else:
                        keyword_names = transition_keyword_names
                    # random.choice raises IndexError on an empty list; leave
                    # the line untouched when no candidate name exists.
                    if keyword_names:
                        name = random.choice(keyword_names)
                        newscript[i] = name.upper()

        #####################################################################
        #### assign newscript, newtags, newstart and newend
        #####################################################################

        ann[movie]["robust_maj"] = newtags
        script = newscript
        start = newstart
        end = newend

        # Parse the excerpt widened by up to `delta` lines on each side, then
        # trim the context back off so the tags align with lines [start, end).
        script_to_parse = script[max(0, start - delta) : min(len(script), end + delta)]
        tags = parse_lines(script_to_parse)
        ann[movie]["robust_sys"] = tags[min(start, delta) : len(tags)-min(len(script) - end, delta)]

    for tag in ["S","N","C","D","E","T"]:
        tp, fp, fn = 0, 0, 0
        for movie in ann.keys():
            pairs = list(zip(ann[movie]["robust_maj"], ann[movie]["robust_sys"]))
            tp += sum(x == y == tag for x, y in pairs)
            fp += sum(x != y == tag for x, y in pairs)
            fn += sum(tag == x != y for x, y in pairs)

        # Guard the ratios: a tag that is never predicted (or never present)
        # would otherwise abort the whole sweep with ZeroDivisionError.
        p = tp/(tp + fp) if (tp + fp) > 0 else 0.0
        r = tp/(tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2*p*r/(p + r) if (p + r) > 0 else 0.0

        if tag in perf_by_tag:
            perf_by_tag[tag].append(f1)

print("delta = " + " ".join("{:4d}".format(delta) for delta in deltas))
print("S f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in S_perf))
print("N f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in N_perf))
print("C f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in C_perf))
print("D f1  = " + " ".join("{:4.1f}".format(100*f1) for f1 in D_perf))

delta =    0   10   20   30   40   50   60   70   80   90  100
S f1  = 20.5 14.6 12.6  6.2  3.4  5.1  2.3  1.2  1.7  1.7  0.6
N f1  = 11.1  8.3  7.1  3.1  1.7  2.7  1.0  0.6  1.4  1.4  0.4
C f1  = 86.2 86.8 86.5 87.1 86.9 87.1 87.7 88.2 88.3 88.4 88.8
D f1  = 64.2 64.1 64.2 64.0 63.6 64.0 64.5 65.1 64.6 64.8 65.3


In [14]:
# Show a lower-cased copy of the most recently built `newscript`
# (`newscript` itself is left unchanged).
list(map(str.lower, newscript))

['"x-men" -- early draft by ed solomon, chris mcquarrie, tom desanto & bryan singer',
 'x-men',
 'by',
 'ed solomon',
 'revisions:',
 'chris mcquarrie',
 'tom desanto',
 'bryan singer',
 'february 24, 1999',
 'black',
 'sounds of a train rolling to a halt, a shrill whistle.',
 'ext. camp - day',
 'up on the door of a weathered cattle car as a german',
 'soldier steps into frame wearing that familiar gray of',
 'the all-too familiar era.',
 'he throws the door to reveal a mass of huddled and',
 'frightened people inside.',
 'the words are not necessary.  the language is not ours',
 'and the images say enough.',
 'men, women and children are herded off the train like',
 'cattle toward a large open yard.  there they huddle until',
 'the germans begin to shout and shove through the mob.',
 'ext. fence corridor - day',
 'we are looking up at rows and rows of fences topped with',
 'barbed wire all designed to create a separator for the',
 'thousands of jew who pour through each day.',
 'then

In [10]:
# For each movie, print the majority-vote tag next to the parser's tag,
# one pair per line, followed by a blank separator line.
for movie, movie_ann in ann.items():
    print(movie)
    for gold_tag, system_tag in zip(movie_ann["robust_maj"], movie_ann["robust_sys"]):
        print("\t", gold_tag, system_tag)
    print()

44_inch_chest
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D C
	 N D
	 N D
	 N C
	 C C
	 D D
	 D D
	 D E
	 D D
	 E D
	 E C
	 O E
	 O D
	 O C
	 C C
	 D E
	 E E
	 D D
	 D D
	 T D
	 S D
	 N D
	 N D
	 C C
	 D D
	 D E
	 D E
	 D D
	 D D
	 D D
	 D E
	 D E
	 D D
	 D E
	 D E
	 D D
	 D D
	 D E
	 D D
	 D D
	 D E
	 D E
	 D E
	 D E
	 D D
	 D E
	 D E
	 E D
	 D D
	 D D
	 D D
	 D E
	 D E
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D D
	 D E
	 D E
	 D E
	 D D
	 D D
	 D E
	 D D
	 D E
	 D D
	 O E
	 O D
	 O C
	 C C
	 D E
	 D E
	 D E
	 D D
	 D E
	 E E
	 E E
	 E E
	 E E
	 D E
	 E E
	 E E
	 D E
	 E E
	 E E
	 E E
	 D D
	 D E
	 D E
	 D D
	 D E
	 E E
	 E E
	 D E
	 D E
	 D E
	 D E
	 D D
	 T D
	 O D
	 S D
	 N D
	 C C
	 D D
	 D C
	 N D
	 N D
	 N D
	 N D
	 N D
	 N D
	 N C
	 C C
	 E E
	 D D
	 N D
	 N C
	 C C
	 E E
	 E E
	 D D
	 C C
	 E E
	 E E
	 D D
	 N D
	 N D
	 N D
	 N C
	 C C
	 E C
	 E D
	 D D
	 C C
	 D D
	 T D
	 S D
	 N D
	 N D
	 O D
	 O C
	 C C
	 

## Error types

In [1]:
# Load the "Bad Dog" script and its per-line tag file; context managers make
# sure both handles are closed. The two line counts are printed for comparison.
with open("/workspace/mica-text-robust-script-parser/data/SAIL Team Spellcheck/Lionsgate/Scripts_Txt/Bad Dog.txt") as script_file:
    baddogs_script = script_file.read().splitlines()
with open("/workspace/mica-text-robust-script-parser/data/SAIL Team Spellcheck/Lionsgate/Parsed/Bad Dog_tags.txt") as tags_file:
    baddogs_tags = tags_file.read().splitlines()
print(len(baddogs_script), len(baddogs_tags))

6087 6087


In [4]:
# Count the speaking turns of one character and the lines that follow them.
# The printed label is derived from the same name used for matching so the
# two cannot drift apart (the label previously read "BUM" while the filter
# matched "bub").
target_character = "bub"

n = 0  # number of "C" lines naming the target character
m = 0  # number of "D" or untagged lines directly following those "C" lines
i = 0

while i < len(baddogs_script):
    tag, line = baddogs_tags[i], baddogs_script[i]
    if tag == "C" and line.strip().lower() == target_character:
        n += 1
        j = i + 1
        # Consume the run of dialogue ("D") or untagged ("") lines after the name.
        while j < len(baddogs_script) and baddogs_tags[j] in ["D",""]:
            m += 1
            j += 1
        # Resume scanning at the first line that ended the run.
        i = j
    else:
        i += 1

print("{}: {} turns, {} lines".format(target_character.upper(), n, m))

BUM: 192 turns, 285 lines


In [11]:
# For every script in each studio folder, write "<movie>_parsed_tags.txt" in
# the folder's "Parsed" directory, where each script line is prefixed with
# "<tag>: " taken from the matching "<movie>_tags.txt" file.
spellcheck_root = "/workspace/mica-text-robust-script-parser/data/SAIL Team Spellcheck"
folders = ["LEGO TITAN", "Lionsgate", "NBC Universal"]

for folder in folders:
    scripts_dir = os.path.join(spellcheck_root, folder, "Scripts_Txt")
    parsed_dir = os.path.join(spellcheck_root, folder, "Parsed")

    files = os.listdir(scripts_dir)
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    movies = list(set([re.sub(r"\.txt$", "", file) for file in files if file.endswith(".txt")]))

    for movie in movies:
        # keepends=True keeps each line's terminator so writelines() below
        # reproduces the original line breaks.
        with open(os.path.join(scripts_dir, movie + ".txt")) as script_file:
            script = script_file.read().splitlines(keepends=True)
        with open(os.path.join(parsed_dir, movie + "_tags.txt")) as tags_file:
            tags = tags_file.read().splitlines()

        # zip stops at the shorter of the two inputs.
        lines = [tag + ": " + line for tag, line in zip(tags, script)]

        # The context manager flushes and closes the output file deterministically.
        with open(os.path.join(parsed_dir, movie + "_parsed_tags.txt"), "w") as parsed_file:
            parsed_file.writelines(lines)

In [20]:
# Print the TIGERBELLES script line by line; the context manager closes the
# file handle once printing is done.
with open("/workspace/mica-text-robust-script-parser/data/SAIL Team Spellcheck/Lionsgate/Scripts_Txt/TIGERBELLES.txt") as tigerbelles_file:
    for line in tigerbelles_file.read().splitlines():
        print(line)

  Tigerbelles




   Written by
Natalie Chaidez
      and
  Kwynn Perry


 Revisions by
  Tina Mabry




                  October 9, 2020

                         ii.




Based on a true story.

                                                               1.

FADE IN:
EXT. WILMA’S YARD - RURAL TENNESSEE - DAY (1947)
YOUNG WILMA RUDOLPH (aka “Skeeter”), 7, and her older sister
YOUNG CHARLENE, 12, playfully collect laundry hanging on a
line. In the background is the Rudolph family home... large,
but slightly dilapidated wooden house/farm in the midst of
the swath of green land.
                     YOUNG WILMA
           Betcha can’t beat me back!
Charlene waves her off and continues gathering clothes.
                     YOUNG CHARLENE
           You got them little legs.
                     YOUNG WILMA
           I’m still faster than you.
A shirt that Wilma’s collecting skims the ground.
                     YOUNG CHARLENE
           Skeeter, you better pick them clean
         

In [36]:
# Replace a leading "INT"/"EXT" (with optional trailing dot) by a space and
# trim. Raw string: "\." in a normal string literal is an invalid escape sequence.
re.sub(r"(IN|EX)T\.?", " ", "INT. BEDROOM").strip()

'BEDROOM'

In [44]:
# The 26 upper-case ASCII letters, "A" through "Z".
letters = list(map(chr, range(ord("A"), ord("Z") + 1)))
# A random string of one to three of those letters.
"".join(random.choice(letters) for _ in range(random.choice([1,2,3])))

'HAM'

In [51]:
# Collect the stripped text of every line whose majority-vote tag is "E".
expressions = []

for entry in line_numbers:
    movie, _, start, end = entry.split()
    start, end = int(start), int(end)
    # Context manager so the screenplay handle is closed after reading.
    with open("/workspace/mica-text-robust-script-parser/data/SAIL_annotation_screenplays/screenplays/{}.txt".format(movie)) as script_file:
        script = script_file.read().splitlines()[start: end]

    # The majority tags are paired with the annotated excerpt [start, end).
    for tag, script_line in zip(ann[movie]["maj"], script):
        if tag == "E":
            expressions.append(script_line.strip())

In [53]:
# The 50 most frequent expression strings with their counts, most frequent first.
Counter(expressions).most_common(50)

[('(beat)', 9),
 ('(pause)', 7),
 ('(after a beat)', 6),
 ('(a beat)', 6),
 ('(smiling)', 5),
 ('(on radio)', 4),
 ('(continuing)', 4),
 ('voice)', 3),
 ('(quietly)', 3),
 ('(shouting)', 3),
 ('(yelling)', 3),
 ('(grinning)', 2),
 ('(BEAT)', 2),
 ('LOOK)', 2),
 ('(screaming)', 2),
 ('(MORE)', 2),
 ('(anxious)', 2),
 ('(to the Driver)', 2),
 ('(interrupting)', 2),
 ('(continues)', 2),
 ('(to Chuck)', 2),
 ('(picking popcorn out of his hair)', 2),
 ('(to Carl)', 2),
 ('(LOV ERB OY do es so ... CO LI N g ra bs a', 1),
 ('chair... sits facing LOVERBOY)...', 1),
 ('stares hard at terrified LOVERBOY ) ...', 1),
 ('extracts an old document. holds it up for', 1),
 ('returns it to the drawer. Now his eye', 1),
 ('catches something inside and slowly he', 1),
 ('reaches in ... gently extracts a fancy,', 1),
 ('pearl handled, gleaming, 12" knife.)', 1),
 ('confronts us. Holding it staring at us', 1),
 ('darkly. Murderously ... Eventually...)', 1),
 ('the knife back into the drawer. Finds', 1),
 ('

In [55]:
# Print 0.0, 0.01, ..., 0.99, one value per line.
for i in range(0, 100, 1):
    print(i / 100)

0.0
0.01
0.02
0.03
0.04
0.05
0.06
0.07
0.08
0.09
0.1
0.11
0.12
0.13
0.14
0.15
0.16
0.17
0.18
0.19
0.2
0.21
0.22
0.23
0.24
0.25
0.26
0.27
0.28
0.29
0.3
0.31
0.32
0.33
0.34
0.35
0.36
0.37
0.38
0.39
0.4
0.41
0.42
0.43
0.44
0.45
0.46
0.47
0.48
0.49
0.5
0.51
0.52
0.53
0.54
0.55
0.56
0.57
0.58
0.59
0.6
0.61
0.62
0.63
0.64
0.65
0.66
0.67
0.68
0.69
0.7
0.71
0.72
0.73
0.74
0.75
0.76
0.77
0.78
0.79
0.8
0.81
0.82
0.83
0.84
0.85
0.86
0.87
0.88
0.89
0.9
0.91
0.92
0.93
0.94
0.95
0.96
0.97
0.98
0.99


## Robust Evaluation Figures

In [3]:
# Load the F1 results table. Later cells read the tag columns S, N, C, D, E, T
# plus "error" (error-type name) and "prob" from it.
f1df = pd.read_csv("/workspace/mica-text-robust-script-parser/f1.csv", index_col=None)

In [4]:
# Display the loaded results frame.
f1df

Unnamed: 0,S,N,C,D,E,T,error,prob
0,0.977645,0.897653,0.978261,0.935190,0.862483,0.893617,original,
1,0.005831,0.003899,0.928845,0.663006,0.810748,0.028986,contiguous,
2,0.924142,0.924142,0.924142,0.924142,0.924142,0.924142,replace_name_with_scene_kw,0.01
3,0.406891,0.406891,0.406891,0.406891,0.406891,0.406891,replace_name_with_scene_kw_contiguous,0.01
4,0.924279,0.924279,0.924279,0.924279,0.924279,0.924279,replace_name_with_scene_kw,0.05
...,...,...,...,...,...,...,...,...
189,0.430127,0.430127,0.430127,0.430127,0.430127,0.430127,insert_dialogue_expressions_contiguous,0.70
190,0.941472,0.941472,0.941472,0.941472,0.941472,0.941472,insert_dialogue_expressions,0.80
191,0.430127,0.430127,0.430127,0.430127,0.430127,0.430127,insert_dialogue_expressions_contiguous,0.80
192,0.941472,0.941472,0.941472,0.941472,0.941472,0.941472,insert_dialogue_expressions,0.90


In [22]:
# List the column labels of the results frame.
f1df.columns

Index(['S', 'N', 'C', 'D', 'E', 'T', 'error', 'prob'], dtype='object')

In [23]:
# For each error type, print its name, the shape of its rows and the "prob"
# values recorded for it.
# NOTE(review): presumably this must run before f1df.index is set to the
# "error" column, since "error" would then be both index name and column;
# confirm against the pandas groupby behavior.
for error_name, error_rows in f1df.groupby("error"):
    print(error_name, error_rows.shape, error_rows.prob.values)

contiguous (1, 8) [nan]
create_watermark_line (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
create_watermark_line_contiguous (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
insert_asterisks_or_numbers (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
insert_asterisks_or_numbers_contiguous (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
insert_dialogue_expressions (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
insert_dialogue_expressions_contiguous (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
lowercase_character_line (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
lowercase_character_line_contiguous (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
lowercase_scene_line (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9 ]
lowercase_scene_line_contiguous (12, 8) [0.01 0.05 0.1  0.15 0.2  0.3  0.4 

In [24]:
# Use the error-type name as the row index so that .loc[<error name>] selects
# every row of that error type. This mutates f1df in place and keeps the
# "error" column as well.
f1df.index = f1df.error

In [25]:
# Display the frame again, now indexed by error type.
f1df

Unnamed: 0_level_0,S,N,C,D,E,T,error,prob
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
original,0.977645,0.897653,0.978261,0.935190,0.862483,0.893617,original,
contiguous,0.005831,0.003899,0.928845,0.663006,0.810748,0.028986,contiguous,
replace_name_with_scene_kw,0.924142,0.924142,0.924142,0.924142,0.924142,0.924142,replace_name_with_scene_kw,0.01
replace_name_with_scene_kw_contiguous,0.406891,0.406891,0.406891,0.406891,0.406891,0.406891,replace_name_with_scene_kw_contiguous,0.01
replace_name_with_scene_kw,0.924279,0.924279,0.924279,0.924279,0.924279,0.924279,replace_name_with_scene_kw,0.05
...,...,...,...,...,...,...,...,...
insert_dialogue_expressions_contiguous,0.430127,0.430127,0.430127,0.430127,0.430127,0.430127,insert_dialogue_expressions_contiguous,0.70
insert_dialogue_expressions,0.941472,0.941472,0.941472,0.941472,0.941472,0.941472,insert_dialogue_expressions,0.80
insert_dialogue_expressions_contiguous,0.430127,0.430127,0.430127,0.430127,0.430127,0.430127,insert_dialogue_expressions_contiguous,0.80
insert_dialogue_expressions,0.941472,0.941472,0.941472,0.941472,0.941472,0.941472,insert_dialogue_expressions,0.90


In [49]:
# One 3x3 figure per tag: panel (0, 0) compares the "original" and
# "contiguous" rows as bars, and the remaining eight panels plot F1 against
# "prob" for each error type, with and without the "_contiguous" suffix.
#
# Rows are selected with boolean masks on the "error" column rather than via
# .loc[<label>], so the cell works on the frame as loaded and does not depend
# on another cell having replaced f1df.index beforehand.
error_types = ["replace_name_with_scene_kw",
               "replace_name_with_transition_kw",
               "remove_scene_kw",
               "lowercase_scene_line",
               "lowercase_character_line",
               "create_watermark_line",
               "insert_asterisks_or_numbers",
               "insert_dialogue_expressions"
               ]

for tag in ["S","N","C","D","T"]:

    original = f1df.loc[f1df.error == "original", tag].iloc[0]
    contiguous = f1df.loc[f1df.error == "contiguous", tag].iloc[0]

    fig, axs = plt.subplots(3, 3)

    fig.set_figheight(20)
    fig.set_figwidth(20)

    for i, error_type in enumerate(error_types):

        original_arr = f1df.loc[f1df.error == error_type, [tag, "prob"]].sort_values("prob")
        contiguous_arr = f1df.loc[f1df.error == error_type + "_contiguous", [tag, "prob"]].sort_values("prob")

        # Fail loudly, as label-based lookup would, when an error type is absent.
        if original_arr.empty or contiguous_arr.empty:
            raise KeyError(error_type)

        # Panels are filled row by row starting at slot 1; slot 0 is the bar chart.
        r, c = divmod(i + 1, 3)
        ax = axs[r, c]

        ax.plot(original_arr.prob, original_arr[tag], color="b", label="original", lw=5)
        ax.plot(contiguous_arr.prob, contiguous_arr[tag], color="r", label="contiguous", lw=5)
        title = error_type.replace("_", " ").upper()
        ax.set_title(title)
        ax.set_xlim(-0.1, 1)
        ax.set_ylim(-0.1, 1)

        ax.set_xlabel("error/line")
        ax.set_ylabel("F1 score")

        ax.legend()

    ax = axs[0, 0]
    ax.bar(["original", "contiguous"], [original, contiguous], color=["b","r"])
    # Label the summary panel and give it the same y-scale as the line panels
    # so it can be read on its own and compared with them directly.
    ax.set_title("ORIGINAL VS CONTIGUOUS")
    ax.set_ylabel("F1 score")
    ax.set_ylim(-0.1, 1)

    fig.savefig("../data/{}.png".format(tag))
    plt.close("all")

In [50]:
# Show the S, C and prob columns for one error type.
# NOTE: `error_type` is not defined in this cell; it is whatever value the
# plotting loop left behind, and the label lookup needs f1df to be indexed
# by error name.
f1df.loc[error_type, ["S", "C", "prob"]]

Unnamed: 0_level_0,S,C,prob
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
insert_dialogue_expressions,0.941472,0.941472,0.01
insert_dialogue_expressions,0.941472,0.941472,0.05
insert_dialogue_expressions,0.941472,0.941472,0.1
insert_dialogue_expressions,0.941472,0.941472,0.15
insert_dialogue_expressions,0.941472,0.941472,0.2
insert_dialogue_expressions,0.941472,0.941472,0.3
insert_dialogue_expressions,0.941472,0.941472,0.4
insert_dialogue_expressions,0.941472,0.941472,0.5
insert_dialogue_expressions,0.941472,0.941472,0.6
insert_dialogue_expressions,0.941472,0.941472,0.7


In [5]:
# Add a "name" column holding a copy of the "error" column (mutates f1df in place).
f1df["name"] = f1df["error"]

In [6]:
# Select the rows for the "insert_dialogue_expressions" error at prob 0.01.
# The comparison on "prob" is an exact float equality test.
f1df.loc[(f1df.error == "insert_dialogue_expressions") & (f1df.prob == 0.01)]

Unnamed: 0,S,N,C,D,E,T,error,prob,name
170,0.941472,0.941472,0.941472,0.941472,0.941472,0.941472,insert_dialogue_expressions,0.01,insert_dialogue_expressions
