In [1]:
import pandas as pd
import altair as alt

In [2]:
df = pd.read_csv("csvs/othello_by_sentence.csv", index_col=0)

def act_scene(row):
    act = str(row["act"]).strip()
    scene = str(row["scene"]).strip()
    return act + "." + scene

df["act_scene"] = df.apply(act_scene, axis=1)

def words_in_sentence(text):
    return len(text.split())

df["words_in_text"] = df["text"].apply(words_in_sentence)

df.head()

Unnamed: 0,act,scene,sentence_number,speaker,text,act_scene,words_in_text
0,1,1,1,RODERIGO,"Tush, never tell me!",1.1,4
1,1,1,2,RODERIGO,"I take it much unkindly That thou, Iago, who h...",1.1,23
2,1,1,3,IAGO,"'Sblood, but you'll not hear me!",1.1,6
3,1,1,4,IAGO,"If ever I did dream of such a matter, Abhor me.",1.1,11
4,1,1,5,RODERIGO,Thou toldst me thou didst hold him in thy hate.,1.1,10


In [3]:
len(df["speaker"].unique())

25

In [4]:
# normalize for monologuing
# sum of words spoken / count of lines
df_uninterupted_speaker = pd.read_csv("csvs/othello_by_uninterupted_speaker.csv")

df_uninterupted_speaker["act_scene"] = df_uninterupted_speaker.apply(act_scene, axis=1)
df_uninterupted_speaker["words_in_text"] = df_uninterupted_speaker["text"].apply(words_in_sentence)

df_uninterupted_speaker.head()

Unnamed: 0.1,Unnamed: 0,act,scene,sentence_number,speaker,text,act_scene,words_in_text
0,0,1,1,1,RODERIGO,"Tush, never tell me! I take it much unkindly T...",1.1,27
1,1,1,1,2,IAGO,"'Sblood, but you'll not hear me! If ever I did...",1.1,17
2,2,1,1,3,RODERIGO,Thou toldst me thou didst hold him in thy hate.,1.1,10
3,3,1,1,4,IAGO,Despise me If I do not. Three great ones of th...,1.1,193
4,4,1,1,5,RODERIGO,"By heaven, I rather would have been his hangman.",1.1,9


In [5]:
df_uninterupted_speaker["mc"] = df_uninterupted_speaker["speaker"].isin(["OTHELLO", "IAGO", "DESDEMONA", "CASSIO"])

df_uninterupted_speaker.groupby("mc")["words_in_text"].agg(["count", "mean", "sum"]).reset_index()

Unnamed: 0,mc,count,mean,sum
0,False,359,18.069638,6487
1,True,820,23.506098,19275


In [6]:
df_uninterupted_speaker.groupby("speaker")["words_in_text"].agg(["count", "mean", "sum"]).reset_index().sort_values(by="count")

Unnamed: 0,speaker,count,mean,sum
12,HERALD,1,101.0,101
23,SECOND SENATOR,1,38.0,38
8,FIRST GENTLEMAN,1,19.0,19
21,SAILOR,2,12.0,24
10,GENTLEMEN,2,3.5,7
0,ALL,2,4.5,9
15,MESSENGER,3,27.0,81
18,OFFICER,3,9.333333,28
24,THIRD GENTLEMAN,4,28.25,113
17,MUSICIAN,5,4.6,23


In [7]:
main_characters = ["IAGO", "OTHELLO", "DESDEMONA", "CASSIO"]
df_mcs = df_uninterupted_speaker[df_uninterupted_speaker["speaker"].isin(main_characters)]

df_mcs["speaker"] = df_mcs["speaker"].str.title()

df_grouped_scene = df_mcs.groupby(["speaker", "act_scene"])["words_in_text"].agg(["count", "mean", "sum"]).reset_index()
df_grouped_scene["mean"] = df_grouped_scene["mean"].round(2)

domain = ["Othello", "Iago", "Desdemona", "Cassio"]
range_ = ["#FFA600", "#D11033", "#5E3781", "#47682C"]

selection = alt.selection_point(fields=['speaker'], bind='legend', toggle='event.shiftKey', clear=False)

count = alt.Chart(df_grouped_scene).mark_point().encode(
    x=alt.X('act_scene', title="Act, Scene"),
    y=alt.Y('count', title = "Count of Lines"),
    size = alt.Size('mean', title="Avg Line Length"),
    color=alt.Color('speaker', title="Character", scale=alt.Scale(domain=domain, range=range_)),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
    #tooltip = alt.Tooltip(["speaker", 'act_scene', 'count', 'mean'], title=["Character", "Sct, Scene", "Count", "Avg"])
    tooltip=[alt.Tooltip('speaker', title="Character"), alt.Tooltip('act_scene', title="Act, Scene"),
             alt.Tooltip('count', title="Count of Lines"), alt.Tooltip('mean', title="Avg Length of Line")]
).add_params(
    selection
).properties(
    width = 350,
    title = "Count of Lines & Average Length of Line"
)

normalized = alt.Chart(df_grouped_scene).mark_bar().encode(
    x=alt.X('act_scene', title="Act, Scene"),
    y=alt.Y('sum:Q', title="Words Spoken").stack("normalize"),
    color=alt.Color('speaker', title="Character", scale=alt.Scale(domain=domain, range=range_))
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width = 350,
    title = "% of Words Spoken in the Scene"
)

(count | normalized)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mcs["speaker"] = df_mcs["speaker"].str.title()


In [8]:
df_grouped_scene[df_grouped_scene["mean"] == max(df_grouped_scene["mean"])]

Unnamed: 0,speaker,act_scene,count,mean,sum
32,Othello,1.3,10,86.6,866


In [9]:
#df_oi = df_uninterupted_speaker[df_uninterupted_speaker["speaker"].isin(["IAGO", "OTHELLO"])]

scenes_with_i_o = (
    df_uninterupted_speaker[df_uninterupted_speaker['speaker'].isin(['OTHELLO', 'IAGO'])]
    .groupby('act_scene')['speaker']
    .nunique()
    .loc[lambda x: x == 2]  # Only scenes with both A and B
    .index
)

df_io = df_uninterupted_speaker[(df_uninterupted_speaker["act_scene"].isin(scenes_with_i_o)) & (df_uninterupted_speaker["speaker"].isin(['OTHELLO', 'IAGO']))]

df_io = df_io.groupby(["speaker", "act_scene"])["words_in_text"].agg(["count", "sum"]).reset_index().sort_values(by=["act_scene", 'speaker'])

line_counts = (
    df_io.groupby(['act_scene', 'speaker'])['sum']
    .sum()
    .unstack(fill_value=0).reset_index()
)

line_counts["percent_spoken"] = line_counts["OTHELLO"] / (line_counts["OTHELLO"] + line_counts["IAGO"])

sum(line_counts["OTHELLO"]) / (sum(line_counts["IAGO"]) + sum(line_counts["OTHELLO"]))
#print(len(line_counts))
#print(line_counts["percent_spoken"].mean())

0.4509016572972184