In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

In [6]:
analyzer = SentimentIntensityAnalyzer()

example_sentance_1 = "I hate the Moor, And it is thought abroad that ’twixt my sheets ’Has done my office."

print(analyzer.polarity_scores(example_sentance_1))



{'neg': 0.198, 'neu': 0.802, 'pos': 0.0, 'compound': -0.5719}


In [7]:
example_sentance_2 =  "She loved me for the dangers I had passed, And I loved her that she did pity them."

print(analyzer.polarity_scores(example_sentance_2))

{'neg': 0.214, 'neu': 0.476, 'pos': 0.31, 'compound': 0.5267}


In [8]:
path = "/Users/mckennaquam/Desktop/ENGL 1600/final_proj/csvs/othello_by_sentence.csv"

df = pd.read_csv(path, index_col=0)

In [9]:
len(df["speaker"].unique())

25

In [10]:
analyzer = SentimentIntensityAnalyzer()

def get_neg_sentiment(text):
    return analyzer.polarity_scores(text)["neg"]

def get_neu_sentiment(text):
    return analyzer.polarity_scores(text)["neu"]

def get_pos_sentiment(text):
    return analyzer.polarity_scores(text)["pos"]

def get_compound_sentiment(text):
    return analyzer.polarity_scores(text)["compound"]


df["neg_sentiment"] = df["text"].apply(get_neg_sentiment)
df["neu_sentiment"] = df["text"].apply(get_neu_sentiment)
df["pos_sentiment"] = df["text"].apply(get_pos_sentiment)
df["compound_sentiment"] = df["text"].apply(get_compound_sentiment)

In [11]:
def act_scene(row):
    act = str(row["act"]).strip()
    scene = str(row["scene"]).strip()
    return act + "." + scene

df["act_scene"] = df.apply(act_scene, axis=1)

df["speaker"] = df["speaker"].str.title()

In [12]:
import altair as alt

In [13]:
df_i = df[df["speaker"] == "Iago"].copy()
df_i["line_index"] = range(1, len(df_i)+1)

df_i['cumulative_neg_sentiment'] = df_i['neg_sentiment'].cumsum()
df_i['cumulative_compound_sentiment'] = df_i['compound_sentiment'].cumsum()

df_o = df[df["speaker"] == "Othello"].copy()
df_o["line_index"] = range(1, len(df_o)+1)

df_o['cumulative_neg_sentiment'] = df_o['neg_sentiment'].cumsum()
df_o['cumulative_compound_sentiment'] = df_o['compound_sentiment'].cumsum()

df_combined = pd.concat([df_i, df_o], axis=0)

domain = ["Othello", "Iago"]
range_ = ["#FFA600", "#D11033"]

df_neg = df_combined[["line_index", "cumulative_neg_sentiment", "speaker"]]

chart = alt.Chart(df_neg).mark_line().encode(
    alt.X("line_index:Q", title="Sentence Number"),
    alt.Y("cumulative_neg_sentiment:Q", title= "Cumulative Negative Sentiment"), 
    alt.Color("speaker", title="Characters", scale=alt.Scale(domain=domain, range=range_))
).properties(
    title = "Cumulative Negative Sentiment for Iago and Othello"
)

chart

In [14]:
df_o[df_o["cumulative_neg_sentiment"] == max(df_o["cumulative_neg_sentiment"])]

Unnamed: 0,act,scene,sentence_number,speaker,text,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment,act_scene,line_index,cumulative_neg_sentiment,cumulative_compound_sentiment
2630,5,2,383,Othello,"No way but this, Killing myself, to die upon a...",0.596,0.25,0.154,-0.9122,5.2,664,70.069,26.874


In [15]:
df_i[df_i["cumulative_neg_sentiment"] == max(df_i["cumulative_neg_sentiment"])]

Unnamed: 0,act,scene,sentence_number,speaker,text,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment,act_scene,line_index,cumulative_neg_sentiment,cumulative_compound_sentiment
2595,5,2,348,Iago,Demand me nothing.,0.429,0.571,0.0,-0.128,5.2,762,70.219,42.9023
2596,5,2,349,Iago,"What you know, you know.",0.0,1.0,0.0,0.0,5.2,763,70.219,42.9023
2597,5,2,350,Iago,From this time forth I never will speak word.,0.0,1.0,0.0,0.0,5.2,764,70.219,42.9023


In [16]:
domain = ["Othello", "Iago"]
range_ = ["#FFA600", "#D11033"]

#df_compound = df_combined[["line_index", "cumulative_compound_sentiment", "speaker"]]

alt.Chart(df_combined).mark_line().encode(
    alt.X("line_index", title="Sentence Number"),
    alt.Y("cumulative_compound_sentiment", title="Cumulative Compound Sentiment"), 
    alt.Color("speaker", title="Character", scale=alt.Scale(domain=domain, range=range_)),
).properties(
    title = "Cumulative Sentiment Scores for Iago and Othello"
)

In [17]:
def words_in_sentence(text):
    return len(text.split())

def act_to_word(i):
    return "Act " + str(i)

df["act_word"] = df["act"].apply(act_to_word)

def tooltip_text(row):
    return row["speaker"] + " " +row["act_scene"] + ": " + row["text"]


df["words_in_text"] = df["text"].apply(words_in_sentence)
df["tooltip_text"] = df.apply(tooltip_text, axis=1)

selection = alt.selection_point(fields=['act_word'], bind='legend')

domain = ["Act 1", "Act 2", "Act 3", "Act 4", "Act 5"]
range_ = ["#FFA600", "#D11033", "#5E3781", "#47682C", "#077187"]

df_filtered = df[["compound_sentiment", "words_in_text", "tooltip_text", "act_word"]]

points = alt.Chart(df_filtered).mark_point().encode(
    x=alt.X("compound_sentiment", title="Compund Sentiment Score"),
    y=alt.Y("words_in_text", title="Length of Sentence"),
    tooltip=alt.Tooltip("tooltip_text"),
    color=alt.Color("act_word:N", title="Act", scale=alt.Scale(domain=domain, range=range_)),
    opacity=alt.when(selection).then(alt.value(1)).otherwise(alt.value(0))
).properties(
    width=900,
    height=500,
    title = "Compound Sentiment Score for all Sentences in Othello"
).add_params(
    selection
)

points

In [18]:
df.groupby("act")["words_in_text"].agg(["min", "mean", "max"])

Unnamed: 0_level_0,min,mean,max
act,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,12.852459,106
2,1,11.414079,93
3,1,10.056122,71
4,1,8.439523,62
5,1,7.110307,64


In [19]:
df["moor_boolean"] = df['text'].str.contains('moor', case=False)

df_moor = df[df["moor_boolean"]]

df_moor["pos_compund"] = df_moor["compound_sentiment"] >= 0

speaker_order = df_moor['speaker'].value_counts().index.tolist()

click = alt.selection_multi(encodings=['x'])

domain = [False, True]
range_ = ["#077187", "#FFA600",]

df_filtered = df_moor[["speaker", "pos_compund", "act_scene", "compound_sentiment", "tooltip_text"]]

speaker_hist = alt.Chart(df_filtered).mark_bar().encode(
        alt.X("speaker", sort=speaker_order, title="Characters"),
        alt.Y("count()", title="Count of Sentences"),
        color= alt.condition(click, alt.Color('pos_compund:N', scale=alt.Scale(domain=domain, range=range_)), alt.value('lightgray'))
    ).add_params(
        click
    ).properties(
        width=200,
        title = "Count of Sentences"
    )

sentiment_points = alt.Chart(df_filtered).mark_point().encode(
    x=alt.X("act_scene", title="Act, Scene"),
    y=alt.Y('compound_sentiment:Q', title="Compound Sentiment").scale(domain=(-1, 1)),
    tooltip=alt.Tooltip("tooltip_text:N"),
    color=alt.Color("pos_compund", title="Pos Compound", scale=alt.Scale(domain=domain, range=range_))
).transform_filter(
    click
).properties(
    width = 200,
    title= "Act, Scene vs Compound Sentiment"
)


moor_graph = (sentiment_points | speaker_hist).properties(title="Analysis of Sentences Including 'Moor'")

chart.save('moor_chart_sentiment_speaker.json')

#speaker_hist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moor["pos_compund"] = df_moor["compound_sentiment"] >= 0
Deprecated since `altair=5.0.0`. Use selection_point instead.
  click = alt.selection_multi(encodings=['x'])


In [20]:
# NLTK's list of english stopwords
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", 
              "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", 
              "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", 
              "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", 
              "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", 
              "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", 
              "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
              "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]


df_words = df[["speaker", "text"]]

# Step 1: Tokenize the text into words
df_words['words'] = df_words['text'].str.lower().str.split()

# Step 2: Explode so each word is in a separate row
df_exploded = df_words.explode('words')

df_exploded['words'] = df_exploded['words'].str.replace(r'[^\w\s]', '', regex=True)

df_exploded = df_exploded[~df_exploded["words"].isin(stop_words)]

# Step 3: Drop duplicate (speaker, word) pairs
df_unique = df_exploded[['speaker', 'words']].drop_duplicates()

# Step 4: Group by word and count unique speakers
word_speaker_counts = df_unique.groupby('words')['speaker'].nunique().reset_index(name="count")

# Step 5: Sort by the number of unique speakers
word_speaker_counts = word_speaker_counts.sort_values("count", ascending=False)

words_10_char = word_speaker_counts[word_speaker_counts["count"] == 10]["words"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_words['words'] = df_words['text'].str.lower().str.split()


In [21]:
df_10_char = pd.DataFrame(columns=["word", "count"])

for word in words_10_char:
    temp_df = df[df['text'].str.contains(word, case=False)]
    df_10_char.loc[len(df_10_char)] = [word, len(temp_df)]


In [22]:
def color_for_chart(text):
    if text == "moor":
        return 2
    elif text == "cassio":
        return 1
    else:
        return 0

df_10_char["for_color"] = df_10_char["word"].apply(color_for_chart)
df_10_char["word"] = df_10_char["word"].str.title()

domain = [0, 1, 2]
range_ = ["#077187", "#47682C", "#FFA600"]

alt.Chart(df_10_char).mark_bar().encode(
    x = alt.X("word", sort="-y", title="Word"),
    y = alt.Y("count", title="Count of Lines Containing"),
    color=alt.Color("for_color:N", scale=alt.Scale(domain=domain, range=range_), legend=None)
).properties(
    title="Words Spoken by 10 Unique Characters"
)

In [23]:
word_speaker_counts[word_speaker_counts["words"] == "othello"]

Unnamed: 0,words,count
2222,othello,11


In [24]:
df[df['text'].str.contains('othello', case=False)]["speaker"].unique()

array(['Duke', 'First Senator', 'Desdemona', 'Iago', 'Third Gentleman',
       'Montano', 'Cassio', 'Herald', 'Othello', 'Roderigo', 'Gratiano',
       'Lodovico'], dtype=object)

In [25]:
df[df['text'].str.contains('moor', case=False)]["speaker"].unique()

array(['Iago', 'Roderigo', 'Brabantio', 'First Senator', 'Desdemona',
       'Third Gentleman', 'Cassio', 'Montano', 'Emilia', 'Lodovico'],
      dtype=object)

In [26]:
othello_othello =  df[(df['text'].str.contains('othello', case=False)) & (df["speaker"] == "Othello")]["text"]


for text in othello_othello:
    print(text)

Othello's occupation's gone!
I took you for that cunning whore of Venice That married with Othello.--You, mistress,
Man but a rush against Othello's breast, And he retires.
Where should Othello go?
That's he that was Othello.
