<a href="https://colab.research.google.com/github/minako-m/datasci112_final_project/blob/main/112_final_project_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Science 112 Final Project: Data Analysis**
# Amira and Sophia

This project explores the Cornell Movie Dialog Corpus (https://convokit.cornell.edu/documentation/movie.html).

Research questions:
1. Has movie dialogue sentiment changed over time?
2. Has the sentiment of movie dialogue spoken by men versus women changed over time?
3. Has the sentiment of movie dialogue spoken by men to men, by men to women, by women to men, and by women to women changed over time?

In this file we perform analysis of dialogues pre-processed using Tf-IDF. We find statistically significant trends in the data. This supports out hypothesis that performing analysis on raw dialogues was did not yield significant results due to raw data containing too much neutral words, such as stop words, that made the sentiments very neutral.

In [8]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
nltk.download('vader_lexicon')

movie_df = pd.read_csv('movie_dialogues.csv')
movie_df = movie_df[movie_df['text'].isna() == False]

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
sid = SentimentIntensityAnalyzer()

In [10]:
def get_sentiment_score(text):
    return sid.polarity_scores(text)['compound']

In [11]:
max_features = 1000
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
all_utterances = ' '.join(movie_df['text'])

tfidf_vectorizer.fit([all_utterances])

sentiment_scores = []
for utterance in movie_df['text']:
    # Transform utterance to TF-IDF features
    tfidf_features = tfidf_vectorizer.transform([utterance])
    # Get the non-zero TF-IDF terms
    non_zero_indices = tfidf_features.nonzero()[1]
    non_zero_terms = set([tfidf_vectorizer.get_feature_names_out()[idx] for idx in non_zero_indices])
    # Filter out terms with zero TF-IDF scores
    filtered_utterance = ' '.join([word for word in utterance.split() if word in non_zero_terms])
    # Calculate sentiment score for the filtered utterance
    sentiment_score = get_sentiment_score(filtered_utterance)
    sentiment_scores.append(sentiment_score)

movie_df['SentimentScore'] = sentiment_scores


In [12]:
og_movie_df = movie_df
og_movie_df['SentimentScore'] = sentiment_scores

In [13]:
og_movie_df

Unnamed: 0.1,Unnamed: 0,text,utt id_x,reply_to id,speaker id,movie_name,gender_x,release year,rating,genre,utt id_y,gender_y,decade,SentimentScore
0,0,They do not!,L1045,L1044,u0,10 things i hate about you,f,1999,6.9,"['comedy', 'romance']",L1044,m,1990-1999,0.0000
1,1,They do to!,L1044,,u2,10 things i hate about you,m,1999,6.9,"['comedy', 'romance']",,,1990-1999,0.0000
2,2,I hope so.,L985,L984,u0,10 things i hate about you,f,1999,6.9,"['comedy', 'romance']",L984,m,1990-1999,0.4404
3,3,She okay?,L984,,u2,10 things i hate about you,m,1999,6.9,"['comedy', 'romance']",,,1990-1999,0.0000
4,4,Let's go.,L925,L924,u0,10 things i hate about you,f,1999,6.9,"['comedy', 'romance']",L924,m,1990-1999,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304708,304708,Lord Chelmsford seems to want me to stay back ...,L666371,L666370,u9030,zulu dawn,?,1979,6.4,"['action', 'adventure', 'drama', 'history', 'w...",L666370,?,1970-1979,0.0772
304709,304709,I'm to take the Sikali with the main column to...,L666370,L666369,u9034,zulu dawn,?,1979,6.4,"['action', 'adventure', 'drama', 'history', 'w...",L666369,?,1970-1979,0.0000
304710,304710,"Your orders, Mr Vereker?",L666369,,u9030,zulu dawn,?,1979,6.4,"['action', 'adventure', 'drama', 'history', 'w...",,,1970-1979,0.0000
304711,304711,"Good ones, yes, Mr Vereker. Gentlemen who can ...",L666257,L666256,u9030,zulu dawn,?,1979,6.4,"['action', 'adventure', 'drama', 'history', 'w...",L666256,?,1970-1979,-0.3400


In [14]:
movie_df = og_movie_df[og_movie_df["SentimentScore"].abs() >= 0.01]

In [15]:
import plotly.express as px

avg_sentiment_by_year = movie_df.groupby('decade')['SentimentScore'].mean().reset_index()

px.box(movie_df.sort_values(by="decade"), x='decade', y='SentimentScore',
       title='Sentiment Scores Over Time')

In [16]:
movie_df[["release year", "SentimentScore"]].corr()

Unnamed: 0,release year,SentimentScore
release year,1.0,-0.062069
SentimentScore,-0.062069,1.0


In [17]:
movie_df = movie_df[movie_df["gender_x"] != '?']

male_utterances = movie_df[(movie_df["gender_x"] == "m")]
female_utterances = movie_df[(movie_df["gender_x"] == "f")]

avg_sentiment_by_year_male = male_utterances.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sentiment_by_year_male["gender"] = "male"
avg_sentiment_by_year_female = female_utterances.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sentiment_by_year_female["gender"] = "female"

df = pd.concat([avg_sentiment_by_year_male, avg_sentiment_by_year_female], ignore_index=True)

px.line(df, x='decade', y='SentimentScore', color='gender',
              title='Average Sentiment Scores of Utterances by Males and Females')

In [18]:
from scipy import stats

decades = movie_df['decade'].unique()
for decade in decades:
    print(decade)
    male_utterances_1 = male_utterances[(male_utterances['decade'] == decade)]["SentimentScore"]
    female_utterances_1 = female_utterances[(female_utterances['decade'] == decade)]["SentimentScore"]
    t_statistic, p_value = stats.ttest_ind(male_utterances_1, female_utterances_1)
    print(p_value)

1990-1999
1.0849711184309075e-14
2000-2010
0.0011703018337081818
1960-1969
0.009977926740330576
1980-1989
0.00023058504251930947
1950-1959
0.6844845011850418
1970-1979
0.0005585717855566855
1930-1939
0.5007426670307935
1940-1949
0.046616468772444235
1920-1929
nan


In [19]:
med_sentiment_by_year_male = male_utterances.groupby('decade')['SentimentScore'].median().reset_index()
med_sentiment_by_year_male["gender"] = "male"
med_sentiment_by_year_female = female_utterances.groupby('decade')['SentimentScore'].median().reset_index()
med_sentiment_by_year_female["gender"] = "female"

df = pd.concat([med_sentiment_by_year_male, med_sentiment_by_year_female], ignore_index=True)

px.line(df, x='decade', y='SentimentScore', color='gender',
              title='Median Sentiment Scores of Utterances by Males and Females')

In [20]:
px.box(movie_df.sort_values(by="decade"), x='decade', y='SentimentScore', color='gender_x',
       title='Sentiment Scores Over Time')

In [21]:
m2m = movie_df[(movie_df["gender_x"] == "m") & (movie_df["gender_y"] == "m")]
m2f = movie_df[(movie_df["gender_x"] == "m") & (movie_df["gender_y"] == "f")]
f2m = movie_df[(movie_df["gender_x"] == "f") & (movie_df["gender_y"] == "m")]
f2f = movie_df[(movie_df["gender_x"] == "f") & (movie_df["gender_y"] == "f")]

avg_sent_by_dec_m2m = m2m.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_m2m["group"] = "Male to Male"

avg_sent_by_dec_m2f = m2f.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_m2f["group"] = "Male to Female"

avg_sent_by_dec_f2m = f2m.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_f2m["group"] = "Female to Male"

avg_sent_by_dec_f2f = f2f.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_f2f["group"] = "Female to Female"

In [22]:
gender_df = pd.concat([avg_sent_by_dec_m2m, avg_sent_by_dec_m2f, avg_sent_by_dec_f2m, avg_sent_by_dec_f2f], ignore_index=True)

In [23]:
gender_df

Unnamed: 0,decade,SentimentScore,group
0,1930-1939,0.155754,Male to Male
1,1940-1949,0.133151,Male to Male
2,1950-1959,0.141669,Male to Male
3,1960-1969,0.0954,Male to Male
4,1970-1979,0.05494,Male to Male
5,1980-1989,0.064609,Male to Male
6,1990-1999,0.01858,Male to Male
7,2000-2010,0.062755,Male to Male
8,1930-1939,0.201861,Male to Female
9,1940-1949,0.174628,Male to Female


In [24]:
px.line(gender_df.sort_values(by="decade"), x='decade', y='SentimentScore', color='group',
              title='Sentiment Scores of Utterances between Gender Groups')

In [25]:
same_gender = movie_df[((movie_df["gender_x"] == "m") & (movie_df["gender_y"] == "m")) | ((movie_df["gender_x"] == "f") & (movie_df["gender_y"] == "f"))]
diff_gender = movie_df[((movie_df["gender_x"] == "m") & (movie_df["gender_y"] == "f")) | ((movie_df["gender_x"] == "f") & (movie_df["gender_y"] == "m"))]

avg_sent_by_dec_same = same_gender.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_same["group"] = "Same Gender"

avg_sent_by_dec_diff = diff_gender.groupby('decade')['SentimentScore'].mean().reset_index()
avg_sent_by_dec_diff["group"] = "Different Gender"

gender_df_2 = pd.concat([avg_sent_by_dec_same, avg_sent_by_dec_diff], ignore_index=True)

px.line(gender_df_2.sort_values(by="decade"), x='decade', y='SentimentScore', color='group',
              title='Sentiment Scores of Utterances between Gender Groups')

In [26]:
for decade in decades:
    print(decade)
    same_utt_1 = same_gender[(same_gender['decade'] == decade)]["SentimentScore"]
    diff_utt_1 = diff_gender[(diff_gender['decade'] == decade)]["SentimentScore"]
    t_statistic, p_value = stats.ttest_ind(same_utt_1, diff_utt_1)
    print(p_value)

1990-1999
1.5838820137897097e-23
2000-2010
1.0861211826341032e-05
1960-1969
0.011749286833144384
1980-1989
4.851948341196912e-06
1950-1959
0.10517964867928123
1970-1979
0.00023246987567348678
1930-1939
0.12799542422885307
1940-1949
0.17738955087561567
1920-1929
nan
