<a href="https://colab.research.google.com/github/minako-m/datasci112_final_project/blob/main/112_final_project_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Science 112 Final Project: Data Extraction**
# Amira and Sophia

This project explores the Cornell Movie-Dialogs Corpus, accessed through ConvoKit (https://convokit.cornell.edu/documentation/movie.html).

Research questions:
1. Has movie dialogue sentiment changed over time?
2. Has the sentiment of movie dialogue spoken by men versus women changed over time?
3. Has the sentiment of movie dialogue spoken by men to men, by men to women, by women to men, and by women to women changed over time?

In this notebook we extract utterance, speaker, and conversation (movie-level) data from the Corpus and join them into a single dataset, saved as `movie_dialogues.csv`, that we will be working with.

In [1]:
import plotly.graph_objs as go
import plotly.offline as pyo

# This enables Plotly to work in offline mode and render plots inline.
# NOTE(review): none of the cells visible in this notebook draw a Plotly
# figure, so this setup may be unnecessary here — confirm before removing.
pyo.init_notebook_mode(connected=True)

In [2]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Download the 'vader_lexicon' resource through NLTK (presumably required by
# the SentimentIntensityAnalyzer created in this notebook — confirm).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
# Module-level analyzer instance from nltk.sentiment.vader; it is read by
# get_sentiment_score.
sid = SentimentIntensityAnalyzer()

In [5]:
def get_sentiment_score(text):
    """Return the compound sentiment score of ``text``.

    Passes ``text`` to ``polarity_scores`` on the module-level ``sid``
    analyzer (from ``nltk.sentiment.vader``) and returns the value stored
    under the ``'compound'`` key of the result.
    """
    scores = sid.polarity_scores(text)
    return scores['compound']

In [6]:
pip install convokit

Collecting convokit
  Downloading convokit-3.0.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.2/183.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl (6.9 kB)
Collecting dill>=0.2.9 (from convokit)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m27.8 MB/s[0m eta [

In [7]:
from convokit import Corpus, download
# Download the "movie-corpus" dataset and load it as a ConvoKit Corpus.
# The following cells iterate over its utterances, speakers and conversations.
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


In [8]:
# Build the utterance table: one row per utterance in the corpus, holding its
# text, its own id, the id of the utterance it replies to, and its speaker's id.
utterances = list(corpus.iter_utterances())

utt_df_info = {
    "text": [utt.text for utt in utterances],
    "utt id": [utt.id for utt in utterances],
    "reply_to id": [utt.reply_to for utt in utterances],
    "speaker id": [utt.speaker.id for utt in utterances],
}

utt_df = pd.DataFrame(utt_df_info)

In [9]:
def _meta_or_nf(meta, key):
    """Return ``meta[key]`` when ``key`` is present in ``meta``, otherwise ``"NF"``."""
    return meta[key] if key in meta else "NF"


# Build the speaker table: one row per speaker with the movie name and gender
# taken from the speaker's metadata ("NF" marks a missing metadata field).
speaker_df_info = {"speaker id": [], "movie_name": [], "gender": []}

for speaker in corpus.iter_speakers():
    speaker_df_info["speaker id"].append(speaker.id)
    for field in ("movie_name", "gender"):
        speaker_df_info[field].append(_meta_or_nf(speaker.meta, field))

speaker_df = pd.DataFrame(speaker_df_info)

In [10]:
# Build the movie table from conversation metadata. Output column name ->
# key in convo.meta (only the release year is spelled differently).
convo_fields = {
    "movie_name": "movie_name",
    "release year": "release_year",
    "rating": "rating",
    "genre": "genre",
}

convo_df_info = {column: [] for column in convo_fields}

for convo in corpus.iter_conversations():
    for column, meta_key in convo_fields.items():
        convo_df_info[column].append(convo.meta[meta_key])

# One row is collected per conversation; drop_duplicates collapses identical rows.
convo_df = pd.DataFrame(convo_df_info).drop_duplicates()

In [11]:
# Step 1: attach speaker metadata (movie name, gender) to every utterance.
merged_df_1 = utt_df.merge(speaker_df, on='speaker id', how='left')

# Step 2: attach movie metadata, then normalise two columns:
#   - "release year": strip the "/I" substring and cast to int
#   - "gender": lower-case
merged_df_2 = merged_df_1.merge(convo_df, on='movie_name', how='left')
year_text = merged_df_2["release year"].str.replace("/I", "")
merged_df_2["release year"] = year_text.astype(int)
merged_df_2['gender'] = merged_df_2['gender'].str.lower()

# Step 3: self-join on the reply target to bring in the gender of the speaker
# being replied to. The overlapping columns receive pandas' default suffixes:
# "utt id_x"/"gender_x" for the utterance itself, "utt id_y"/"gender_y" for
# the utterance it replies to.
reply_targets = merged_df_2[['utt id', 'gender']]
movie_df = merged_df_2.merge(reply_targets, left_on='reply_to id',
                             right_on='utt id', how='left')

In [12]:
# Decade buckets as inclusive (first_year, last_year) pairs; the final bucket
# is one year longer so that it also takes in 2010.
ranges = [(1920, 1929), (1930, 1939), (1940, 1949), (1950, 1959),
          (1960, 1969), (1970, 1979), (1980, 1989), (1990, 1999), (2000, 2010)]

# One label per bucket, e.g. '1920-1929'.
labels = [f'{start}-{end}' for start, end in ranges]

# pd.cut uses right-closed bins by default, which would file the first year of
# each bucket (1930, 1940, ..., 2000) under the previous decade's label and
# leave 1920 unlabelled. Use left-closed bins [start, next_start) instead, with
# the final edge one past the last bucket's end so that 2010 is still included.
bin_edges = [start for start, _ in ranges] + [ranges[-1][1] + 1]

movie_df['decade'] = pd.cut(movie_df['release year'], bins=bin_edges,
                            labels=labels, right=False)

In [14]:
# Display the final joined frame (pandas abbreviates the middle rows).
movie_df

Unnamed: 0,text,utt id_x,reply_to id,speaker id,movie_name,gender_x,release year,rating,genre,utt id_y,gender_y,decade
0,They do not!,L1045,L1044,u0,10 things i hate about you,f,1999,6.90,"['comedy', 'romance']",L1044,m,1990-1999
1,They do to!,L1044,,u2,10 things i hate about you,m,1999,6.90,"['comedy', 'romance']",,,1990-1999
2,I hope so.,L985,L984,u0,10 things i hate about you,f,1999,6.90,"['comedy', 'romance']",L984,m,1990-1999
3,She okay?,L984,,u2,10 things i hate about you,m,1999,6.90,"['comedy', 'romance']",,,1990-1999
4,Let's go.,L925,L924,u0,10 things i hate about you,f,1999,6.90,"['comedy', 'romance']",L924,m,1990-1999
...,...,...,...,...,...,...,...,...,...,...,...,...
304708,Lord Chelmsford seems to want me to stay back ...,L666371,L666370,u9030,zulu dawn,?,1979,6.40,"['action', 'adventure', 'drama', 'history', 'w...",L666370,?,1970-1979
304709,I'm to take the Sikali with the main column to...,L666370,L666369,u9034,zulu dawn,?,1979,6.40,"['action', 'adventure', 'drama', 'history', 'w...",L666369,?,1970-1979
304710,"Your orders, Mr Vereker?",L666369,,u9030,zulu dawn,?,1979,6.40,"['action', 'adventure', 'drama', 'history', 'w...",,,1970-1979
304711,"Good ones, yes, Mr Vereker. Gentlemen who can ...",L666257,L666256,u9030,zulu dawn,?,1979,6.40,"['action', 'adventure', 'drama', 'history', 'w...",L666256,?,1970-1979


In [13]:
# Save the joined dataset to 'movie_dialogues.csv' in the working directory.
# The DataFrame index is written too, since to_csv is called without index=False.
movie_df.to_csv('movie_dialogues.csv')