In [12]:
import pandas as pd
import numpy as np
import ast

from imdb import Cinemagoer
import spacy

import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer 

### 1. Data: [Cornell Movie-Dialogs Corpus](https://convokit.cornell.edu/documentation/movie.html) & [Cinemagoer](https://cinemagoer.readthedocs.io/en/latest/index.html)

[Conell's Page on the Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)

[Cinemagoer](https://cinemagoer.readthedocs.io/en/latest/index.html)
 
[Cinemagoer's GitHub](https://github.com/cinemagoer/cinemagoer)

[Our Complied Data for Download](https://uillinoisedu-my.sharepoint.com/:f:/g/personal/mengyue4_illinois_edu/ErWZnueG65RPrwWrxzms3DUBWE5y55pf1IomDARFoBG02w?e=uuMaeg) 

### 2. This Part's Goal:

~~1. restrict the year range of our movies to only as early as 2000~~

1. for dislogue/convesrsation data for movies from 1927 - 2009, what are the top frequency terms (aside from year and name) for low, med, and high rating movies ([2.5, 6.6], [6.7, 7.6], [7.7, 9.2])?

2. what about synopsis data for low, med, and high rating movies ([2.5, 6.6], [6.7, 7.6], [7.7, 9.2])?

3. coducting text Similarity on the synopsis of these movies, obtaining some comprehension on what makes a great movie in terms of the plot

### 3. Read-in & Preprocess on Cornell

In [34]:
movies = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\movies.csv')
speakers = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\speakers.csv')
#utterances = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\utterances.csv')
grouped_utterances = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\grouped.csv')

print('movies:', movies.columns.values)
print('speakers:', speakers.columns.values)
#print('utterances:', utterances.columns.values)
print('grouped_utterances:', grouped_utterances.columns.values)

movies: ['movie_idx' 'movie_name' 'release_year' 'rating' 'votes' 'genre'
 'Cinemagoer_id' 'intro' 'synopsis' 'reviews' 'rating_cat']
speakers: ['speaker_id' 'character_name' 'movie_idx' 'movie_name' 'gender'
 'credit_pos']
grouped_utterances: ['movie_idx' 'conversation_data' 'utterances_id_list']


In [6]:
speakers

Unnamed: 0,speaker_id,character_name,movie_idx,movie_name,gender,credit_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u2,CAMERON,m0,10 things i hate about you,m,3
2,u4,JOEY,m0,10 things i hate about you,m,6
3,u5,KAT,m0,10 things i hate about you,f,2
4,u11,WALTER,m0,10 things i hate about you,m,9
...,...,...,...,...,...,...
3010,u9017,FREDDY,m615,young frankenstein,m,0
3011,u9015,ELIZABETH,m615,young frankenstein,f,25
3012,u9019,IGOR,m615,young frankenstein,m,3
3013,u9022,MEDICAL STUDENT,m615,young frankenstein,m,9


In [7]:
grouped_utterances

Unnamed: 0,movie_idx,conversation_data,utterances_id_list
0,m0,They do not! They do to! I hope so. She okay? ...,"['L1045', 'L1044', 'L985', 'L984', 'L925', 'L9..."
1,m1,"Can't be that far, I say. Also, I don't like ...","['L2181', 'L2180', 'L2179', 'L2177', 'L2176', ..."
2,m10,All the figures show is that Gordon LaRiviere ...,"['L12762', 'L12761', 'L12760', 'L12759', 'L127..."
3,m100,Sending what we know back to U.S.T. via satell...,"['L303627', 'L303626', 'L303551', 'L303550', '..."
4,m101,In the backyard. Where? I heard them. Did you ...,"['L303971', 'L303970', 'L303969', 'L303968', '..."
...,...,...,...
612,m95,I'll go. We'll both go. Far away. You have t...,"['L296577', 'L296576', 'L296575', 'L296574', '..."
613,m96,"Better now, son? I was so scared. I know, Elle...","['L307676', 'L307675', 'L307674', 'L307670', '..."
614,m97,"If we do, we'll both die virgins. But at last ...","['L299148', 'L299147', 'L298768', 'L298767', '..."
615,m98,The dog!? You are named after the dog... May ...,"['L300004', 'L300003', 'L299639', 'L299638', '..."


In [8]:
movies

Unnamed: 0,movie_idx,movie_name,release_year,rating,votes,genre,Cinemagoer_id,intro,synopsis,reviews,rating_cat
0,m0,10 things i hate about you,1999,6.9,62847,"[""comedy"", "" romance""]",147800,"A high-school boy, Cameron, cannot date Bianca...","Cameron James (Joseph Gordon-Levitt), a new st...",I'm 19 now (so was only 9 when '10 Things I Ha...,1
1,m2,15 minutes,2001,6.1,25854,"[""action"", "" crime"", "" drama"", "" thriller""]",179626,A homicide detective and a fire marshal must s...,The story opens as two men come through US cus...,This movie took a severe beating in the press ...,0
2,m3,2001: a space odyssey,1968,8.4,163227,"[""adventure"", "" mystery"", "" sci-fi""]",62622,After uncovering a mysterious artifact buried ...,"The film consists of four major sections, all ...",I felt the need to review this movie after rea...,2
3,m4,48 hrs.,1982,6.9,22289,"[""action"", "" comedy"", "" crime"", "" drama"", "" th...",83511,A hard-nosed cop reluctantly teams up with a w...,Convicted robber Albert Ganz is working as par...,"Violent criminal Ganz escapes from prison, and...",1
4,m5,the fifth element,1997,7.5,133756,"[""action"", "" adventure"", "" romance"", "" sci-fi""...",119116,"In the colorful future, a cab driver unwitting...",The story's premise is that every five thousan...,I really believe that they billed this movie w...,1
...,...,...,...,...,...,...,...,...,...,...,...
470,m611,the world is not enough,1999,6.3,60047,"[""action"", "" adventure"", "" thriller""]",143145,James Bond uncovers a nuclear plot while prote...,"In Bilbao, MI6 agent James Bond (Pierce Brosna...",I first saw this in the early 2k on cable tv.R...,0
471,m612,watchmen,2009,7.8,135229,"[""action"", "" crime"", "" fantasy"", "" mystery"", ""...",409459,"In a version of 1985 where superheroes exist, ...","""Watchmen"" is set in an alternate 1985 America...",Christopher Nolan was right. This movie indeed...,2
472,m613,xxx,2002,5.6,53505,"[""action"", "" adventure"", "" crime""]",295701,The US government recruits extreme sports athl...,An NSA agent on a mission infiltrates a concer...,I don´t understand all the commotion in the us...,0
473,m615,young frankenstein,1974,8.0,57618,"[""comedy"", "" sci-fi""]",72431,An American grandson of the infamous scientist...,Dr. Frederick Frankenstein (Gene Wilder) is a ...,"""Young Frankenstein"", now you're talking about...",2


the rating ranges

In [9]:
(movies[movies['rating_cat'] == 0]['rating'].min(), movies[movies['rating_cat'] == 0]['rating'].max())

(2.5, 6.6)

In [32]:
(movies[movies['rating_cat'] == 1]['rating'].min(), movies[movies['rating_cat'] == 1]['rating'].max())

(6.7, 7.6)

In [33]:
(movies[movies['rating_cat'] == 2]['rating'].min(), movies[movies['rating_cat'] == 2]['rating'].max())

(7.7, 9.2)

In [35]:
rating_low = movies[movies['rating_cat'] == 0]
low = pd.merge(rating_low, grouped_utterances, left_on='movie_idx', right_on='movie_idx', how='inner').reset_index(drop=True)
low.rename(columns={'movie_name_x': 'movie_name'}, inplace=True)

rating_med = movies[movies['rating_cat'] == 1]
med = pd.merge(rating_med, grouped_utterances, left_on='movie_idx', right_on='movie_idx', how='inner').reset_index(drop=True)
med.rename(columns={'movie_name_x': 'movie_name'}, inplace=True)

rating_high = movies[movies['rating_cat'] == 2]
high = pd.merge(rating_high, grouped_utterances, left_on='movie_idx', right_on='movie_idx', how='inner').reset_index(drop=True)
high.rename(columns={'movie_name_x': 'movie_name'}, inplace=True)

In [11]:
# some helper functions

# here some specific definition of year, filtering and keep the other numbers, could be better
def indicate_year(term):
    if len(term) == 4:
        if term[0] + term[1] in ['17', '18', '19', '20']:
            if term[1:].isdigit():
                return 1
    return 0

# check if number, a stricter version of the previous function, but i guess i do not need this
def indicate_number(term):
    if len(term) == 4:
        if term.isdigit():
            return 1
    return 0

# check if person's name
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

def is_name(term):
    # Process the term
    doc = nlp(term)
    # Check if any of the entities is a person's name
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return 1
    return 0

### 4. Goal1 - on Conversations Data

for dislogue data for movies from 1927 - 2009, what are the top frequency terms (aside from year and name) from low, med, and high rating movies ([2.5, 6.6], [6.7, 7.6], [7.7, 9.2])?

#### Low

In [14]:
# realtively low ratings
df = low
col = 'conversation_data'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
tfidf_df.head()

Unnamed: 0,term,score,movie_release_year,is_year,is_name
0,snake,0.834254,1996,0,0
1,snake,0.834254,1996,0,0
2,jack,0.814884,2000,0,0
3,jack,0.814884,2000,0,0
4,jerry,0.807586,1997,0,1


In [22]:
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Dialogue w/ Low Ratings (2.5~6.6)'
)

chart

In [15]:
tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_low.csv', index=False)

#### Med

In [43]:
# realtively median ratings
df = med
col = 'conversation_data'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
tfidf_df.head()

Unnamed: 0,term,score,movie_release_year,is_year,is_name
0,walter,0.915098,1959,0,0
1,walter,0.915098,1959,0,0
2,louis,0.829335,1994,0,0
3,louis,0.829335,1994,0,0
4,george,0.824855,2001,0,0


In [24]:
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Dialogue w/ Med Ratings (6.7~7.6)'
)

chart

In [47]:
tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_med.csv', index=False)

#### High

In [49]:
# realtively high ratings
df = high
col = 'conversation_data'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
tfidf_df.head()

Unnamed: 0,term,score,movie_release_year,is_year,is_name
0,nick,0.871375,1978,0,0
1,nick,0.871375,1978,0,0
2,ya,0.855669,1976,0,0
3,ya,0.855669,1976,0,0
4,paul,0.784718,2004,0,1


In [26]:
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Dialogue w/ High Ratings (7.7~9.2)'
)

chart

In [51]:
tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_high.csv', index=False)

### 5. Goal2 - on Synopsis Data

In [15]:
# realtively low ratings
df = low
col = 'synopsis'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
tfidf_df.head()

Unnamed: 0,term,score,movie_release_year,is_year,is_name
0,nick,0.943847,1989,0,0
1,nick,0.943847,1989,0,0
2,chris,0.92643,2002,0,1
3,chris,0.92643,2002,0,1
4,jason,0.907006,1993,0,1


In [16]:
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Synopsis w/ Low Ratings (2.5~6.6)'
)

chart

In [17]:
tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_low.csv', index=False)

In [18]:
# realtively med ratings
df = med
col = 'synopsis'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)

tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_med.csv', index=False)
 
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Synopsis w/ Low Ratings (6.7~7.6)'
)

chart 

In [19]:
# realtively high ratings
df = high
col = 'synopsis'
helper_fnc = [indicate_year, is_name]

df[col] = df[col].str.replace(r'\[.*\]', '', regex=True)
df[col] = df[col].fillna('')
df.reset_index(drop=True, inplace=True)


documents = df[col].tolist()
vectorizer = TfidfVectorizer(max_df=.8, min_df=0.05, stop_words='english', lowercase=True)
transformed_documents = vectorizer.fit_transform(documents)

# top features for each document
transformed_documents_as_array = transformed_documents.toarray()

dates = df['release_year'].tolist()
tfidf_results = []

# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # create a df with the top terms
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['movie_release_year'] = dates[counter]
    # create indicator for year and name
    one_doc_as_df['is_year'] = one_doc_as_df['term'].apply(helper_fnc[0])
    tfidf_results.append(one_doc_as_df)
    one_doc_as_df['is_name'] = one_doc_as_df['term'].apply(helper_fnc[1])
    tfidf_results.append(one_doc_as_df)

tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False).reset_index(drop=True)

tfidf_df.to_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_high.csv', index=False)
 
data = tfidf_df[(tfidf_df['is_year'] == 0) & (tfidf_df['is_name'] == 0)].head(200).reset_index(drop=True)
selection = alt.selection_point(fields=['term'], bind='legend')

chart = alt.Chart(data).mark_bar().encode(
    y='score',
    x=alt.X('movie_release_year:N', axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(data['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'movie_release_year'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 200 Terms by TF-IDF Score in Movies Synopsis w/ Low Ratings (7.7~9.2)'
)

chart 

### 6. Goal3 Text Similarity 

In [37]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_high.csv')
conver_high = df[df['score'] > 0.5]
df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_med.csv')
conver_med = df[df['score'] > 0.5]
df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\conversation_results\tfidf_low.csv')
conver_low = df[df['score'] > 0.5]

df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_high.csv')
syn_high = df[df['score'] > 0.5]
df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_med.csv')
syn_med = df[df['score'] > 0.5]
df = pd.read_csv(r'C:\Users\98768\Desktop\is310\project\movie-corpus\processed\synopsis_results\tfidf_low.csv')
syn_low = df[df['score'] > 0.5]

#### Conversation Data

In [89]:
combined_texts = pd.concat([conver_high['term'], conver_med['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.018229166666666668


In [90]:
combined_texts = pd.concat([conver_high['term'], conver_low['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.019886363636363636


In [92]:
combined_texts = pd.concat([conver_med['term'], conver_low['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.022308149910767405


#### Synopsis Data

In [94]:
combined_texts = pd.concat([syn_high['term'], syn_med['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.01745160798658741


In [96]:
combined_texts = pd.concat([syn_high['term'], syn_low['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.01651077097505669


In [98]:
combined_texts = pd.concat([syn_med['term'], syn_low['term']]).reset_index(drop=True)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_texts.index, columns=combined_texts.index)

overall_similarity = similarity_df.mean().mean()
print("Overall similarity between conver_high['term'] and conver_med['term']: ", overall_similarity)

Overall similarity between conver_high['term'] and conver_med['term']:  0.018272210743801646
