In [1]:
from tqdm import tqdm_notebook

import json
import pandas as pd
import pickle

In [2]:
# Load the topic definitions. Later cells iterate over `topics` as a mapping
# of topic name -> iterable of keywords.
with open('topics.json') as topics_file:
    topics = json.load(topics_file)

In [3]:
# `id` is read as text: ids are later compared against the string '1' and
# interpolated into the per-film pickle file names.
mydata = pd.read_csv(
    'ML_data.csv',
    dtype={'id': 'str'},
)

In [4]:
# Tag every film with the topics whose keywords it contains.
#   themes[_id]    -> str() of the list of matching topic names, each listed once,
#                     in the order the topics appear in `topics`; '[]' when nothing matches
#   film_kwds[_id] -> {'<topic>_kwds': [matching keywords]}; only topics with hits get a key
themes = {}
film_kwds = {}

# The film with id '1' is excluded from processing.
# NOTE(review): the reason is not visible in this notebook — confirm.
ids = [film_id for film_id in mydata['id'] if film_id != '1']

# NOTE(review): recent tqdm releases deprecate `tqdm_notebook` in favour of
# `tqdm.notebook.tqdm` / `tqdm.auto.tqdm`; kept here to match the import cell.
for _id in tqdm_notebook(ids):
    # SECURITY: unpickling can execute arbitrary code; only load pickles from a trusted source.
    with open(f'movie_data/{_id}.pickle', 'rb') as f:
        movie = pickle.load(f)
    keywords = movie['keywords']

    matched_topics = []
    film_kwds[_id] = {}
    for topic, words in topics.items():
        # `in` is a membership test if `keywords` is a list/set and a substring
        # test if it is a str — which one applies depends on the pickle contents.
        hits = [word for word in words if word in keywords]
        if hits:
            matched_topics.append(topic)
            film_kwds[_id][f'{topic}_kwds'] = hits

    # Each topic is recorded once, in a deterministic order. The previous
    # join -> split -> set round trip produced a hash-dependent order that
    # changed between kernel restarts, and "['']" for films with no match.
    themes[_id] = str(matched_topics)

HBox(children=(IntProgress(value=0, max=12717), HTML(value='')))




In [5]:
# One row per film id; columns are the '<topic>_kwds' keys collected in film_kwds.
df_kwds = pd.DataFrame(film_kwds).transpose().rename_axis('id')

# One row per film id with a single 'problems' column holding the themes string.
df_themes = pd.DataFrame(themes, index=['problems']).transpose().rename_axis('id')

In [6]:
# Attach the themes string and the per-topic keyword columns to the film table.
# Outer joins keep films that are present on only one side.
data_final = (
    mydata
    .merge(df_themes, on='id', how='outer')
    .merge(df_kwds, on='id', how='outer')
)

In [7]:
# Persist the enriched table; the row index is not written to the file.
data_final.to_csv(
    'data_with_topics.csv',
    index=False,
)

In [15]:
# Films with at least one Oscar win (rows where n_oscars_wins > 0).
has_oscar_win = data_final['n_oscars_wins'] > 0
bait_winners = data_final[has_oscar_win]

In [16]:
# Denominators for the shares computed below: all films, and Oscar winners only.
all_cinema, all_bait = len(data_final), len(bait_winners)

In [17]:
def _share_matching(frame, pattern, total):
    """Return the fraction of `total` rows whose 'problems' text matches `pattern`.

    Parameters
    ----------
    frame : DataFrame with a 'problems' column of theme strings.
    pattern : pattern passed to `Series.str.contains` (a regex by default).
    total : denominator, i.e. the number of rows the share is relative to.

    Returns
    -------
    float — matching rows divided by `total`; NaN when `total` is 0, so an
    empty selection does not abort the notebook with ZeroDivisionError.
    """
    if not total:
        return float('nan')
    # na=False: rows without a themes string (NaN) count as "no match".
    hits = frame['problems'].str.contains(pattern, na=False)
    return int(hits.sum()) / total


# Theme label -> pattern searched for in the 'problems' string.
# 'race' deliberately keeps the shorter pattern 'rac' used by the original analysis;
# note that it matches any themes string containing that substring.
theme_patterns = {
    'race': 'rac',
    'lgbt': 'lgbt',
    'female': 'female',
    'nazism': 'nazism',
    'religion': 'religion',
}

# themes_percents[theme]['bait'] -> share among Oscar winners
# themes_percents[theme]['all']  -> share among all films
# The 'all' theme entry is created empty here and filled in by a later cell.
themes_percents = {theme: {} for theme in ['race', 'lgbt', 'female', 'nazism', 'religion', 'all']}

for theme, pattern in theme_patterns.items():
    themes_percents[theme]['bait'] = _share_matching(bait_winners, pattern, all_bait)
    themes_percents[theme]['all'] = _share_matching(data_final, pattern, all_cinema)

In [18]:
# Oscar winners that touch at least one of the tracked themes:
# OR the per-theme matches together, left to right.
all_bait_words = bait_winners['problems'].str.contains('religion')
for pattern in ('lgbt', 'nazism', 'rac', 'female'):
    all_bait_words = all_bait_words | bait_winners['problems'].str.contains(pattern)

In [19]:
# All films that touch at least one of the tracked themes:
# OR the per-theme matches together, left to right.
all_words = data_final['problems'].str.contains('religion')
for pattern in ('lgbt', 'nazism', 'rac', 'female'):
    all_words = all_words | data_final['problems'].str.contains(pattern)

In [20]:
# Share of films matching any tracked theme, among Oscar winners and among all films.
bait_hits = sum(1 for flag in all_bait_words if flag == True)
overall_hits = sum(1 for flag in all_words if flag == True)

themes_percents['all']['bait'] = bait_hits / all_bait
themes_percents['all']['all'] = overall_hits / all_cinema

In [21]:
# Export the shares: one column per theme, one row per population ('bait' / 'all').
themes_percent_table = pd.DataFrame(themes_percents)
themes_percent_table.to_csv('themes_percent.csv')