In [1]:
import pandas as pd 
import numpy as np 
import re
import requests
from bs4 import BeautifulSoup

Get all movies from the Bechdel dataset: title, IMDb ID, year of release and Bechdel test score. 
Fetching them via the API is almost too easy. If we wanted to know more about the submitters and comments, we could use the more elaborate API calls. 

In [21]:
# Load the Bechdel test film list.
# Prefer the local CSV cache and only call the getAllMovies API endpoint when
# the cache file does not exist yet, so a fresh kernel can run this cell
# without hitting the API on every run.
try:
    bechdel_films = pd.read_csv('data/bechdel_films.csv', dtype=str)
except FileNotFoundError:
    bechdel_films = pd.read_json('http://bechdeltest.com/api/v1/getAllMovies')

Write it to a CSV so we don't annoy the Bechdel API curators: they kindly ask us to cache the data and not to call getAllMovies too often. 

In [3]:
# Write the film list to the local CSV cache, leaving out the row index.
bechdel_films.to_csv(
    path_or_buf='data/bechdel_films.csv',
    index=False,
)

In [4]:
# Reload the cached film list with every column read as str; otherwise the
# IMDb ids would be parsed as floats.
bechdel_films = pd.read_csv(
    filepath_or_buffer='data/bechdel_films.csv',
    dtype=str,
)

First, let's get the tags/keywords to figure out which films are LGBTQ+ related.  
We scrape the keywords of every film in the Bechdel test dataset.  
The full tag lists are saved in a dictionary keyed by IMDb ID for later use.  
Later scraping steps will also be saved in dicts keyed by IMDb ID, so we can join them. 

In [None]:
# Start from an empty mapping; the scraping loop stores one list of keyword
# strings per IMDb id in it.
tag_dict = dict()

In [None]:
# Scrape the IMDb keyword page of every film in the Bechdel dataset and store
# the keyword strings in tag_dict, keyed by IMDb id.
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout a single stalled request blocks the whole run

scraped_count = 0  # number of films scraped successfully so far
for imdb_id in bechdel_films['imdbid']:
    try:
        res = requests.get(
            "https://www.imdb.com/title/tt" + imdb_id + "/keywords?ref_=tt_stry_kw",
            timeout=REQUEST_TIMEOUT_S,
        )
        soup = BeautifulSoup(res.text, 'html.parser')
        # Each keyword is rendered inside a <div class="sodatext"> element.
        taglist = soup.find_all('div', class_="sodatext")
        tag_dict[imdb_id] = [tag.text.strip('\n') for tag in taglist]
        scraped_count += 1
        print(str(scraped_count) + ":" + imdb_id)
    except (requests.exceptions.RequestException, ConnectionError, TypeError) as err:
        # Report the failure with the id and move on to the next film, so one bad
        # request does not interrupt the whole scrape.
        # requests raises its own exception hierarchy (RequestException covers
        # connection errors and timeouts); the builtin ConnectionError alone
        # would not catch them.
        # A TypeError occurs when the id is not a str (e.g. a missing value) and
        # the URL concatenation fails, hence str() on both parts below.
        print(str(err) + ": IMDb ID:" + str(imdb_id))

Save our precious dictionary into a file, then load it back

In [None]:
# Persist the scraped keyword dictionary to disk.
np.save(
    file='data/tag_dictionary.npy',
    arr=tag_dict,
)

In [6]:
# Read the keyword dictionary back from disk. allow_pickle=True lets numpy
# unpickle the stored Python object and .item() unwraps it from the array.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
tag_dict = np.load(
    file='data/tag_dictionary.npy',
    allow_pickle=True,
).item()

Let's define our LGBTQ+ keywords.    
To check whether a film is LGBTQ+ related, we iterate through the tags/keywords scraped from IMDb and count how many of them (after stripping punctuation and special characters) appear in the LGBTQ+ keyword list.  
We save the result in a dataframe with the following columns: IMDb ID (for joins), number of LGBTQ+ keywords spotted, number of keywords scraped.  

In [7]:
# Keywords that mark a film as LGBTQ+ related. They are written without
# spaces or punctuation because scraped tags are stripped of non-word
# characters before being compared against this list.
lgbtq_keywords = [
    # umbrella terms
    'lgbt',
    'lgbtq',
    # gay
    'gay',
    'gays',
    'gaycharacter',
    'gaykiss',
    'gayinterest',
    # lesbian
    'lesbian',
    'lesbians',
    'lesbianinterest',
    # queer
    'queer',
    # trans
    'trans',
    'transsexual',
    'transwoman',
    'transman',
    'transgender',
    # bisexual
    'bisexual',
]

In [8]:
# One row per film: [IMDb id, number of LGBTQ+ keywords matched, total number
# of keywords scraped]. Non-word characters are removed from each tag before
# it is looked up in the keyword list.
df_rows = [
    [
        imdb_id,
        sum(1 for tag in tags if re.sub(r'\W+', '', tag) in lgbtq_keywords),
        len(tags),
    ]
    for imdb_id, tags in tag_dict.items()
]

In [9]:
# Turn the per-film rows into a dataframe; imdbid is the join key.
gay_df = pd.DataFrame(
    data=df_rows,
    columns=['imdbid', 'lgbtq_keywords_num', 'keywords_num'],
)

In [10]:
# Inner-join the film list with the keyword counts. No `on` is given, so
# pandas joins on the columns the two frames have in common.
df = bechdel_films.merge(gay_df, how='inner')

In [11]:
# Show the films with at least one LGBTQ+ keyword.
df.loc[df['lgbtq_keywords_num'].gt(0)]

Unnamed: 0,year,rating,id,title,imdbid,lgbtq_keywords_num,keywords_num
52,1894,0,9654,Dickson Experimental Sound Film,0177707,2,16
123,1907,0,5223,"Eclipse: Courtship of the Sun and Moon, The",0215737,2,12
137,1914,2,4457,A Florida Enchantment,0003973,4,28
211,1927,1,3118,Wings,0018578,4,73
213,1927,1,7324,Downhill [When Boys Leave Home],0017825,2,74
...,...,...,...,...,...,...,...
9399,2021,3,10157,West Side Story,3581652,1,178
9401,2021,3,10161,Single All The Way,14315756,4,55
9407,2021,3,10185,Licorice Pizza,11271038,1,321
9410,2021,3,10190,tick...tick..BOOM!,8721424,4,100


In [12]:
# Save the merged table, leaving out the row index.
df.to_csv(
    path_or_buf='data/bechdel_films_ifgay.csv',
    index=False,
)

In [14]:
# Reload the merged table with every column read as str.
df = pd.read_csv(
    filepath_or_buffer='data/bechdel_films_ifgay.csv',
    dtype=str,
)

We read everything as strings, but some columns are numeric, so let's convert them. 

In [16]:
# Columns that hold numbers but were read from the CSV as strings.
numeric_cols = [
    'year',
    'rating',
    'lgbtq_keywords_num',
    'keywords_num',
]

# Convert them one at a time, replacing each string column with its numeric version.
for column in numeric_cols:
    df[column] = pd.to_numeric(df[column])