In [None]:
import pandas as pd 
import numpy as np 
import re
import requests
from bs4 import BeautifulSoup
import time
import json

Get all movies from the Bechdel dataset, which contains the title, IMDb ID, year of release and Bechdel-test score of each film. 
A single API call returns everything, so this step is almost too easy. If we wanted to know more about the submitters and comments, we could use the more elaborate API calls. 

In [None]:
# Download the complete movie list from the Bechdel test API into a DataFrame.
bechdel_films = pd.read_json(path_or_buf='http://bechdeltest.com/api/v1/getAllMovies')

Write it into a CSV so we don't abuse the Bechdel API: its curators kindly ask users to cache the result and not to call getAllMovies too often. 

In [None]:
# Cache the downloaded movie list on disk; the row index is not written.
bechdel_films.to_csv(path_or_buf='data/bechdel_films.csv', index=False)

In [None]:
# Read every column as a string; otherwise our precious IMDb IDs would be parsed as floats.
bechdel_films = pd.read_csv(filepath_or_buffer='data/bechdel_films.csv', dtype=str)

First, let's get the tags/keywords to figure out which films are LGBTQ+ related.  
We scrape the keyword page of every film in the Bechdel-test dataset.  
The full list of tags is saved into a dictionary keyed by IMDb ID for later use.  
The results of later scraping steps are also saved into dictionaries keyed by IMDb ID, so we can join them. 

In [None]:
# Maps an IMDb ID to the list of keyword strings scraped for that film.
tag_dict = dict()

In [None]:
# Scrape the IMDb keyword page of every film in the Bechdel dataset and store
# the keyword texts in tag_dict, keyed by IMDb ID.
j = 0  # counts the films whose keyword page was scraped successfully
for i in bechdel_films['imdbid']:
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        res = requests.get("https://www.imdb.com/title/tt" + i + "/keywords?ref_=tt_stry_kw", timeout=30)
        soup = BeautifulSoup(res.text, 'html.parser')
        taglist = soup.find_all('div', class_="sodatext")
        tag_dict[i] = [tag.text.strip('\n') for tag in taglist]
        j = j + 1
    except (requests.exceptions.RequestException, ConnectionError, TypeError) as err:
        # requests raises its own exception classes (RequestException and its
        # subclasses), which the builtin ConnectionError alone does not cover.
        # A TypeError occurs when the ID is not a string (e.g. a missing value).
        # Both the error and the ID are converted with str() so that building
        # the message cannot itself raise and interrupt the loop.
        print(str(err) + ": IMDb ID:" + str(i))

Save our precious dictionary into a file, then load it back

In [None]:
# Persist the keyword dictionary; np.save stores the dict as a pickled object array.
np.save(file='data/tag_dictionary.npy', arr=tag_dict)

In [None]:
# Load the saved keyword dictionary back; .item() unwraps the dict from the array.
# NOTE(review): allow_pickle=True unpickles the file, so only load files you created yourself.
tag_dict = np.load(file='data/tag_dictionary.npy', allow_pickle=True).item()

Let's define our LGBTQ+ keywords.  
To check whether a film is LGBTQ+ related, we iterate through the tags/keywords scraped from IMDb and count how many of them (after stripping spaces, punctuation and special characters) are in the LGBTQ+ keyword list.  
We save the result into a dataframe with the following columns: IMDb ID (for joins), number of LGBTQ+ keywords spotted, number of keywords scraped.  

In [None]:
# Keywords that mark a film as LGBTQ+ related. They are written without spaces
# or punctuation because scraped tags are stripped of non-word characters
# before being compared against this list.
lgbtq_keywords = (
    ['lgbt', 'lgbtq']
    + ['gay', 'gays', 'gaycharacter', 'gaykiss', 'gayinterest']
    + ['lesbian', 'lesbians', 'lesbianinterest']
    + ['queer']
    + ['trans', 'transsexual', 'transwoman', 'transman', 'transgender']
    + ['bisexual']
)

In [None]:
# One row per film: [IMDb ID, number of tags found in lgbtq_keywords, total number of tags].
# Each tag is stripped of all non-word characters before the lookup.
df_rows = [
    [
        imdb_id,
        sum(re.sub(r'\W+', '', tag) in lgbtq_keywords for tag in tags),
        len(tags),
    ]
    for imdb_id, tags in tag_dict.items()
]

In [None]:
# Turn the per-film rows into a DataFrame that can be joined on 'imdbid'.
keyword_count_columns = ['imdbid', 'lgbtq_keywords_num', 'keywords_num']
gay_df = pd.DataFrame(data=df_rows, columns=keyword_count_columns)

In [None]:
# Inner join on the columns the two frames share: only films that appear in
# both the Bechdel list and the keyword counts are kept.
df = bechdel_films.merge(gay_df, how='inner')

In [None]:
# Preview the films that have at least one LGBTQ+ keyword.
df.loc[df['lgbtq_keywords_num'] > 0]

# Store the merged table (without the row index) for the later steps.
df.to_csv(path_or_buf='data/bechdel_films_ifgay.csv', index=False)

In [None]:
# Reload the merged table with every column read as a string.
df = pd.read_csv(filepath_or_buffer='data/bechdel_films_ifgay.csv', dtype=str)

We read everything as strings, but some columns are numeric, so let's convert them. 

In [None]:
# Columns that hold numbers but were loaded as strings.
numeric_cols = ['year', 'rating', 'lgbtq_keywords_num', 'keywords_num']

# Convert them one at a time; a value that cannot be parsed raises an error.
for column_name in numeric_cols:
    df[column_name] = pd.to_numeric(df[column_name], errors='raise')

In [None]:
# Keep only the films that have an IMDb ID, then renumber the rows.
# reset_index() is called without drop=True, so the previous index is kept as a new 'index' column.
has_imdb_id = df['imdbid'].notna()
df = df[has_imdb_id].reset_index()

### Use the OMDb API:

In [None]:
# OMDb API keys, tried in order: the download loop always uses the first key
# and drops it once that key hits its request limit.
# NOTE(review): these credentials are hardcoded in the notebook; consider
# loading them from an environment variable or a secrets file instead.
API_keys = [
    '7ecc3518',
    'efbb5d26',
    '101156d4',
    '78906128',
    'c0bebd87',
]

In [None]:
# Load the API responses saved by earlier runs; .item() unwraps the dict from the array.
# NOTE(review): allow_pickle=True unpickles the file, so only load files you created yourself.
resp_json_dict = np.load(file='data/movie_data_dict.npy', allow_pickle=True).item()

In [None]:
# IMDb IDs that still have to be fetched: those without a saved response.
# Membership is tested directly against the dict (a hash lookup) instead of a
# key list that was rebuilt and scanned for every single ID.
list_of_imdb_ids = [e for e in df['imdbid'].to_list() if e not in resp_json_dict]

In [None]:
# Fetch the OMDb record of every IMDb ID that has no saved response yet and
# store the decoded JSON in resp_json_dict, keyed by IMDb ID.
for imdb_id in list_of_imdb_ids:
    # Once every key has hit its limit nothing more can be fetched, so stop
    # here instead of printing an IndexError for each remaining ID.
    if not API_keys:
        print("All API keys reached their request limit, stopping before IMDb Id: " + str(imdb_id))
        break
    try:
        fetched_json = None
        while API_keys:
            url = 'http://www.omdbapi.com/?i=tt' + imdb_id + '&apikey=' + API_keys[0]
            # A timeout keeps one stalled connection from blocking the whole run.
            resp = requests.get(url, timeout=30)
            resp_json = json.loads(resp.text)

            # if we happen to run into the API limit (1.000 calls per day) and the error is because of the limit,
            # overwrite the API_keys list without its first key and retry with the next key
            if resp_json['Response'] == 'False' and resp_json['Error'] == 'Request limit reached!':
                API_keys = API_keys[1:]
            else:
                fetched_json = resp_json
                break

        # save the json response into our dictionary; a "limit reached" reply
        # is never stored, so that ID can be fetched again on a later run
        if fetched_json is not None:
            resp_json_dict[imdb_id] = fetched_json

    # if we encounter any type of error, just print it, so the code running for hours doesn't get interrupted.
    # we can return to the missing IMDb IDs later if needed
    except Exception as e:
        print("For IMDb Id: " + str(imdb_id))
        print(e)

In [None]:
# Persist the collected API responses; np.save stores the dict as a pickled object array.
np.save(file='data/movie_data_dict.npy', arr=resp_json_dict)