# WOMEN IN MOVIES

In [1]:
import pandas as pd
import requests
import json
import time

In [2]:
from time import sleep
from time import time
from random import randint
from pandas.io.json import json_normalize
from requests import get
from IPython.core.display import clear_output

First, I obtained the data of all approved movies in the Bechdel test website API by running curl http://bechdeltest.com/api/v1/getAllMovies > data.json)
Return value: An array of JSON objects containing all information on all visible (approved) movies in the list. ID, imdbid, name, year and rating are all available.

In [3]:
# Open the JSON file

with open('data.json') as f:
    data = json.load(f)

In [4]:
# Convert it to a DataFrame
allmovies_bechdel = json_normalize(data)
print(allmovies_bechdel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8363 entries, 0 to 8362
Data columns (total 5 columns):
year      8363 non-null int64
title     8363 non-null object
id        8363 non-null int64
rating    8363 non-null int64
imdbid    8363 non-null object
dtypes: int64(3), object(2)
memory usage: 326.8+ KB
None


In [5]:
# Finding missing values

null_values = allmovies_bechdel.isnull().sum()
null_values[null_values > 0]

#There are missing values

Series([], dtype: int64)

#### Next, I am going to filter just the movies released from the year 2000 onwards.

In [6]:
from_2000 = allmovies_bechdel.loc[allmovies_bechdel['year'] >= 2000]
from_2000

# There are a total of 5208 movies released from 2000 owards that have been tested with the Bechdel test

Unnamed: 0,year,title,id,rating,imdbid
3155,2000,Hellraiser: Inferno,5274,1,0229440
3156,2000,Batman Beyond: Return of the Joker,3387,3,0233298
3157,2000,Something Between Us,4172,3,0286137
3158,2000,Boiler Room,625,1,0181984
3159,2000,Gundam Wing: The Movie - Endless Waltz,4835,3,0260191
...,...,...,...,...,...
8358,2019,Fighting with My Family,8679,3,6513120
8359,2020,Honey Boy,8965,0,8151874
8360,2020,"Marijuana Conspiracy , The",8859,3,8461042
8361,2020,Like a Boss,8964,3,7545266


#### Now, I am going to take the imbdid of the 5208 titles and save them in a list to perform the next API request to the same API but to a different endpoint (getMovieByImdbId) http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid= 

In [7]:
imbd_id_list = from_2000['imdbid'].tolist()

In [8]:
# Verifying that the list contains the 5208 imdb movie id's
print(len(imbd_id_list))
print(type(imbd_id_list))

5208
<class 'list'>


## API Request

### Bechdel Test Website

API Request to actually get the information of the Bechdel test for each imdb movie id.

Method: getMovieByImdbId

Params: imdbid (the IMDb ID of the object)

Return value: A JSON object containing information about a movie. Fields returned are:

| Name | Value |
| --- | --- |
| visible | Has this movie been approved (currently only approved movies are returned, so this value will always be 1). |
| date | The date this movie was added to the list |
| submitterid | The ID of the submitter. Since submitter information is currently not available through the API, is of no use |
| rating | The actual score. Number from 0 to 3 (0 means no two women, 1 means no talking, 2 means talking about a man, 3 means it passes the test). |
| dubious | Whether the submitter considered the rating dubious. |
| imdbid | The IMDb id. |
| id | The bechdeltest.com unique id. |
| title | The title of the movie. Any weird characters are HTML encoded (so Brüno is returned as "Br&uuml;no"). |
| year | The year this movie was released (according to IMDb). |

In [9]:
# api-endpoint
URL = 'http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid='

# List to store the API requests
api_responses = []
bad_responses = []

# For every imdb id in the imdb id list
for imdb_id in imbd_id_list:
        
        # Make a get request
        response = requests.get(URL+imdb_id)
        
        if response.status_code != 200:
       
            #add data of imbd id and error to bad_responses list to track afterwards
            bad_responses.append((imdb_id, response.status_code))
            pass
        
        else:
            
            # Parse the content of the request and append to api_responses list
            json_data = json.loads(response.text)
            api_responses.append(json_data)

In [11]:
len(api_responses)

5208

#### Transform responses list to a dataframe

In [12]:
bechdel_dataset = json_normalize(api_responses)
bechdel_dataset

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id
0,1,Hellraiser: Inferno,0229440,2014-03-12 04:04:37,0,2000,1,9649,5274
1,3,Batman Beyond: Return of the Joker,0233298,2012-06-28 02:50:22,0,2000,1,5437,3387
2,3,Something Between Us,0286137,2013-05-28 16:23:55,0,2000,1,7266,4172
3,1,Boiler Room,0181984,2009-12-28 23:38:13,0,2000,1,356,625
4,3,Gundam Wing: The Movie - Endless Waltz,0260191,2013-11-15 09:04:43,0,2000,1,8775,4835
...,...,...,...,...,...,...,...,...,...
5203,3,Fighting with My Family,6513120,2019-03-19 17:12:02,0,2019,1,16983,8679
5204,0,Honey Boy,8151874,2020-01-17 15:08:59,0,2020,1,17564,8965
5205,3,"Marijuana Conspiracy , The",8461042,2019-09-24 19:53:51,0,2020,1,17361,8859
5206,3,Like a Boss,7545266,2020-01-15 06:50:26,0,2020,1,17557,8964


In [13]:
# Saving DataFrame to CSV
bechdel_dataset.to_csv(r'C:\Users\pao-l\Documents\GitHub\Project-Week-3-Data-Thieves\your-project\bechdeldataset.csv', index=False)


In [29]:
def idclean (myid):
    return 'tt'+ str(myid)

bechdel_dataset['imdbid'] = bechdel_dataset['imdbid'].apply(idclean)

In [30]:
bechdel_dataset

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id
0,1,Hellraiser: Inferno,tt0229440,2014-03-12 04:04:37,0,2000,1,9649,5274
1,3,Batman Beyond: Return of the Joker,tt0233298,2012-06-28 02:50:22,0,2000,1,5437,3387
2,3,Something Between Us,tt0286137,2013-05-28 16:23:55,0,2000,1,7266,4172
3,1,Boiler Room,tt0181984,2009-12-28 23:38:13,0,2000,1,356,625
4,3,Gundam Wing: The Movie - Endless Waltz,tt0260191,2013-11-15 09:04:43,0,2000,1,8775,4835
...,...,...,...,...,...,...,...,...,...
5203,3,Fighting with My Family,tt6513120,2019-03-19 17:12:02,0,2019,1,16983,8679
5204,0,Honey Boy,tt8151874,2020-01-17 15:08:59,0,2020,1,17564,8965
5205,3,"Marijuana Conspiracy , The",tt8461042,2019-09-24 19:53:51,0,2020,1,17361,8859
5206,3,Like a Boss,tt7545266,2020-01-15 06:50:26,0,2020,1,17557,8964


In [31]:
bechdel_dataset.to_csv(r'C:\Users\pao-l\Documents\GitHub\Project-Week-3-Data-Thieves\your-project\bechdeldataset_ttid.csv', index=False)


### IMDB API

Movie Database API from RapidAPI. The Movie Database API has two GET endpoints (no POST options):

-Search or ID/Title

I used the ID or Title endpoint:

This endpoint will return more detailed results about a specific title you have in mind (IMDb ID or movie title required).

The optional parameters include:

callback – JSONP callback name
i – IMDb ID
type – Type of result to return: (movie, series, episode)
r – Data type to return (JSON or XML)
plot – Choose short or full summary (short, full)
y – Year of release

RESPONSE: Get Title, Year, Metascore Rating, IMDB rating, Release date, Runtime, Genre, Directors, Writers, Actors, Plot, Awards, Posters, IMDB ID, Type, DVD, Boxoffice, Production company, website and response data.

In [17]:
def idclean (myid):
    return 'tt'+ str(myid)

tt_id_list = from_2000['imdbid'].apply(idclean)
tt_id_list

3155     tt0229440
3156     tt0233298
3157     tt0286137
3158     tt0181984
3159     tt0260191
           ...    
8358     tt6513120
8359     tt8151874
8360     tt8461042
8361     tt7545266
8362    tt10781754
Name: imdbid, Length: 5208, dtype: object

In [20]:
# api-endpoint
url = "https://movie-database-imdb-alternative.p.rapidapi.com/"

# Lists to store the API requests
movie_api_resp = []
movie_bad_resp = []

# For every imdb id in the imdb id list
for imdb_id in tt_id_list:
        
        # Make a get request
        querystring = {"i": imdb_id ,"r":"json"}

        headers = {
            'x-rapidapi-host': "movie-database-imdb-alternative.p.rapidapi.com",
            'x-rapidapi-key': "bc876f6968msh1ed6a4ccae475dcp128dbajsn01763fa4fb88"
            }

        movie_response = requests.request("GET", url, headers=headers, params=querystring)
        
        if movie_response.status_code != 200:
       
            #add data of imbd id and error to bad_responses list to track afterwards
            movie_bad_resp.append((imdb_id, movie_response.status_code))
            pass
        
        else:
            
            # Parse the content of the request and append to api_responses list
            myjson_data = json.loads(movie_response.text)
            movie_api_resp.append(myjson_data)

In [21]:
movie_api_resp

[{'Title': 'Hellraiser: Inferno',
  'Year': '2000',
  'Rated': 'R',
  'Released': '03 Oct 2000',
  'Runtime': '99 min',
  'Genre': 'Crime, Horror, Mystery, Thriller',
  'Director': 'Scott Derrickson',
  'Writer': 'Clive Barker (characters), Paul Harris Boardman, Scott Derrickson',
  'Actors': 'Craig Sheffer, Nicholas Turturro, James Remar, Doug Bradley',
  'Plot': 'A shady police detective becomes embroiled in a strange world of murder, sadism and madness after being assigned a murder investigation against a madman known only as "The Engineer".',
  'Language': 'English',
  'Country': 'USA',
  'Awards': '1 win & 3 nominations.',
  'Poster': 'https://m.media-amazon.com/images/M/MV5BZTJkYzI4ZGQtYTU3Yy00MWU1LWE3NjUtZjI5Zjc2ZGI0Mjc5XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg',
  'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.5/10'}],
  'Metascore': 'N/A',
  'imdbRating': '5.5',
  'imdbVotes': '14,299',
  'imdbID': 'tt0229440',
  'Type': 'movie',
  'DVD': '10 Oct 2000',
  'BoxO

In [22]:
len(movie_api_resp)

5208

#### Transform responses list to a dataframe

In [70]:
imdb_dataset = json_normalize(movie_api_resp)
imdb_dataset 

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
0,Hellraiser: Inferno,2000,R,03 Oct 2000,99 min,"Crime, Horror, Mystery, Thriller",Scott Derrickson,"Clive Barker (characters), Paul Harris Boardma...","Craig Sheffer, Nicholas Turturro, James Remar,...",A shady police detective becomes embroiled in ...,...,10 Oct 2000,,Miramax,,True,,,,,
1,Batman Beyond: Return of the Joker,2000,PG-13,12 Dec 2000,76 min,"Animation, Action, Crime, Sci-Fi, Thriller",Curt Geda,"Bob Kane (character created by: Batman), Paul ...","Will Friedle, Kevin Conroy, Mark Hamill, Angie...","The Joker is back with a vengeance, and Gotham...",...,12 Dec 2000,,Warner Home Video,,True,,,,,
2,Something Between Us,2000,,16 Dec 2000,79 min,"Comedy, Romance",George Thomas Jr.,George Thomas Jr.,"Louisa Elder, George Thomas Jr., Meg Kannin, S...",,...,,,,,True,,,,,
3,Boiler Room,2000,R,18 Feb 2000,120 min,"Crime, Drama, Thriller",Ben Younger,Ben Younger,"Giovanni Ribisi, Vin Diesel, Nia Long, Nicky Katt","A college dropout, attempting to live up to hi...",...,11 Jul 2000,,New Line Home Entertainment,,True,,,,,
4,Gundam Wing: The Movie - Endless Waltz,1998,,10 Nov 2000,180 min,"Animation, Action, Drama, Sci-Fi",Yasunao Aoki,"Rika Takahashi (translation), Yoshiyuki Tomino...","Mark Hildreth, Scott McNeil, Kirby Morrow, Bra...","After Colony 196, Treize Kushrenada is dead an...",...,06 Feb 2001,,,,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5203,Fighting with My Family,2019,PG-13,22 Feb 2019,108 min,"Biography, Comedy, Drama, Sport",Stephen Merchant,Stephen Merchant,"Dwayne Johnson, Thomas Whilley, Tori Ellen Ros...",A former wrestler and his family make a living...,...,14 May 2019,,MGM,,True,,,,,
5204,Honey Boy,2019,R,27 Nov 2019,94 min,Drama,Alma Har'el,Shia LaBeouf,"Shia LaBeouf, Lucas Hedges, Noah Jupe, Byron B...",A young actor's stormy childhood and early adu...,...,07 Feb 2020,,Amazon Studios,,True,,,,,
5205,The Marijuana Conspiracy,2020,,18 Jan 2020,124 min,,Craig Pryce,Craig Pryce,"Tymika Tafari, Julia Sarah Stone, Morgan Kohan...","In 1972, young women looking for a fresh start...",...,,,,,True,,,,,
5206,Like a Boss,2020,R,10 Jan 2020,83 min,Comedy,Miguel Arteta,"Sam Pitman (screenplay by), Adam Cole-Kelly (s...","Rose Byrne, Salma Hayek, Lisa Kudrow, Jennifer...",Two friends with very different ideals start a...,...,,,Paramount Pictures,,True,,,,,


In [24]:
# Saving DataFrame to CSV
imdb_dataset.to_csv(r'C:\Users\pao-l\Documents\GitHub\Project-Week-3-Data-Thieves\your-project\imdbdataset.csv', index=False)


In [71]:
imdb_dataset.columns

Index(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'totalSeasons',
       'Season', 'Episode', 'seriesID', 'Error'],
      dtype='object')

In [72]:
imdb_dataset['Error'].value_counts()

Incorrect IMDb ID.     5
Error getting data.    3
Name: Error, dtype: int64

In [73]:
check= imdb_dataset.loc[imdb_dataset['Error'] == 'Incorrect IMDb ID.']
check.index

Int64Index([1610, 3170, 3444, 3804, 4076], dtype='int64')

In [74]:
check2= imdb_dataset.loc[imdb_dataset['Error'] == 'Error getting data.'] 
check2.index

Int64Index([4239, 4525, 4654], dtype='int64')

In [75]:
imdb_dataset.drop(index= [1610, 3170, 3444, 3804, 4076, 4239, 4525,4654], inplace= True)

In [77]:
imdb_dataset

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
0,Hellraiser: Inferno,2000,R,03 Oct 2000,99 min,"Crime, Horror, Mystery, Thriller",Scott Derrickson,"Clive Barker (characters), Paul Harris Boardma...","Craig Sheffer, Nicholas Turturro, James Remar,...",A shady police detective becomes embroiled in ...,...,10 Oct 2000,,Miramax,,True,,,,,
1,Batman Beyond: Return of the Joker,2000,PG-13,12 Dec 2000,76 min,"Animation, Action, Crime, Sci-Fi, Thriller",Curt Geda,"Bob Kane (character created by: Batman), Paul ...","Will Friedle, Kevin Conroy, Mark Hamill, Angie...","The Joker is back with a vengeance, and Gotham...",...,12 Dec 2000,,Warner Home Video,,True,,,,,
2,Something Between Us,2000,,16 Dec 2000,79 min,"Comedy, Romance",George Thomas Jr.,George Thomas Jr.,"Louisa Elder, George Thomas Jr., Meg Kannin, S...",,...,,,,,True,,,,,
3,Boiler Room,2000,R,18 Feb 2000,120 min,"Crime, Drama, Thriller",Ben Younger,Ben Younger,"Giovanni Ribisi, Vin Diesel, Nia Long, Nicky Katt","A college dropout, attempting to live up to hi...",...,11 Jul 2000,,New Line Home Entertainment,,True,,,,,
4,Gundam Wing: The Movie - Endless Waltz,1998,,10 Nov 2000,180 min,"Animation, Action, Drama, Sci-Fi",Yasunao Aoki,"Rika Takahashi (translation), Yoshiyuki Tomino...","Mark Hildreth, Scott McNeil, Kirby Morrow, Bra...","After Colony 196, Treize Kushrenada is dead an...",...,06 Feb 2001,,,,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5203,Fighting with My Family,2019,PG-13,22 Feb 2019,108 min,"Biography, Comedy, Drama, Sport",Stephen Merchant,Stephen Merchant,"Dwayne Johnson, Thomas Whilley, Tori Ellen Ros...",A former wrestler and his family make a living...,...,14 May 2019,,MGM,,True,,,,,
5204,Honey Boy,2019,R,27 Nov 2019,94 min,Drama,Alma Har'el,Shia LaBeouf,"Shia LaBeouf, Lucas Hedges, Noah Jupe, Byron B...",A young actor's stormy childhood and early adu...,...,07 Feb 2020,,Amazon Studios,,True,,,,,
5205,The Marijuana Conspiracy,2020,,18 Jan 2020,124 min,,Craig Pryce,Craig Pryce,"Tymika Tafari, Julia Sarah Stone, Morgan Kohan...","In 1972, young women looking for a fresh start...",...,,,,,True,,,,,
5206,Like a Boss,2020,R,10 Jan 2020,83 min,Comedy,Miguel Arteta,"Sam Pitman (screenplay by), Adam Cole-Kelly (s...","Rose Byrne, Salma Hayek, Lisa Kudrow, Jennifer...",Two friends with very different ideals start a...,...,,,Paramount Pictures,,True,,,,,


In [76]:
imdb_dataset.describe()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
count,5200,5200,5200,5200.0,5200,5200,5200.0,5200.0,5200.0,5200.0,...,5121.0,5121.0,5121.0,5121.0,5200,58,21,21,21,0.0
unique,5143,55,17,2362.0,184,1001,3112.0,4740.0,5145.0,5167.0,...,1302.0,2539.0,1083.0,20.0,1,11,5,14,5,0.0
top,The Circle,2013,R,,90 min,Drama,,,,,...,,,,,True,1,2,2,tt0448190,
freq,3,391,1814,45.0,205,416,62.0,79.0,9.0,32.0,...,878.0,2478.0,641.0,5102.0,5200,25,13,3,15,


## Merging the Datasets

In [None]:
#df_test = df_ttdid.merge(imdb_datadf, how='left', left_on='imdbid', right_on='imdbID')

In [147]:
merge_df= bechdel_dataset.merge(imdb_dataset, left_on='imdbid', right_on='imdbID', how='outer')
merge_df.reset_index(drop=True)

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
0,1.0,Hellraiser: Inferno,tt0229440,2014-03-12 04:04:37,0,2000.0,1,9649.0,5274.0,Hellraiser: Inferno,...,10 Oct 2000,,Miramax,,True,,,,,
1,3.0,Batman Beyond: Return of the Joker,tt0233298,2012-06-28 02:50:22,0,2000.0,1,5437.0,3387.0,Batman Beyond: Return of the Joker,...,12 Dec 2000,,Warner Home Video,,True,,,,,
2,3.0,Something Between Us,tt0286137,2013-05-28 16:23:55,0,2000.0,1,7266.0,4172.0,Something Between Us,...,,,,,True,,,,,
3,1.0,Boiler Room,tt0181984,2009-12-28 23:38:13,0,2000.0,1,356.0,625.0,Boiler Room,...,11 Jul 2000,,New Line Home Entertainment,,True,,,,,
4,3.0,Gundam Wing: The Movie - Endless Waltz,tt0260191,2013-11-15 09:04:43,0,2000.0,1,8775.0,4835.0,Gundam Wing: The Movie - Endless Waltz,...,06 Feb 2001,,,,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5210,3.0,Like a Boss,tt7545266,2020-01-15 06:50:26,0,2020.0,1,17557.0,8964.0,Like a Boss,...,,,Paramount Pictures,,True,,,,,
5211,3.0,"American Terrorist: April 20, 1999",tt10781754,2019-08-10 22:19:59,1,2021.0,1,17287.0,8811.0,AmeriCaN_TeRRoRiST,...,,,,,True,,,,,
5212,,,,,,,,,,Pokémon the Movie 2000,...,14 Nov 2000,"$2,119,065",,,True,,,,,
5213,,,,,,,,,,Rush Hour 3,...,18 Dec 2007,"$140,080,850",New Line Cinema,,True,,,,,


In [148]:
check_duplicates = merge_df['imdbid'].duplicated()

In [149]:
merge_df[check_duplicates]

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
2549,3.0,Last Call at the Oasis,tt2043900,2013-11-29 21:16:54,0.0,2011.0,1.0,8867.0,4889.0,Last Call at the Oasis,...,13 Nov 2012,"$40,846",IDP/ATO,,True,,,,,
2550,3.0,Last Call at the Oasis,tt2043900,2013-11-29 21:16:54,0.0,2011.0,1.0,8867.0,4889.0,Last Call at the Oasis,...,13 Nov 2012,"$40,846",IDP/ATO,,True,,,,,
2551,3.0,Last Call at the Oasis,tt2043900,2013-11-29 21:16:54,0.0,2011.0,1.0,8867.0,4889.0,Last Call at the Oasis,...,13 Nov 2012,"$40,846",IDP/ATO,,True,,,,,
3252,3.0,Puella Magi Madoka Magica the Movie Part III: ...,tt2457282,2013-12-24 20:34:24,0.0,2013.0,1.0,9023.0,4980.0,Puella Magi Madoka Magica the Movie Part III: ...,...,,,Warner Brothers,,True,,,,,
3253,3.0,Puella Magi Madoka Magica the Movie Part III: ...,tt2457282,2013-12-24 20:34:24,0.0,2013.0,1.0,9023.0,4980.0,Puella Magi Madoka Magica the Movie Part III: ...,...,,,Warner Brothers,,True,,,,,
3254,3.0,Puella Magi Madoka Magica the Movie Part III: ...,tt2457282,2013-12-24 20:34:24,0.0,2013.0,1.0,9023.0,4980.0,Puella Magi Madoka Magica the Movie Part III: ...,...,,,Warner Brothers,,True,,,,,
5213,,,,,,,,,,Rush Hour 3,...,18 Dec 2007,"$140,080,850",New Line Cinema,,True,,,,,
5214,,,,,,,,,,Force Majeure,...,10 Feb 2015,,Magnolia Pictures,,True,,,,,


In [150]:
merge_df.drop(index= [2549,2550,3253,3254,5212,5213,5214], inplace= True)

In [151]:
merge_df.reset_index(drop=True)

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
0,1.0,Hellraiser: Inferno,tt0229440,2014-03-12 04:04:37,0,2000.0,1,9649.0,5274.0,Hellraiser: Inferno,...,10 Oct 2000,,Miramax,,True,,,,,
1,3.0,Batman Beyond: Return of the Joker,tt0233298,2012-06-28 02:50:22,0,2000.0,1,5437.0,3387.0,Batman Beyond: Return of the Joker,...,12 Dec 2000,,Warner Home Video,,True,,,,,
2,3.0,Something Between Us,tt0286137,2013-05-28 16:23:55,0,2000.0,1,7266.0,4172.0,Something Between Us,...,,,,,True,,,,,
3,1.0,Boiler Room,tt0181984,2009-12-28 23:38:13,0,2000.0,1,356.0,625.0,Boiler Room,...,11 Jul 2000,,New Line Home Entertainment,,True,,,,,
4,3.0,Gundam Wing: The Movie - Endless Waltz,tt0260191,2013-11-15 09:04:43,0,2000.0,1,8775.0,4835.0,Gundam Wing: The Movie - Endless Waltz,...,06 Feb 2001,,,,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5203,3.0,Fighting with My Family,tt6513120,2019-03-19 17:12:02,0,2019.0,1,16983.0,8679.0,Fighting with My Family,...,14 May 2019,,MGM,,True,,,,,
5204,0.0,Honey Boy,tt8151874,2020-01-17 15:08:59,0,2020.0,1,17564.0,8965.0,Honey Boy,...,07 Feb 2020,,Amazon Studios,,True,,,,,
5205,3.0,"Marijuana Conspiracy , The",tt8461042,2019-09-24 19:53:51,0,2020.0,1,17361.0,8859.0,The Marijuana Conspiracy,...,,,,,True,,,,,
5206,3.0,Like a Boss,tt7545266,2020-01-15 06:50:26,0,2020.0,1,17557.0,8964.0,Like a Boss,...,,,Paramount Pictures,,True,,,,,


In [152]:
#Finding if there is any null values id the imdb id column
merge_df[merge_df.imdbid.isnull()]

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error


In [157]:
clean_df = merge_df.reset_index(drop=True)

### Dropping Irrelevant Columns

In [162]:
clean_df.columns

Index(['rating', 'title', 'imdbid', 'date', 'dubious', 'year', 'visible',
       'submitterid', 'id', 'Title', 'Year', 'Rated', 'Released', 'Runtime',
       'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country',
       'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes',
       'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production', 'Website',
       'Response', 'totalSeasons', 'Season', 'Episode', 'seriesID', 'Error'],
      dtype='object')

In [177]:
clean_df['seriesID'].value_counts()

tt0448190    15
N/A           2
tt2085059     2
tt0159876     1
tt2341669     1
Name: seriesID, dtype: int64

In [186]:
# Get names of indexes for which column Age has value 30
indexNames = clean_df[clean_df['seriesID'] == 'tt0448190'].index
clean_df.drop(indexNames , inplace=True)

In [188]:
indexNames2 = clean_df[clean_df['seriesID'] == 'tt2085059'].index
clean_df.drop(indexNames2 , inplace=True)

In [191]:
indexNames3 = clean_df[clean_df['seriesID'] == 'tt0159876'].index
clean_df.drop(indexNames3 , inplace=True)

In [194]:
indexNames3 = clean_df[clean_df['seriesID'] == 'tt2341669'].index
clean_df.drop(indexNames3 , inplace=True)

In [195]:
clean_df.loc[clean_df['seriesID'] == 'tt2341669']

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error


In [196]:
clean_df

Unnamed: 0,rating,title,imdbid,date,dubious,year,visible,submitterid,id,Title,...,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID,Error
0,1.0,Hellraiser: Inferno,tt0229440,2014-03-12 04:04:37,0,2000.0,1,9649.0,5274.0,Hellraiser: Inferno,...,10 Oct 2000,,Miramax,,True,,,,,
1,3.0,Batman Beyond: Return of the Joker,tt0233298,2012-06-28 02:50:22,0,2000.0,1,5437.0,3387.0,Batman Beyond: Return of the Joker,...,12 Dec 2000,,Warner Home Video,,True,,,,,
2,3.0,Something Between Us,tt0286137,2013-05-28 16:23:55,0,2000.0,1,7266.0,4172.0,Something Between Us,...,,,,,True,,,,,
3,1.0,Boiler Room,tt0181984,2009-12-28 23:38:13,0,2000.0,1,356.0,625.0,Boiler Room,...,11 Jul 2000,,New Line Home Entertainment,,True,,,,,
4,3.0,Gundam Wing: The Movie - Endless Waltz,tt0260191,2013-11-15 09:04:43,0,2000.0,1,8775.0,4835.0,Gundam Wing: The Movie - Endless Waltz,...,06 Feb 2001,,,,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5203,3.0,Fighting with My Family,tt6513120,2019-03-19 17:12:02,0,2019.0,1,16983.0,8679.0,Fighting with My Family,...,14 May 2019,,MGM,,True,,,,,
5204,0.0,Honey Boy,tt8151874,2020-01-17 15:08:59,0,2020.0,1,17564.0,8965.0,Honey Boy,...,07 Feb 2020,,Amazon Studios,,True,,,,,
5205,3.0,"Marijuana Conspiracy , The",tt8461042,2019-09-24 19:53:51,0,2020.0,1,17361.0,8859.0,The Marijuana Conspiracy,...,,,,,True,,,,,
5206,3.0,Like a Boss,tt7545266,2020-01-15 06:50:26,0,2020.0,1,17557.0,8964.0,Like a Boss,...,,,Paramount Pictures,,True,,,,,


In [199]:
clean_df['Type'].value_counts()

movie      5115
series       58
game          3
episode       2
Name: Type, dtype: int64

In [200]:
movies = clean_df[clean_df['Type'] == 'movie']

In [207]:
#Columns to Drop

movies = movies.drop(columns=['Error','Poster', 'submitterid','visible', 'dubious', 'Website','totalSeasons','Season','Episode', 'seriesID', 'Type'], axis=1)

In [222]:
#Droping one of the title columns as tere are 2
movies = movies.drop(columns=['title'], axis=1)

In [227]:
#Droping the date in which the movie was added to the Bechdel List as I won't need it
movies = movies.drop(columns=['date'], axis=1)

In [230]:
movies = movies.drop(columns=['DVD'], axis=1)

In [234]:
movies = movies.drop(columns=['year'], axis=1)

In [236]:
movies = movies.drop(columns=['Rated'], axis=1)

In [240]:
movies = movies.drop(columns=['imdbID'], axis=1)

In [246]:
movies = movies.drop(columns=['Response'], axis=1)

##### Rename Columns

In [223]:
movies.rename(columns = {'rating':'BechdelRating', 'id':'BechdelId'}, inplace = True)

In [239]:
movies['Ratings'][3]

[{'Source': 'Internet Movie Database', 'Value': '7.0/10'},
 {'Source': 'Rotten Tomatoes', 'Value': '66%'},
 {'Source': 'Metacritic', 'Value': '63/100'}]

In [242]:
movies.columns


Index(['BechdelRating', 'imdbid', 'BechdelId', 'Title', 'Year', 'Released',
       'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language',
       'Country', 'Awards', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes',
       'BoxOffice', 'Production', 'Response'],
      dtype='object')

In [248]:
movies = movies.reset_index(drop=True)

In [249]:
movies

Unnamed: 0,BechdelRating,imdbid,BechdelId,Title,Year,Released,Runtime,Genre,Director,Writer,...,Plot,Language,Country,Awards,Ratings,Metascore,imdbRating,imdbVotes,BoxOffice,Production
0,1.0,tt0229440,5274.0,Hellraiser: Inferno,2000,03 Oct 2000,99 min,"Crime, Horror, Mystery, Thriller",Scott Derrickson,"Clive Barker (characters), Paul Harris Boardma...",...,A shady police detective becomes embroiled in ...,English,USA,1 win & 3 nominations.,"[{'Source': 'Internet Movie Database', 'Value'...",,5.5,14299,,Miramax
1,3.0,tt0233298,3387.0,Batman Beyond: Return of the Joker,2000,12 Dec 2000,76 min,"Animation, Action, Crime, Sci-Fi, Thriller",Curt Geda,"Bob Kane (character created by: Batman), Paul ...",...,"The Joker is back with a vengeance, and Gotham...",English,USA,3 wins & 5 nominations.,"[{'Source': 'Internet Movie Database', 'Value'...",,7.8,22149,,Warner Home Video
2,3.0,tt0286137,4172.0,Something Between Us,2000,16 Dec 2000,79 min,"Comedy, Romance",George Thomas Jr.,George Thomas Jr.,...,,English,USA,1 win.,"[{'Source': 'Internet Movie Database', 'Value'...",,6.7,13,,
3,1.0,tt0181984,625.0,Boiler Room,2000,18 Feb 2000,120 min,"Crime, Drama, Thriller",Ben Younger,Ben Younger,...,"A college dropout, attempting to live up to hi...","English, Russian",USA,1 win & 9 nominations.,"[{'Source': 'Internet Movie Database', 'Value'...",63,7.0,47574,,New Line Home Entertainment
4,3.0,tt0260191,4835.0,Gundam Wing: The Movie - Endless Waltz,1998,10 Nov 2000,180 min,"Animation, Action, Drama, Sci-Fi",Yasunao Aoki,"Rika Takahashi (translation), Yoshiyuki Tomino...",...,"After Colony 196, Treize Kushrenada is dead an...",English,Japan,,"[{'Source': 'Internet Movie Database', 'Value'...",,7.9,2165,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5110,3.0,tt6513120,8679.0,Fighting with My Family,2019,22 Feb 2019,108 min,"Biography, Comedy, Drama, Sport",Stephen Merchant,Stephen Merchant,...,A former wrestler and his family make a living...,English,"UK, USA, Mexico",10 wins & 2 nominations.,"[{'Source': 'Internet Movie Database', 'Value'...",68,7.1,54112,,MGM
5111,0.0,tt8151874,8965.0,Honey Boy,2019,27 Nov 2019,94 min,Drama,Alma Har'el,Shia LaBeouf,...,A young actor's stormy childhood and early adu...,"English, Spanish",USA,9 wins & 33 nominations.,"[{'Source': 'Internet Movie Database', 'Value'...",73,7.4,11841,,Amazon Studios
5112,3.0,tt8461042,8859.0,The Marijuana Conspiracy,2020,18 Jan 2020,124 min,,Craig Pryce,Craig Pryce,...,"In 1972, young women looking for a fresh start...",English,Canada,,[],,,,,
5113,3.0,tt7545266,8964.0,Like a Boss,2020,10 Jan 2020,83 min,Comedy,Miguel Arteta,"Sam Pitman (screenplay by), Adam Cole-Kelly (s...",...,Two friends with very different ideals start a...,English,USA,,"[{'Source': 'Internet Movie Database', 'Value'...",33,4.3,2633,,Paramount Pictures


### Finally, saving the DataFrame to a CSV

In [250]:
movies.to_csv(r'C:\Users\pao-l\Documents\GitHub\Project-Week-3-Data-Thieves\your-project\Movies.csv', index=False)