# Requirements
 - Input the data
 - Parse out the data in the Download Data field so that we have one field containing the Movie title and one field containing information about whether or not the movie passes the Bechdel Test
 - Before we deal with the majority of the html codes, I would recommend replacing '&amp;' instances with '&' because of this film on the website incorrectly converting the html code 
 - Extract the html codes from the Movie titles
 - These will always start with a '&' and end with a ';'
 - The maximum number of html codes in a Movie title is 5
 - Replace the html codes with their correct characters
 - Ensure that codes which match up to spaces have a space in their character cell rather than a null value
 - Parse out the information for whether a film passes or fails the Bechdel test as well as the detailed reasoning behind this
 - Rank the Bechdel Test Categorisations from 1 to 5, 1 being the best result, 5 being the worst result
 - Where a film has multiple categorisations, keep only the worst ranking, even if this means the movie moves from pass to fail
 - Output the data


In [1]:
import os
import pandas as pd

### Input the data

In [2]:
# sheet_name=None loads every worksheet; the result is a dict keyed by sheet name.
dfs = pd.read_excel(io='PD Bechdel Test.xlsx', sheet_name=None)


In [3]:
# Show which worksheets were loaded (the keys of the dict returned by read_excel).
dfs.keys()

dict_keys(['Webscraping', 'html'])

In [4]:
# Pull the two worksheets out of the dict: the scraped rows and the html-code lookup.
df_web, df_html = dfs['Webscraping'], dfs['html']

In [5]:
# Preview the 'Webscraping' sheet.
df_web

Unnamed: 0,DownloadData,Year
0,"<a href=""http://us.imdb.com/title/tt3155794/"">...",1874
1,"<a href=""http://us.imdb.com/title/tt14495706/""...",1877
2,"<a href=""http://us.imdb.com/title/tt12592084/""...",1878
3,"<a href=""http://us.imdb.com/title/tt2221420/"">...",1878
4,"<a href=""http://us.imdb.com/title/tt7816420/"">...",1881
...,...,...
9412,"<a href=""http://us.imdb.com/title/tt3581652/"">...",2021
9413,"<a href=""http://us.imdb.com/title/tt8114980/"">...",2021
9414,"<a href=""http://us.imdb.com/title/tt0499097/"">...",2021
9415,"<a href=""http://us.imdb.com/title/tt12361974/""...",2021


In [6]:
# Preview the 'html' lookup sheet.
df_html

Unnamed: 0,Char,Numeric,Named,Description
0,,code,code,
1,,&#32;,,space
2,!,&#33;,,exclamation mark
3,"""",&#34;,&quot;,double quote
4,#,&#35;,,number
...,...,...,...,...
168,–,&#150;,,en dash
169,',&#39;,,apostrophe
170,ı,&#305;,&Jcirc;,latin small letter dotless i
171,ź,&#378;,&zacute;,LATIN SMALL LETTER Z WITH ACUTE


### Parse out the data in the Download Data field so that we have one field containing the Movie title and one field containing information about whether or not the movie passes the Bechdel Test

### Before we deal with the majority of the html codes, I would recommend replacing '&amp;' instances with '&' because of this film on the website incorrectly converting the html code 

In [7]:
# Look up the lookup-table row whose character is the ampersand.
df_html.loc[df_html['Char'].eq('&')]

Unnamed: 0,Char,Numeric,Named,Description
7,&,&#38;,&amp;,ampersand


In [8]:
# Collapse the double-encoded ampersand before decoding the other html codes.
# The trailing ';' must be part of the pattern: replacing only '&amp' would turn
# '&amp;#39;' into '&;#39;' and leave a stray semicolon in the title.
# regex=False makes this a literal substring replacement on every pandas version.
df_web['DownloadData'] = df_web['DownloadData'].str.replace('&amp;', '&', regex=False)

 - Extract the html codes from the Movie titles
 - These will always start with a '&' and end with a ';'

In [9]:
# Splitting on ' href=' gives one piece per anchor tag; pieces 1 and 2 are kept
# as the Bechdel-test link and the movie link respectively.
df_web[['test', 'movie']] = (
    df_web['DownloadData']
    .str.split(' href=', expand=True)
    .iloc[:, 1:3]
)
df_web

Unnamed: 0,DownloadData,Year,test,movie
0,"<a href=""http://us.imdb.com/title/tt3155794/"">...",1874,"""http://us.imdb.com/title/tt3155794/""><img src...","""/view/9602/passage_de_venus/"">Passage de Venu..."
1,"<a href=""http://us.imdb.com/title/tt14495706/""...",1877,"""http://us.imdb.com/title/tt14495706/""><img sr...","""/view/9804/la_rosace_magique/"">La Rosace Magi..."
2,"<a href=""http://us.imdb.com/title/tt12592084/""...",1878,"""http://us.imdb.com/title/tt12592084/""><img sr...","""/view/9806/le_singe_musicien/"">Le singe music..."
3,"<a href=""http://us.imdb.com/title/tt2221420/"">...",1878,"""http://us.imdb.com/title/tt2221420/""><img src...","""/view/9603/sallie_gardner_at_a_gallop/"">Salli..."
4,"<a href=""http://us.imdb.com/title/tt7816420/"">...",1881,"""http://us.imdb.com/title/tt7816420/""><img src...","""/view/9816/athlete_swinging_a_pick/"">Athlete ..."
...,...,...,...,...
9412,"<a href=""http://us.imdb.com/title/tt3581652/"">...",2021,"""http://us.imdb.com/title/tt3581652/""><img src...","""/view/10157/west_side_story/"">West Side Story..."
9413,"<a href=""http://us.imdb.com/title/tt8114980/"">...",2021,"""http://us.imdb.com/title/tt8114980/""><img src...","""/view/9604/willy&#39;s_wonderland/"">Willy&#39..."
9414,"<a href=""http://us.imdb.com/title/tt0499097/"">...",2021,"""http://us.imdb.com/title/tt0499097/""><img src...","""/view/9830/without_remorse/"">Without Remorse<..."
9415,"<a href=""http://us.imdb.com/title/tt12361974/""...",2021,"""http://us.imdb.com/title/tt12361974/""><img sr...","""/view/9575/zack_snyder&#39;s_justice_league/""..."


In [23]:
# Get test text
def strip_test(i):
    """Return the text found inside the ``title="[...]"`` attribute of ``i``.

    The result is the text between the first ``title="[`` marker and the
    following ``]">`` marker, with surrounding whitespace removed.
    Raises IndexError when the opening marker is absent.
    """
    after_opening = i.split('title="[')[1]
    inside_brackets = after_opening.split(']">')[0]
    return inside_brackets.strip()

In [24]:
# Get Movie text
def strip_movie(i):
    """Return the anchor text of the movie link contained in ``i``.

    The result is the text between the first ``/">`` marker and the following
    ``</a>`` marker, with surrounding whitespace removed.
    Raises IndexError when the opening marker is absent.
    """
    after_link = i.split('/">')[1]
    anchor_text = after_link.split('</a>')[0]
    return anchor_text.strip()

In [25]:
# Clean HTML text
from bs4 import BeautifulSoup

def clean_html(i):
    """Return the plain text of the HTML fragment ``i``.

    The fragment is parsed with BeautifulSoup and its ``.text`` is returned,
    so html codes such as ``&#39;`` come back as their characters.

    The parser is named explicitly: without it BeautifulSoup warns and picks
    whichever parser is installed, so results could differ between machines.
    """
    return BeautifulSoup(i, 'html.parser').text

In [26]:
# Get Ranking
def strip_ranking(i):
    """Return the value found inside the ``alt="[[...]]"`` attribute of ``i``.

    The result is the raw text between the first ``alt="[[`` marker and the
    following ``]]"`` marker; it is returned as a string and not stripped.
    Raises IndexError when the opening marker is absent.
    """
    after_opening = i.split('alt="[[')[1]
    ranking_text = after_opening.split(']]"')[0]
    return ranking_text

In [27]:
# Must run before the 'test' column is overwritten with its title text below:
# the ranking is read from the alt="[[n]]" attribute of the raw link html.
df_web['ranking'] = df_web['test'].map(strip_ranking).astype(int)

In [29]:
# Replace the raw link html with the text of its title="[...]" attribute.
df_web['test'] = df_web['test'].map(strip_test)

In [30]:
# Replace the raw link html with the anchor text of the movie link.
df_web['movie'] = df_web['movie'].map(strip_movie)

In [31]:
# Clean HTML text
# Decode the html codes left in both extracted text columns.
df_web['movie'] = df_web['movie'].map(clean_html)
df_web['test'] = df_web['test'].map(clean_html)

In [None]:
# Scratch export for eyeballing the intermediate frame in Excel.
# The original line began with a bare '.to_excel(...)', which is a syntax error;
# df_web is assumed to be the intended frame. TODO confirm.
df_web.to_excel('test.xlsx')
# os.startfile only exists on Windows; skip the auto-open elsewhere.
if hasattr(os, 'startfile'):
    os.startfile('test.xlsx')

In [34]:
# The raw html column is no longer needed once the fields have been extracted.
# errors='ignore' keeps the cell re-runnable: a second run is a no-op instead of
# raising KeyError on the already-removed column.
df_web = df_web.drop(columns='DownloadData', errors='ignore')

In [38]:
# Write the final columns to the output workbook.
df_web[['movie', 'test', 'ranking', 'Year']].to_excel('Bechdal.xlsx')
# os.startfile only exists on Windows; guard it so the export itself still
# succeeds on other platforms instead of ending in an AttributeError.
if hasattr(os, 'startfile'):
    os.startfile('Bechdal.xlsx')