In [2]:
# Dependencies
from bs4 import BeautifulSoup
import requests, os
import pandas as pd
from pymongo import MongoClient

In [3]:
# Connect to Mongo just in case, for later.
# The password comes from the MONGO_PASSWORD environment variable so the
# credential never lives in the notebook. It is handed to MongoClient as a
# keyword argument instead of being embedded in the URI, so it does not need
# to be percent-escaped.
mongo_password = os.environ.get('MONGO_PASSWORD')
if not mongo_password:
    raise RuntimeError('Set the MONGO_PASSWORD environment variable before connecting to MongoDB')

client = MongoClient(
    'mongodb+srv://ngranback.bmasa.mongodb.net/myFirstDatabase?retryWrites=true&w=majority',
    username='mongo',
    password=mongo_password,
)
ETL_db = client['ETL_db']
nella_ETL_collection = ETL_db['nella_ETL']

SCRAPING EPISODE RATING INFO FROM IMDB

In [18]:
# The goal is to create an episodic ratings table with one entry per episode:
# episode ID: string (ex. S01E01)
# rating: float
# title: string
# Season and episode numbers are only used to build the episode ID.

# Initialize lists to hold scraped data, outside of loop.
# The three lists are parallel: index i of each describes the same episode.
ratinglist, titlelist, IDlist = [], [], []

# List of seasons to loop through (1 through 9)
seasons = list(range(1, 10))


# Loop through each of the seasons
for season in seasons:

    # loop status
    print(f'Starting to scrape season {season}')

    url = 'https://www.imdb.com/title/tt0098904/episodes?season=' + str(season)

    # Retrieve page and create iterable list of all episode info elements in that season.
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops us from silently parsing an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('div', class_="info", itemprop="episodes")

    # Loop through results from current season
    for result in results:

        # Parse every field BEFORE appending anything, so a failure part-way
        # through can never leave the three lists with different lengths.
        try:
            # Identify episode as integer
            episode = int(result.find('meta', itemprop='episodeNumber')['content'])
            # Identify rating as float
            rating = float(result.find('span', class_='ipl-rating-star__rating').text)
            # Identify episode title as string
            title = result.find('a', itemprop='name').text
        except (AttributeError, TypeError, KeyError, ValueError) as e:
            # AttributeError: an element was not found and .text was read from None
            # TypeError:      the <meta> element was not found and None was subscripted
            # KeyError:       the <meta> element has no 'content' attribute
            # ValueError:     the episode number or rating text is not numeric
            print(f'Skipping an episode in season {season}: {e}')
            continue

        # Create episode ID (zero-padded, ex. S01E01) and record the episode
        ratinglist.append(rating)
        titlelist.append(title)
        IDlist.append(f'S{season:02d}E{episode:02d}')


Starting to scrape season 1
Starting to scrape season 2
Starting to scrape season 3
Starting to scrape season 4
Starting to scrape season 5
Starting to scrape season 6
Starting to scrape season 7
Starting to scrape season 8
Starting to scrape season 9


In [19]:
# Combine the three scraped lists into the ratings table, one row per episode.
# Dict order fixes the column order: SEID, Rating, Title.
ratings_columns = {'SEID': IDlist, 'Rating': ratinglist, 'Title': titlelist}
ratings_df = pd.DataFrame(data=ratings_columns)

# Quick sanity check: (rows, columns) followed by the first five rows
print(ratings_df.shape)
ratings_df.head()

(173, 3)


Unnamed: 0,SEID,Rating,Title
0,S01E01,7.6,"Good News, Bad News"
1,S01E02,7.6,The Stakeout
2,S01E03,7.6,The Robbery
3,S01E04,7.3,Male Unbonding
4,S01E05,7.5,The Stock Tip


IMPORTING EPISODE DATA AND SCRIPT CSVs

In [28]:
# Load the episode info CSV and discard the columns we do not need:
#   Season, EpisodeNo - already covered in other tables
#   SEID              - this column is INACCURATE in the csv!!!
#   Unnamed: 0        - leftover index column
info_df = pd.read_csv(
    r'C:\Users\nella\BOOTCAMP\HOMEWORK\ETL-mini-project\Seinfeld\episode_info.csv'
).drop(columns=['Season', 'EpisodeNo', 'SEID', 'Unnamed: 0'])

print(info_df.shape)
info_df.head()

(174, 4)


Unnamed: 0,Title,AirDate,Writers,Director
0,"Good News, Bad News","July 5, 1989","Larry David, Jerry Seinfeld",Art Wolff
1,The Stakeout,"May 31, 1990","Larry David, Jerry Seinfeld",Tom Cherones
2,The Robbery,"June 7, 1990",Matt Goldman,Tom Cherones
3,Male Unbonding,"June 14, 1990","Larry David, Jerry Seinfeld",Tom Cherones
4,The Stock Tip,"June 21, 1990","Larry David, Jerry Seinfeld",Tom Cherones


In [40]:
# Attach the scraped SEID and Rating to each episode info row by matching titles.
# The inner join keeps only titles present in both tables.
info_with_ratings = pd.merge(info_df, ratings_df, on='Title', how='inner')

# Rows without a title match are dropped here (lost 16 episodes due to incomplete data)
print(info_with_ratings.shape)
info_with_ratings.head()

(158, 6)


Unnamed: 0,Title,AirDate,Writers,Director,SEID,Rating
0,"Good News, Bad News","July 5, 1989","Larry David, Jerry Seinfeld",Art Wolff,S01E01,7.6
1,The Stakeout,"May 31, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S01E02,7.6
2,The Robbery,"June 7, 1990",Matt Goldman,Tom Cherones,S01E03,7.6
3,Male Unbonding,"June 14, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S01E04,7.3
4,The Stock Tip,"June 21, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S01E05,7.5


In [33]:
# Load the scripts CSV, cast Season and EpisodeNo to 64-bit integers,
# and drop the extra index column written out with the file.
script_df = (
    pd.read_csv(r'C:\Users\nella\BOOTCAMP\HOMEWORK\ETL-mini-project\Seinfeld\scripts.csv')
    .astype({'EpisodeNo': 'int64', 'Season': 'int64'})
    .drop(columns='Unnamed: 0')
)


173

In [41]:
# Attach the episode info and rating columns to every script row, matching on SEID.
# The inner join keeps only SEIDs present in both tables.
full_df = pd.merge(script_df, info_with_ratings, on='SEID', how='inner')

# Count the distinct episodes that survived the join
# (lost 4 more episodes, 20 total, due to incomplete data)
print(full_df['SEID'].nunique(dropna=False))

154


In [65]:
# Find the script episodes (by SEID) that have no match in info_with_ratings.

# Tag every info/ratings row with a marker column. After a join, a missing marker
# means the SEID had no partner in info_with_ratings. assign() returns a new frame,
# so info_with_ratings itself is not modified and re-running earlier cells is safe.
info_marked = info_with_ratings.assign(joincol=1)
scriptSEID = pd.DataFrame({'SEID': script_df['SEID'].unique()})

# left keeps every info/ratings row; right keeps every SEID that appears in the scripts
left = pd.merge(left=info_marked, right=scriptSEID, on='SEID', how='left')
right = pd.merge(left=info_marked, right=scriptSEID, on='SEID', how='right')

# Rows that came only from scriptSEID carry no marker (and no Title/Rating)
no_ratings = right[right['joincol'].isna()]
no_ratings['SEID']

31     S03E16
32     S03E17
33     S03E18
39     S04E01
40     S04E02
43     S04E05
44     S04E06
61     S04E23
62     S04E24
83     S05E22
91     S06E08
105    S06E24
119    S07E14
122    S07E17
125    S07E20
128    S07E23
129    S07E24
163    S09E12
172    S09E23
Name: SEID, dtype: object