In [29]:
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

In [30]:
# Initializing the series for the loop
theoffice_episodes = []

# For every season in the The Ofice
for sn in range(1,9):
    # Request from the server the content of the web page by and store the server’s response
    response = get('https://www.imdb.com/title/tt0386676/episodes?season=' + str(sn))

    # Parse the content of the request
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season's page
    episode_containers = page_html.find_all('div', class_ = 'info')

    # For each episode in each season
    for episodes in episode_containers:
            # Get the info of each episode on the page
            season = sn
            episode_number = episodes.meta['content']
            title = episodes.a['title']
            airdate = episodes.find('div', class_='airdate').text.strip()
            rating = episodes.find('span', class_='ipl-rating-star__rating').text
            total_votes = episodes.find('span', class_='ipl-rating-star__total-votes').text
            desc = episodes.find('div', class_='item_description').text.strip()
            # Compile the episode info
            episode_data = [season, episode_number, title, airdate, rating, total_votes, desc]

            # Append the episode info to the complete dataset
            theoffice_episodes.append(episode_data)

In [31]:
#Create the data frame
theoffice_episodes = pd.DataFrame(theoffice_episodes, columns = ['season', 'episode_number', 'title', 'airdate', 'rating', 'total_votes', 'desc'])
theoffice_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,24 Mar. 2005,7.4,"(7,022)",The premiere episode introduces the boss and s...
1,1,2,Diversity Day,16 Mar. 2005,8.3,"(6,918)",Michael's off color remark puts a sensitivity ...
2,1,3,Health Care,5 Apr. 2005,7.7,"(5,771)",Michael leaves Dwight in charge of picking the...
3,1,4,The Alliance,12 Apr. 2005,8.0,"(5,596)","Just for a laugh, Jim agrees to an alliance wi..."
4,1,5,Basketball,19 Apr. 2005,8.4,"(6,203)",Michael and his staff challenge the warehouse ...


In [32]:
# Convert total votes to numeric integer values
# First reate a function that uses removes the commas and parentheses strings from total_votes
def remove_str(votes):
    for r in ((',',''), ('(',''),(')','')):
        votes = votes.replace(*r)
    return votes

In [33]:
# Apply the function and change from string to integer
theoffice_episodes['total_votes'] = theoffice_episodes.total_votes.apply(remove_str).astype(int)
theoffice_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,24 Mar. 2005,7.4,7022,The premiere episode introduces the boss and s...
1,1,2,Diversity Day,16 Mar. 2005,8.3,6918,Michael's off color remark puts a sensitivity ...
2,1,3,Health Care,5 Apr. 2005,7.7,5771,Michael leaves Dwight in charge of picking the...
3,1,4,The Alliance,12 Apr. 2005,8.0,5596,"Just for a laugh, Jim agrees to an alliance wi..."
4,1,5,Basketball,19 Apr. 2005,8.4,6203,Michael and his staff challenge the warehouse ...


In [34]:
# Change rating from string to numeric
theoffice_episodes['rating'] = theoffice_episodes.rating.astype(float)

In [35]:
# Change airdate from string to datetime
theoffice_episodes['airdate'] = pd.to_datetime(theoffice_episodes.airdate)
theoffice_episodes

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,2005-03-24,7.4,7022,The premiere episode introduces the boss and s...
1,1,2,Diversity Day,2005-03-16,8.3,6918,Michael's off color remark puts a sensitivity ...
2,1,3,Health Care,2005-04-05,7.7,5771,Michael leaves Dwight in charge of picking the...
3,1,4,The Alliance,2005-04-12,8.0,5596,"Just for a laugh, Jim agrees to an alliance wi..."
4,1,5,Basketball,2005-04-19,8.4,6203,Michael and his staff challenge the warehouse ...
...,...,...,...,...,...,...,...
160,8,20,Welcome Party,2012-04-12,7.0,2928,Andy tries to break up with his girlfriend aft...
161,8,21,Angry Andy,2012-04-19,6.8,3181,"Andy and Erin return to Dunder Mifflin, only t..."
162,8,22,Fundraiser,2012-04-26,7.0,2819,Andy confronts Robert for the first time after...
163,8,23,Turf War,2012-05-03,7.6,2737,Dwight and Jim fight with the Syracuse over a ...


In [36]:
theoffice_episodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   season          165 non-null    int64         
 1   episode_number  165 non-null    object        
 2   title           165 non-null    object        
 3   airdate         165 non-null    datetime64[ns]
 4   rating          165 non-null    float64       
 5   total_votes     165 non-null    int32         
 6   desc            165 non-null    object        
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(3)
memory usage: 8.5+ KB


In [37]:
theoffice_episodes.to_csv('TheOffice_Episodes_IMDb_Ratings.csv',index=False)