# Import Library

In [1]:
import numpy    as np # linear algebra
import pandas   as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests as rq

from bs4  import BeautifulSoup as BS
from tqdm import tqdm
import re

# Function

In [2]:
def force_bs_content(link):
    res  = rq.get(link)
    
    while not res.ok : res = rq.get(link)
    
    return BS(res.content,'html.parser')

# Webscraping Top 250 IMDb Shows

In [12]:
link = '?title_type=tvSeries'

columns = ['id','name','year','certificate','runtime(minutes)','genre','ratings']
shows = { k: [] for k in columns }

for _ in tqdm(range(5)):
    
    link    = link.get('href') if type(link) != str else link
    link    = "https://www.imdb.com/search/keyword/" + link
    
    content = force_bs_content(link)
    
    for show in content.find_all('div',class_ ='lister-item mode-detail'):
        
        tag         = 'lister-item-year text-muted unbold'
        
        id_         = show.find('div').get('data-tconst')
        details     = show.find('div', class_ = 'lister-item-content')
        
        
        name_year   = details.find('h3')
        name        = name_year.find('a')
        year        = name_year.find('span', class_ = tag)
        
        certificate = details.find('span',class_ = 'certificate')
        runtime     = details.find('span',class_ = 'runtime'    )
        genre       = details.find('span',class_ = 'genre'      )
        ratings     = details.find('strong')
        
        name        = float('nan') if name        == None else name.text
        year        = float('nan') if certificate == None else year.text
        certificate = float('nan') if certificate == None else certificate.text
        runtime     = float('nan') if runtime     == None else runtime.text
        genre       = float('nan') if genre       == None else genre.text
        ratings     = float('nan') if ratings     == None else float(ratings.text)
        
        
        shows['id'              ].append(id_)
        shows['name'            ].append(name)
        shows['year'            ].append(year)
        shows['certificate'     ].append(certificate)
        shows['runtime(minutes)'].append(runtime)
        shows['genre'           ].append(genre)
        shows['ratings'         ].append(ratings)
        
    link = content.find('div' , class_= 'desc')
    
    if link is None: break
    
    link = link.find('a')
        


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:16<00:00,  3.25s/it]


In [13]:
shows = pd.DataFrame(shows)

In [14]:
shows.head()

Unnamed: 0,id,name,year,certificate,runtime(minutes),genre,ratings
0,tt3581920,The Last of Us,(2023– ),UA 16+,50 min,"\nAction, Adventure, Drama",9.2
1,tt14269590,Poker Face,,,58 min,"\nCrime, Drama, Mystery",8.2
2,tt13406094,The White Lotus,(2021–2023),18,60 min,"\nComedy, Drama",7.9
3,tt13802576,Lockwood & Co,(2023– ),UA 13+,44 min,"\nAction, Adventure, Drama",7.5
4,tt4236770,Yellowstone,(2018–2023),18,60 min,"\nDrama, Western",8.7


In [15]:
shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                250 non-null    object 
 1   name              250 non-null    object 
 2   year              204 non-null    object 
 3   certificate       204 non-null    object 
 4   runtime(minutes)  230 non-null    object 
 5   genre             250 non-null    object 
 6   ratings           250 non-null    float64
dtypes: float64(1), object(6)
memory usage: 13.8+ KB


# Data Preprocesing

In [16]:
def runtime(data):
    if type(data) == str:
        return int(data.split()[0])
    else: return data

In [17]:
def year(data):
    if type(data) == str:
        if 'II' in data :return data[6:-1]
        else: return data[1:-1]
    else:
        return data

In [18]:
shows['year'            ] = shows['year'            ].apply( year                )
shows['runtime(minutes)'] = shows['runtime(minutes)'].apply( runtime             )
shows['genre'           ] = shows['genre'           ].apply( lambda x: x.strip() )

In [19]:
shows.head()

Unnamed: 0,id,name,year,certificate,runtime(minutes),genre,ratings
0,tt3581920,The Last of Us,2023–,UA 16+,50.0,"Action, Adventure, Drama",9.2
1,tt14269590,Poker Face,,,58.0,"Crime, Drama, Mystery",8.2
2,tt13406094,The White Lotus,2021–2023,18,60.0,"Comedy, Drama",7.9
3,tt13802576,Lockwood & Co,2023–,UA 13+,44.0,"Action, Adventure, Drama",7.5
4,tt4236770,Yellowstone,2018–2023,18,60.0,"Drama, Western",8.7


In [None]:
shows.to_csv('Top_250_Shows.csv', index = False)