# Import Libraries

In [1]:
import numpy    as np # linear algebra
import pandas   as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests as rq
from selenium import webdriver as wb
from bs4  import BeautifulSoup as BS
from tqdm import tqdm
import multiprocessing

# Loading Dataset

In [2]:
# Read the raw listing and discard the 'Unnamed: 0' column stored in the CSV.
shows = pd.read_csv('Shows.csv').drop(columns=['Unnamed: 0'])
shows.head()

Unnamed: 0,id,name,year,certificate,runtime,genre,ratings
0,tt3581920,The Last of Us,(2023– ),TV-MA,50 min,"\nAction, Adventure, Drama",9.3
1,tt14153790,Velma,(2023– ),TV-MA,,"\nAnimation, Adventure, Comedy",1.3
2,tt15591076,That '90s Show,(2023– ),TV-14,30 min,"\nComedy, Drama, Romance",6.5
3,tt13406094,The White Lotus,(2021–2023),TV-MA,60 min,"\nComedy, Drama",7.9
4,tt4236770,Yellowstone,(2018– ),TV-MA,60 min,"\nDrama, Western",8.7


In [3]:
# Inspect column dtypes and non-null counts before any cleaning is applied.
shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235450 entries, 0 to 235449
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           235450 non-null  object 
 1   name         235450 non-null  object 
 2   year         235450 non-null  object 
 3   certificate  235450 non-null  object 
 4   runtime      209551 non-null  object 
 5   genre        235450 non-null  object 
 6   ratings      235450 non-null  float64
dtypes: float64(1), object(6)
memory usage: 12.6+ MB


# Preprocessing

In [4]:
def runtime(data):
    """Convert a runtime label such as ``'50 min'`` to an integer minute count.

    Parameters
    ----------
    data : str or any
        A string whose first whitespace-separated token is the number of
        minutes. Any non-string value (e.g. the NaN pandas uses for a
        missing runtime) is passed through untouched.

    Returns
    -------
    int, float or the original object
        The parsed minutes for a string, NaN for a blank string, otherwise
        ``data`` unchanged.

    Raises
    ------
    ValueError
        If the first token is not an integer after removing commas.
    """
    if not isinstance(data, str):
        return data

    tokens = data.split()
    if not tokens:
        # A blank string carries no runtime; report it as missing instead of
        # failing with IndexError on tokens[0].
        return float('nan')

    # A thousands separator (e.g. '1,440 min') would make int() raise.
    return int(tokens[0].replace(',', ''))

In [5]:
def year(data):
    """Return the text inside the last parenthesised group of a year label.

    Examples: ``'(2021–2023)'`` -> ``'2021–2023'`` and
    ``'(II) (2019)'`` -> ``'2019'``.

    Parameters
    ----------
    data : str or any
        The raw year label. Non-string values are passed through untouched.

    Returns
    -------
    str or the original object
        The contents of the last ``(...)`` group (inner whitespace is kept
        as-is). A string without such a group is returned unchanged, so
        applying the function to an already-cleaned value is harmless.
    """
    if not isinstance(data, str):
        return data

    # Locate the last '(' ... ')' pair rather than testing for the literal
    # 'II' with a fixed offset: the fixed offset is only correct for a
    # '(II) ' prefix and mangles '(I) ' or '(III) '.
    opening = data.rfind('(')
    closing = data.rfind(')')
    if opening == -1 or closing < opening:
        return data

    return data[opening + 1:closing]

In [6]:
# Build the cleaned columns in one step instead of mutating `shows` in place.
# `.str.strip()` leaves a missing genre as NaN, whereas calling `.strip()`
# on it through a lambda would raise AttributeError.
shows = shows.assign(
    year    = shows['year'   ].apply(year),
    runtime = shows['runtime'].apply(runtime),
    genre   = shows['genre'  ].str.strip(),
)

In [7]:
# Preview the first rows to check the cleaned year, runtime and genre columns.
shows.head()

Unnamed: 0,id,name,year,certificate,runtime,genre,ratings
0,tt3581920,The Last of Us,2023–,TV-MA,50.0,"Action, Adventure, Drama",9.3
1,tt14153790,Velma,2023–,TV-MA,,"Animation, Adventure, Comedy",1.3
2,tt15591076,That '90s Show,2023–,TV-14,30.0,"Comedy, Drama, Romance",6.5
3,tt13406094,The White Lotus,2021–2023,TV-MA,60.0,"Comedy, Drama",7.9
4,tt4236770,Yellowstone,2018–,TV-MA,60.0,"Drama, Western",8.7


# IMDb Web-Scraping Individual

## Create function

In [8]:
def bs_content_driver(link,driver):
    """Navigate ``driver`` to ``link`` and parse its page source.

    Parameters
    ----------
    link : str
        URL passed to ``driver.get``.
    driver : object
        Browser driver exposing ``get`` and ``page_source``.

    Returns
    -------
    BeautifulSoup
        ``driver.page_source`` parsed with the ``html.parser`` backend.
    """
    driver.get(link)
    html = driver.page_source
    soup = BS(html, 'html.parser')
    return soup

def bs_content(link, timeout=30):
    """Download ``link`` with requests and parse the body with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL to fetch.
    timeout : float or tuple, optional
        Seconds forwarded to ``requests.get``. Without a timeout, requests
        waits indefinitely for an unresponsive server, which would stall a
        long-running scrape.

    Returns
    -------
    BeautifulSoup or False
        The parsed document when the response status is OK; ``False`` when
        the request fails (network error, timeout) or the status is not OK.
    """
    try:
        res = rq.get(link, timeout=timeout)
    except rq.RequestException:
        # Report transport failures the same way as a bad status so callers
        # that test the return value for falsiness can skip this page.
        return False

    if not res.ok:
        return False

    return BS(res.content, 'html.parser')

In [9]:
def _person_id(anchor):
    """Return the last path segment of an anchor's href ('/name/nm1/' -> 'nm1')."""
    href = anchor.get('href') or ''
    # Discard any query string and the surrounding slashes before splitting,
    # so a '?...' suffix or a missing trailing slash cannot corrupt the id.
    return href.split('?')[0].strip('/').split('/')[-1]


def _credit_links(tables, position):
    """Return the <a> tags of ``tables[position]``, or [] if that table is absent."""
    if position < len(tables):
        return tables[position].find_all('a')
    return []


def _cast_columns(cast_table):
    """Return (ids, names, characters) lists parsed from the cast table, or empty lists."""
    if cast_table is None:
        return [], [], []

    photo_cells = cast_table.find_all('td', class_ = 'primary_photo')
    char_cells  = cast_table.find_all('td', class_ = 'character')

    # Photo cells without a link are skipped.
    photo_links = [ cell.find('a') for cell in photo_cells ]
    photo_links = [ a for a in photo_links if a is not None ]

    ids   = [ _person_id(a) for a in photo_links ]
    names = []
    for a in photo_links:
        img = a.find('img')
        # The actor name is read from the photo's alt text; fall back to the
        # link text if there is no image or no alt attribute.
        alt = img.get('alt') if img is not None else None
        names.append(alt if alt else a.text.strip())

    characters = []
    for cell in char_cells:
        link = cell.find('a')
        if link is not None:
            characters.append(link.text)
        else:
            # Character cells without a link: use the cell's own text with
            # whitespace collapsed.
            characters.append(' '.join(cell.get_text().split()))

    return ids, names, characters


def moviescrap(start,end,file_name):
    """Scrape full-credits data for a slice of ``shows`` and save it as CSV.

    For every id in ``shows['id']`` at positions ``start`` (inclusive) to
    ``end`` (exclusive), the page
    ``https://www.imdb.com/title/<id>/fullcredits/`` is fetched with
    ``bs_content``. Director, writer and cast ids/names and character names
    are collected, each list joined into one comma-separated string.
    Pages that ``bs_content`` could not fetch are skipped.

    Parameters
    ----------
    start, end : int
        Half-open positional range into ``shows``. Adjacent chunks such as
        (0, 4709) and (4709, 9418) therefore do not overlap.
    file_name : str
        Output path; ``.csv`` is appended unless it is already present.

    Returns
    -------
    None
        The collected rows are written to disk once the loop has finished.
    """
    columns = ['show_id','director_ids','directors', 'writer_ids', 'writers', 'cast_ids', 'casts','characters']
    rows    = []

    # Positional slicing: `.loc[start:end]` includes BOTH endpoints, so the
    # boundary row of two abutting chunks would be scraped twice.
    for id_ in tqdm(shows['id'].iloc[start:end]):

        link = 'https://www.imdb.com/title/' + id_ + '/fullcredits/'

        content = bs_content(link)
        if not content: continue

        cast_list     = content.find('table', class_ = 'cast_list')
        credit_tables = content.find_all('table', class_ ='simpleTable simpleCreditsTable')[:2]

        # NOTE(review): the first two credit tables are taken to be directors
        # and writers, in that order - confirm for pages missing one of them.
        directors = _credit_links(credit_tables, 0)
        writers   = _credit_links(credit_tables, 1)

        dir_id      = [ _person_id(a)  for a in directors ]
        dir_name    = [ a.text.strip() for a in directors ]
        writer_id   = [ _person_id(a)  for a in writers   ]
        writer_name = [ a.text.strip() for a in writers   ]

        cast_id, cast_names, char_names = _cast_columns(cast_list)

        rows.append({
            'show_id'      : id_,
            'director_ids' : ','.join(dir_id),
            'directors'    : ','.join(dir_name),
            'writer_ids'   : ','.join(writer_id),
            'writers'      : ','.join(writer_name),
            'cast_ids'     : ','.join(cast_id),
            'casts'        : ','.join(cast_names),
            'characters'   : ','.join(char_names),
        })

    # Callers may already pass a name ending in '.csv'; avoid 'x.csv.csv'.
    out_path = file_name if file_name.endswith('.csv') else f"{file_name}.csv"
    pd.DataFrame(rows, columns = columns).to_csv(out_path, index = False)

In [10]:
#moviescrap(0,235450,"test.csv")

## Create Pipeline

In [11]:
# One worker per consecutive 4709-row slice; worker k writes to 'show_<k>.csv'.
N_WORKERS  = 10
CHUNK_ROWS = 4709

workers = [
    multiprocessing.Process(
        target = moviescrap,
        args   = (k * CHUNK_ROWS, (k + 1) * CHUNK_ROWS, f'show_{k + 1}.csv'),
    )
    for k in range(N_WORKERS)
]

# Keep the individual names bound for cells that refer to them directly.
p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = workers

In [12]:
# Launch every worker first, then block until each one has finished.
pipeline = (p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)

for proc in pipeline:
    proc.start()

for proc in pipeline:
    proc.join()