In [1]:
import numpy as np
import pandas as pd
import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [37]:
df = pd.read_excel('NetflixShows.xlsx')
df = df.rename(columns={x:x.replace(' ', '_') for x in df.columns})
df = df.drop_duplicates()
df['dataset'] = 'small'
df.title = df.title.astype(str)
print(df.shape)
df

(500, 9)


Unnamed: 0,title,rating,ratingLevel,ratingDescription,release_year,user_rating_score,user_rating_size,link,dataset
0,White Chicks,PG-13,"crude and sexual humor, language and some drug...",80,2004,82.0,80,https://www.imdb.com/title/tt0381707,small
1,Lucky Number Slevin,R,"strong violence, sexual content and adult lang...",100,2006,,82,https://www.imdb.com/title/tt0425210,small
2,Grey's Anatomy,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2016,98.0,80,https://www.imdb.com/title/tt0413573,small
3,Prison Break,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2008,98.0,80,https://www.imdb.com/video/vi3183588,small
4,How I Met Your Mother,TV-PG,Parental guidance suggested. May not be suitab...,70,2014,94.0,80,https://www.imdb.com/title/tt0460649,small
...,...,...,...,...,...,...,...,...,...
495,Russell Madness,PG,some rude humor and sports action,60,2015,,82,https://www.imdb.com/title/tt4257950,small
496,Wiener Dog Internationals,G,General Audiences. Suitable for all ages.,35,2015,,82,https://www.imdb.com/title/tt3462880,small
497,Pup Star,G,General Audiences. Suitable for all ages.,35,2016,,82,https://www.imdb.com/title/tt5449296,small
498,Precious Puppies,TV-G,Suitable for all ages.,35,2003,,82,https://www.imdb.com/title/tt6500946,small


In [38]:
import requests
from bs4 import BeautifulSoup
import time
import re

# Функция для извлечения информации с IMDb
def get_imdb_data(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        return None  # Если ошибка, пропускаем

    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        title = soup.find('h1').text.strip()
    except AttributeError:
        title = None

    try:
        imdb_id = re.search(r'tt\d+', url).group()
    except AttributeError:
        imdb_id = None

    try:
        directors = [a.text.strip() for a in soup.select('li[data-testid="title-pc-principal-credit"] a[href*="tt_ov_dr"]')]
    except AttributeError:
        directors = []

    try:
        writers = [a.text.strip() for a in soup.select('li[data-testid="title-pc-principal-credit"] a[href*="tt_ov_wr"]') if 'Creator' not in a.find_parent('li').text and 'Writers' not in a.text]
        writers = ', '.join(set(writers)).lstrip(', ') if writers else None
    except AttributeError:
        writers = None

    # Если нет режиссеров, используем сценаристов в качестве режиссеров
    director_list = directors if directors else (writers.split(', ') if writers else [])
    director = ', '.join(set(director_list)) if director_list else None

    try:
        duration = soup.find('li', {'data-testid': 'title-techspec_runtime'}).text.replace('Runtime', '').strip()
    except AttributeError:
        duration = None

    try:
        country = soup.find('a', {'href': lambda x: x and 'country_of_origin' in x}).text.strip()
    except AttributeError:
        country = None

    try:
        genres = [a.text.strip() for a in soup.select('div[data-testid="interests"] a[href*="interest"] span')]
        genres = ', '.join(set(genres)).strip(', ') if genres else None
    except AttributeError:
        genres = None

    try:
        release_date = soup.find('a', {'href': lambda x: x and 'releaseinfo' in x}).text.strip()
    except AttributeError:
        release_date = None

    try:
        rating = soup.find('span', {'class': 'sc-d541859f-1 imUuxf'}).text.strip()
    except AttributeError:
        rating = None

    try:
        rating_count = soup.find('a', {'href': lambda x: x and 'ratings' in x}).find('div', class_='sc-d541859f-3 dwhNqC').text.strip()
    except AttributeError:
        rating_count = None

    try:
        popularity = soup.find('div', {'data-testid': 'hero-rating-bar__popularity__score'}).text.strip().replace(',', '')
        popularity = int(popularity)
    except (AttributeError, ValueError):
        popularity = None

    try:
        production_companies = [a.text.strip() for a in soup.select('li[data-testid="title-details-companies"] a[href*="tt_dt_cmpy"]')]
        production_companies = ', '.join(set(production_companies)).replace('Production companies', '').strip(', ') if production_companies else None
    except AttributeError:
        production_companies = None

    try:
        stars = [a.text.strip() for a in soup.select('a[href*="tt_ov_st"]')]
        stars = ', '.join(set(stars)).replace('Stars', '').strip(', ') if stars else None
    except AttributeError:
        stars = None

    try:
        box_office = soup.find('li', {'data-testid': 'title-boxoffice-budget'}).find('span', class_='ipc-metadata-list-item__list-content-item').text.replace(' (estimated)', '').strip()
    except AttributeError:
        box_office = None

    return {
        'Title': title,
        'IMDb ID': imdb_id,
        'Director': director,
        'Writers': writers,
        'Duration': duration,
        'Country': country,
        'Genres': genres,
        'Release Date': release_date,
        'Rating': rating,
        'Rating Count': rating_count,
        'Popularity': popularity,
        'Production Companies': production_companies,
        'Stars': stars,
        'Box Office': box_office,
        'URL': url
    }

# Создаём список для хранения данных
movies_data = []

# Обрабатываем каждую ссылку из таблицы
for index, row in df.iterrows():
    url = row['link']
    if pd.notna(url):  # Проверка на пустые значения
        movie_info = get_imdb_data(url)
        if movie_info:
            movies_data.append(movie_info)
    time.sleep(1)  # Делаем задержку, чтобы не перегружать сервер IMDb

# Создаём итоговый DataFrame
final_df = pd.DataFrame(movies_data)

final_df

Unnamed: 0,Title,IMDb ID,Director,Writers,Duration,Country,Genres,Release Date,Rating,Rating Count,Popularity,Production Companies,Stars,Box Office,URL
0,White Chicks,tt0381707,Keenen Ivory Wayans,"Marlon Wayans, Shawn Wayans, Keenen Ivory Wayans",1 hour 49 minutes,United States,"Comedy, Police Procedural, Crime, Buddy Comedy...",2004,5.8,174K,689.0,"Wayans Bros. Entertainment, Gone North Product...","Busy Philipps, , Marlon Wayans, Shawn Wayans","$37,000,000",https://www.imdb.com/title/tt0381707
1,Lucky Number Slevin,tt0425210,Paul McGuigan,Jason Smilovic,1 hour 50 minutes,United Kingdom,"Drama, Crime, Thriller",2006,7.7,331K,3674.0,"The Weinstein Company, Ascendant Pictures, , F...","Ben Kingsley, Josh Hartnett, Morgan Freeman","$27,000,000",https://www.imdb.com/title/tt0425210
2,Grey's Anatomy,tt0413573,"Michelle Lirtzman, Shonda Rhimes","Michelle Lirtzman, Shonda Rhimes",41 minutes,United States,"Drama, Romance, Feel-Good Romance, Medical Drama",2005–,7.6,355K,45.0,"The Mark Gordon Company, , Shondaland, ABC Stu...","Ellen Pompeo, , Chandra Wilson, James Pickens Jr.",,https://www.imdb.com/title/tt0413573
3,How I Met Your Mother,tt0460649,"Craig Thomas, Carter Bays","Craig Thomas, Carter Bays",23 minutes,United States,"Romantic Comedy, Comedy, Drama, Sitcom, Romance",2005–2014,8.3,752K,125.0,"20th Century Fox Television, Bays Thomas Produ...","Jason Segel, , Josh Radnor, Cobie Smulders",,https://www.imdb.com/title/tt0460649
4,Supernatural,tt0460681,Eric Kripke,Eric Kripke,44 minutes,United States,"Horror, Drama, Monster Horror, Supernatural Fa...",2005–2020,8.4,503K,42.0,"Warner Bros. Television, , Wonderland Sound an...","Jim Beaver, , Jensen Ackles, Jared Padalecki",,https://www.imdb.com/title/tt0460681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Russell Madness,tt4257950,Robert Vince,"Anna McRoberts, Aaron Brooks, Robert Vince",1 hour 32 minutes,United States,"Family, Sport",2015,4.3,742,,"Production company, Air Bud Entertainment","David Milchard, , Sean Giambrone, Steve Richmond",,https://www.imdb.com/title/tt4257950
491,Wiener Dog Internationals,tt3462880,Kevan Peterson,Kevan Peterson,1 hour 32 minutes,United States,Family,2016,4.0,241,,"Fromage Pictures, , Decipher Entertainment, Re...","Morgan Fairchild, Bryan Batt, Jason London",,https://www.imdb.com/title/tt3462880
492,Pup Star,tt5449296,Robert Vince,"Anna McRoberts, Michael Teigen, Kirsten Hansen",1 hour 32 minutes,Canada,"Comedy, Family",2016,4.4,558,,"Production company, Air Bud Entertainment","Makenzie Moss, , Carla Jimenez, Susie Wall",,https://www.imdb.com/title/tt5449296
493,Precious Puppies,tt6500946,Greg Grainger,,53 minutes,Australia,Documentary,2003,6.1,81,,"Grainger Television Australia, Production company",Lizzy Lovette,,https://www.imdb.com/title/tt6500946


In [39]:
final_df.to_csv('IMDb_sraped.csv')