In [1]:
# The data is refreshed daily.

# title.akas.tsv.gz - Contains the following information for titles:

# titleId (string) - a tconst, an alphanumeric unique identifier of the title
# ordering (integer) – a number to uniquely identify rows for a given titleId
# title (string) – the localized title
# region (string) - the region for this version of the title
# language (string) - the language of the title
# types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
# attributes (array) - Additional terms to describe this alternative title, not enumerated
# isOriginalTitle (boolean) – 0: not original title; 1: original title

# title.basics.tsv.gz - Contains the following information for titles:
# tconst (string) - alphanumeric unique identifier of the title
# titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
# primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
# originalTitle (string) - original title, in the original language
# isAdult (boolean) - 0: non-adult title; 1: adult title
# startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
# endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
# runtimeMinutes – primary runtime of the title, in minutes
# genres (string array) – includes up to three genres associated with the title

# title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:
# tconst (string) - alphanumeric unique identifier of the title
# directors (array of nconsts) - director(s) of the given title
# writers (array of nconsts) – writer(s) of the given title

# title.episode.tsv.gz – Contains the tv episode information. Fields include:
# tconst (string) - alphanumeric identifier of episode
# parentTconst (string) - alphanumeric identifier of the parent TV Series
# seasonNumber (integer) – season number the episode belongs to
# episodeNumber (integer) – episode number of the tconst in the TV series

# title.principals.tsv.gz – Contains the principal cast/crew for titles
# tconst (string) - alphanumeric unique identifier of the title
# ordering (integer) – a number to uniquely identify rows for a given titleId
# nconst (string) - alphanumeric unique identifier of the name/person
# category (string) - the category of job that person was in
# job (string) - the specific job title if applicable, else '\N'
# characters (string) - the name of the character played if applicable, else '\N'

# title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles
# tconst (string) - alphanumeric unique identifier of the title
# averageRating – weighted average of all the individual user ratings
# numVotes - number of votes the title has received

# name.basics.tsv.gz – Contains the following information for names:
# nconst (string) - alphanumeric unique identifier of the name/person
# primaryName (string) – name by which the person is most often credited
# birthYear – in YYYY format
# deathYear – in YYYY format if applicable, else '\N'
# primaryProfession (array of strings) – the top-3 professions of the person
# knownForTitles (array of tconsts) – titles the person is known for



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Download the dataset index page and parse its HTML so the download links
# can be extracted from it in the next cell.
url = 'https://datasets.imdbws.com/'
# A timeout keeps the cell from hanging forever on a stalled connection.
reqs = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page as the index.
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, 'html.parser')

In [4]:
# Gather the href of every anchor on the index page except the first one.
anchors = soup.find_all('a')
urls = [anchor.get('href') for anchor in anchors[1:]]

In [5]:
# Locate the name.basics download link among the scraped links and load it.
# A dedicated loop variable is used so the index-page `url` is not overwritten.
name_basics_url = None
for dataset_url in urls:
    if not dataset_url:
        # link.get('href') may have produced None for an anchor without href.
        continue
    # '<...>/name.basics.tsv.gz' -> 'name.basics' (first two dot-separated parts)
    dataset_name = '.'.join(dataset_url.split('/')[-1].split('.')[:2])
    if dataset_name == 'name.basics':
        name_basics_url = dataset_url
        break  # only this one dataset is loaded here; stop scanning

if name_basics_url is None:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("No 'name.basics' dataset link found in urls")

# NOTE(review): missing values are left as the literal string '\N' (visible in
# birthYear/deathYear below); consider na_values='\\N' if real NaNs are wanted.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows.
name_basics = pd.read_csv(name_basics_url, compression='gzip', on_bad_lines='skip', sep='\t')
print(name_basics.shape)

(12243512, 6)


In [6]:
# Split the columns having multiple values into multiple columns
# Split the comma-separated primaryProfession string into one column per
# profession (the dataset lists the top-3 professions of each person).
PROFESSION_COLUMNS = ['primaryProfession1', 'primaryProfession2', 'primaryProfession3']
profession_parts = (
    name_basics['primaryProfession']
    .str.split(',', expand=True)
    # Pin the width to exactly three columns: without this the assignment
    # raises when the split produces fewer columns (e.g. on a subset where
    # nobody has three professions) or more than three.
    .reindex(columns=range(len(PROFESSION_COLUMNS)))
)
name_basics[PROFESSION_COLUMNS] = profession_parts

In [53]:
# Store the knownForTitles string as a list of title ids per person.
known_for_titles = name_basics['knownForTitles']
name_basics['famous_titles'] = known_for_titles.str.split(pat=',')

# # Drop the not required columns
# name_basics.drop('primaryProfession', axis=1, inplace=True)
# name_basics.head()

0    tt0050419,tt0053137,tt0045537,tt0072308
Name: knownForTitles, dtype: object


In [None]:
# Spread the comma-separated knownForTitles string into four fixed columns.
# The split is done on the original string column with the vectorised
# .str.split: calling str() on the list-valued 'famous_titles' column yields
# the list's repr, so its pieces would carry brackets and quotes
# (e.g. "['tt0050419'"), and a per-row pd.Series apply is very slow on a
# frame of this size.
FAMOUS_TITLE_COLUMNS = ['famous_title1', 'famous_title2', 'famous_title3', 'famous_title4']
famous_title_parts = (
    name_basics['knownForTitles']
    .str.split(',', expand=True)
    # Always exactly four columns: pads with NaN when fewer titles exist and
    # drops any titles beyond the fourth so the assignment cannot fail.
    .reindex(columns=range(len(FAMOUS_TITLE_COLUMNS)))
)
name_basics[FAMOUS_TITLE_COLUMNS] = famous_title_parts
name_basics.head()

# To check the largest number of titles listed for any person:
# name_basics['knownForTitles'].str.split(',').str.len().max()

In [35]:
# Preview the first five rows of the frame.
name_basics.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,primaryProfession1,primaryProfession2,primaryProfession3
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0045537,tt0072308",soundtrack,actor,miscellaneous
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355",actress,soundtrack,
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0056404,tt0054452,tt0049189,tt0057345",actress,soundtrack,music_department
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0080455,tt0078723,tt0072562",actor,soundtrack,writer
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0060827,tt0083922,tt0050976",writer,director,actor


In [37]:
# Preview the last five rows of the frame.
name_basics.tail(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,primaryProfession1,primaryProfession2,primaryProfession3
12243507,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt2455546,tt14069590,tt11657662",animation_department,art_department,
12243508,nm9993716,Essias Loberg,\N,\N,,\N,,,
12243509,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744,cinematographer,,
12243510,nm9993718,Aayush Nair,\N,\N,cinematographer,\N,cinematographer,,
12243511,nm9993719,Andre Hill,\N,\N,,\N,,,


In [None]:
# Rename the identifier, name and year columns to snake_case.
# Both the old and the new names must be string literals; bare identifiers
# (deathYear, death_year) raise NameError.
# inplace=True is kept so the ~12M-row frame is not copied.
name_basics.rename(
    columns={
        'nconst': 'name_const',
        'primaryName': 'primary_name',
        'birthYear': 'birth_year',
        'deathYear': 'death_year',
    },
    inplace=True,
)

In [8]:
# Preview the first twenty rows of the frame.
name_basics.head(n=20)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,primaryProfession1,primaryProfession2,primaryProfession3
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0045537,tt0072308",soundtrack,actor,miscellaneous
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355",actress,soundtrack,
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0056404,tt0054452,tt0049189,tt0057345",actress,soundtrack,music_department
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0080455,tt0078723,tt0072562",actor,soundtrack,writer
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0060827,tt0083922,tt0050976",writer,director,actor
5,nm0000006,Ingrid Bergman,1915,1982,"actress,soundtrack,producer","tt0036855,tt0038787,tt0038109,tt0034583",actress,soundtrack,producer
6,nm0000007,Humphrey Bogart,1899,1957,"actor,soundtrack,producer","tt0042593,tt0037382,tt0034583,tt0043265",actor,soundtrack,producer
7,nm0000008,Marlon Brando,1924,2004,"actor,soundtrack,director","tt0070849,tt0068646,tt0078788,tt0047296",actor,soundtrack,director
8,nm0000009,Richard Burton,1925,1984,"actor,soundtrack,producer","tt0061184,tt0087803,tt0057877,tt0059749",actor,soundtrack,producer
9,nm0000010,James Cagney,1899,1986,"actor,soundtrack,director","tt0042041,tt0029870,tt0031867,tt0035575",actor,soundtrack,director
