# TV Series from IMDb

Download the `title.basics.tsv.gz` file from https://datasets.imdbws.com/ and extract `data.tsv` from the archive.

In [107]:
import pandas as pd
# Load the extracted IMDb title dump: a tab-separated file whose first
# row holds the column names.
data = pd.read_csv('data.tsv', delimiter='\t', header=0)
data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [108]:
# Which title types does the dataset contain, and how many of each?
title_types = data['titleType']
title_types.value_counts()

tvEpisode       4871527
short            741249
movie            551355
video            265838
tvSeries         184519
tvMovie          121206
tvMiniSeries      31099
tvSpecial         29202
videoGame         25559
tvShort           12557
Name: titleType, dtype: int64

In [109]:
# Keep only the TV-related title types (episodes are deliberately left out).
# One filtered frame is built per type, in the order listed here.
tv_types = ['tvSeries', 'tvMovie', 'tvMiniSeries', 'tvSpecial', 'tvShort']
series, tvmovie, miniseries, special, short = (
    data[data['titleType'].eq(kind)] for kind in tv_types
)

# Stack the per-type frames into a single table; rows stay grouped by type
# in the order above.
frames = [series, tvmovie, miniseries, special, short]
tv = pd.concat(frames)
tv.shape

(378583, 9)

In [110]:
# We previously saved the movie ids without the 'tt' prefix.
# Drop the 'tt' prefix here too, so that both tables use the same id format.
#
# The prefix is removed with an anchored regex instead of str.lstrip('tt'):
# lstrip treats its argument as a *set of characters* and strips every
# leading 't', which only gives the right answer by accident because the ids
# are 'tt' followed by digits. The vectorised .str accessor also avoids a
# Python-level loop over ~380k rows and no longer shadows the built-in `id`.
tv['tconst'] = tv['tconst'].str.replace('^tt', '', regex=True)
tv.sample(3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
5535962,7159236,tvSeries,MNShorts,MNShorts,0,2017,2017,\N,Talk-Show
3728487,3115410,tvSpecial,Carly in Concert: Coming Around Again,Carly in Concert: Coming Around Again,0,1987,\N,\N,Music
2927022,1856400,tvSeries,Mit Arved Fuchs durch den Nordatlantik,Mit Arved Fuchs durch den Nordatlantik,0,2011,\N,43,Documentary


In [111]:
# The dump marks missing fields with the literal two-character token \N;
# turn every such cell into a real NaN.
import numpy as np
tv = tv.replace(to_replace='\\N', value=np.nan)
tv.sample(6)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
68867,70272,tvMovie,The Men Who Made the Movies: King Vidor,The Men Who Made the Movies: King Vidor,0,1973.0,,55.0,"Biography,Documentary"
1750369,11365256,tvSpecial,Candy and Smiley,Candy and Smiley,0,,,,Documentary
1602754,11106838,tvSeries,SweetyX,SweetyX,1,2017.0,,,Adult
4830660,5603576,tvSeries,Cherepashki Megablog,Cherepashki Megablog,0,2016.0,,,Game-Show
3081303,2016490,tvSeries,Hi-5 UK,Hi-5 UK,0,2008.0,,,"Adventure,Family,Music"
3120158,2056502,tvSpecial,"9/11 Memorial from Ground Zero, Tenth Anniverary","9/11 Memorial from Ground Zero, Tenth Anniverary",0,2011.0,,300.0,Documentary


In [120]:
# Write the cleaned TV titles to a tab-separated file, leaving out the
# DataFrame index so the file holds only the title columns.
tv.to_csv(path_or_buf='tvshows.csv', index=False, sep='\t')

***Insert into the DB using pgAdmin***