## Importing necessary libraries:

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import sleep
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## Installing IMDbPY

In [2]:
!pip install imdbpy
from imdb import IMDb

Collecting imdbpy
[?25l  Downloading https://files.pythonhosted.org/packages/93/49/1d6c29154deed34de46340bf968839a63639f1b23b6f7402b861a222376c/IMDbPY-2021.4.18-py3-none-any.whl (298kB)
[K     |████████████████████████████████| 307kB 3.0MB/s 
Installing collected packages: imdbpy
Successfully installed imdbpy-2021.4.18


## Reading Input:
#### We only do a test run with 100 randomly sampled movie IDs here. You can also pass the whole dataset as input.

In [12]:
# Number of titles to look up in this test run; set to None to use every
# USA title. The seed makes the sample identical across kernel restarts.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

# Load the movie table and index it by IMDb title ID (e.g. 'tt1877707').
movies_all = pd.read_csv(
    '/content/drive/MyDrive/imdb_datasets/imdb_movies.csv', low_memory=False
).set_index('movie_id')

# Keep titles whose country field mentions the USA (co-productions such as
# 'USA,Mexico' are included). na=False treats a missing country as "no match";
# without it a single NaN makes the boolean mask unusable for indexing.
usa_mask = movies_all['country'].str.contains('USA', regex=False, na=False)
movies_usa = movies_all[usa_mask]

if SAMPLE_SIZE is None:
    movie_df = movies_usa
else:
    # Cap at the number of available rows so a small input does not raise.
    movie_df = movies_usa.sample(
        n=min(SAMPLE_SIZE, len(movies_usa)), random_state=SAMPLE_SEED
    )

movies_index = movie_df.index
movie_df.head()

Unnamed: 0_level_0,name,org_name,date,title_year,point,point_volume,metascore,user_reviews,critic_reviews,director,writer,story_line,cast,genres,country,language,budget,world_gross,usa_gross,runtime,production_companies,dollar_budget,w_gross_money,u_gross_money,inflation_coeff,casts_id,BlogPage,CompPage,HomePage,release_month,release_day,keywords
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
tt1877707,The Black Hole,Mind's Eye,2016-12-02,2016.0,2.8,383.0,,22.0,0.0,Mark Steven Grove,Mark Steven Grove,Mattie Carver's world is turning curiouser and...,"Malcolm McDowell,Izzie Steele,Natalie Distler,...","Sci-Fi,Thriller",USA,English,$800000,,,111.0,"Black Wing Digital,Red Pine Studios,Asgard Ent...",800000.0,,,1.084008,castIDs_not_provided,0,0,0,12.0,2.0,school
tt0811106,The Ten,The Ten,2008-04-25,2007.0,5.0,16211.0,50.0,114.0,66.0,David Wain,"Ken Marino,David Wain",Quirky inter-related stories. Standing by two ...,"Paul Rudd,Jon Hamm,Ken Marino,Mather Zickel,Ra...","Comedy,Romance","USA,Mexico","English,Spanish",$5250000,$785528,$769726,96.0,"City Lights Pictures,Inverted Film,Jade Films",5250000.0,785528.0,769726.0,1.254971,"/name/nm0748620/,/name/nm0358316/,/name/nm0547...",0,0,0,4.0,25.0,"police-officer,two-word-title,sabbath,surgeon,..."
tt2249125,The Los Angeles Ripper,The Los Angeles Ripper,2011-08-01,2011.0,3.7,82.0,,1.0,19.0,wridir_not_provided,"Celeste Marie Martinez,Craig J. McIntyre",Kristy White moves to Los Angeles to stay with...,"Celeste Marie Martinez,Ava Rose,Beverley Basse...","Crime,Horror,Thriller","UK,USA",English,,,,78.0,"Moss Stomper Productions,372 Film Production,5x35",,,,1.157113,castIDs_not_provided,0,0,0,8.0,1.0,keys_not_provided
tt0038157,That's the Spirit,That's the Spirit,1945-06-01,1945.0,7.4,47.0,,5.0,2.0,wridir_not_provided,"Michael Fessier,Ernest Pagano",A vaudeville performer returns from the dead t...,"Peggy Ryan,June Vincent,Johnny Coy,Arthur Trea...","Comedy,Musical",USA,English,,,,87.0,Universal Pictures,,,,14.448684,castIDs_not_provided,0,0,0,6.0,1.0,wife
tt2293060,Americons,Americons,2015-01-22,2015.0,5.1,345.0,25.0,2.0,5.0,Theo Avgerinos,"Beau Martin Williams,Matt Funke",The year is 2007 and twenty percent of the sta...,"Beau Martin Williams,Trai Byers,Sam McMurray,T...","Action,Biography,Crime,Drama",USA,English,$1250000,$28782,$28782,89.0,"Industry Artists Group,Martin Entertainment Gr...",1250000.0,28782.0,28782.0,1.0981,"/name/nm2842106/,/name/nm1784293/,/name/nm0573...",0,0,0,1.0,22.0,journey


## Using the IMDbPY API:
#### Please note that it is better to sleep for a short time between iterations in order to avoid flooding the server with requests. Also, the key to finding movie metadata with this API is the IMDb movie ID, which has the form 'tt-------'. Finally, it is worth mentioning that although we already have some keywords associated with each title in our dataset, this method will yield all available keywords for each movie.

In [13]:
# Fetch the keyword list of every sampled title from IMDb.
#   keywords_dict: movie ID -> list of keywords ('' when the lookup failed)
#   failed_ids:    (movie ID, error) pairs for the lookups that failed
ia = IMDb()
keywords_dict = {}
failed_ids = []
for movie_index in tqdm(movies_index):
    # Pause between requests so the server is not flooded.
    sleep(0.5)
    try:
        # Drop the leading two characters ('tt' in IDs like 'tt1877707')
        # before passing the ID to the API.
        response = ia.get_movie_keywords(movie_index[2:])
        keywords_dict[movie_index] = response['data']['keywords']
    except Exception as exc:
        # Best-effort: one bad title must not abort the whole run. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt through,
        # so the loop can be stopped by hand. The '' placeholder is kept so
        # the title still appears in the exported table.
        keywords_dict[movie_index] = ''
        failed_ids.append((movie_index, repr(exc)))

# Make failures visible instead of leaving them as silent empty entries.
if failed_ids:
    print(len(failed_ids), 'of', len(movies_index),
          'lookups failed or returned no keywords')

100%|██████████| 100/100 [02:47<00:00,  1.67s/it]


In [14]:
keywords = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in keywords_dict.items() ])).transpose()
keywords = keywords.apply(lambda x: ','.join(x.dropna()), axis=1)
keywords = pd.DataFrame(keywords) 
keywords.rename(columns={0:'keywords'}, inplace=True)
keywords.index.rename('movie_id', inplace=True)
keywords.to_csv('sample_keywords.csv')
!cp sample_keywords.csv "/content/drive/MyDrive/imdb_datasets"  

In [16]:
# Read the exported keyword table back from Drive and display only the
# titles that actually have a keyword string (empty strings load as NaN).
key_df = pd.read_csv('/content/drive/MyDrive/imdb_datasets/sample_keywords.csv')
key_df.dropna(subset=['keywords'])

Unnamed: 0,movie_id,keywords
0,tt1877707,school
1,tt0811106,"police-officer,two-word-title,sabbath,surgeon,..."
3,tt0038157,wife
4,tt2293060,journey
5,tt0016381,"b-movie,double-feature-film,race-car-driver,ra..."
...,...,...
93,tt0045502,"yuma-arizona,arizona-territory,sonora-mexico,y..."
94,tt0034432,"murder,assumed-name,small-town,radio-program,b..."
95,tt3081722,friendship
97,tt0181594,"jewish-brigade,israel,refugee,holocaust,palest..."
