In [16]:
import pandas as pd
from imdb import Cinemagoer
import imdb
import time
import glob
import os

In [17]:
class Paths:
    notebook = os.path.dirname(os.path.realpath('filmweb_scraping'))
    proj = os.path.dirname(notebook)
    driver = os.path.join(proj, 'drivers', 'chromedriver')
    data = os.path.join(proj, 'data')
    data_fw = os.path.join(proj, 'data', 'filmweb_data')

## Getting data from IMDB

In [8]:
def get_imdb_data(df):
    ia = imdb.Cinemagoer()
    imdb_dict={}
    imdb_list=[]

    for elem in df['original_title']:

        m = ia.search_movie(elem)
        try:
            movie = ia.get_movie(m[0].movieID)
        except IndexError:
            imdb_dict['title'] = elem
            imdb_dict['direction'] = '-'
            imdb_dict['screenplay'] = '-'
            imdb_dict['rating'] = '-'

            imdb_list.append(imdb_dict)
            imdb_dict={}
            
            continue

        try:
            writer = [movie.get('writer')[x] for x in range(len(movie.get('writer')))]
        except TypeError:
            writer = '-'
        try:
            director = [movie.get('director')[x] for x in range(len(movie.get('director')))]
        except TypeError:
            director = '-'

        rating = movie.get('rating')
        try:
            genres = movie.get('genres')
        except TypeError:
            genres = '-'

        imdb_dict['title'] = elem
        imdb_dict['direction'] = ', '.join(str(x) for x in director if str(x) != '')
        imdb_dict['screenplay'] = ', '.join(str(x) for x in writer if str(x) != '')
        imdb_dict['rating'] = rating
        try:
            imdb_dict['genres'] = ', '.join(str(x) for x in genres if str(x) != '')
        except Exception:
            imdb_dict['genres'] = '-'

        imdb_list.append(imdb_dict)
        imdb_dict={}
        
    return imdb_list


In [9]:
df = pd.read_csv(
    fr'{Paths.data_fw}\fw_data_concat.csv',
    sep=',',
    decimal='.',
    encoding='utf-8'
)

data_dict = get_imdb_data(df)

In [13]:
df_imdb = pd.DataFrame.from_dict(data_dict)
df_imdb

Unnamed: 0,title,direction,screenplay,rating,genres
0,The Shawshank Redemption,Frank Darabont,"Stephen King, Frank Darabont",9.3,Drama
1,Intouchables,"Olivier Nakache, Éric Toledano","Olivier Nakache, Philippe Pozzo di Borgo, Éric...",8.5,"Biography, Comedy, Drama"
2,The Green Mile,Frank Darabont,"Stephen King, Frank Darabont",8.6,"Crime, Drama, Fantasy, Mystery"
3,The Godfather,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola, Mario Puzo",9.2,"Crime, Drama"
4,12 Angry Men,Sidney Lumet,"Reginald Rose, Reginald Rose",9.0,"Crime, Drama"
...,...,...,...,...,...
1340,Orzeł,-,"Mai Brostrøm, Peter Thorsboe",7.4,"Crime, Drama, Mystery"
1341,Heaven & Earth,Oliver Stone,"Le Ly Hayslip, Jay Wurts, Le Ly Hayslip, James...",6.8,"Action, Biography, Drama, History, War"
1342,A zori zdes tikhie,Stanislav Rostotskiy,"Stanislav Rostotskiy, Boris Vasilev",8.2,"Drama, History, War"
1343,Pearl Harbor,Michael Bay,Randall Wallace,6.2,"Action, Drama, History, Romance, War"


## Merging and saving data

In [15]:
fw_imdb_merge = pd.merge(
    left=df,
    right=df_imdb,
    how='left',
    left_on='original_title',
    right_on='title'
)
fw_imdb_merge.head()

Unnamed: 0,polish_title,original_title,year,rating_x,title,direction,screenplay,rating_y,genres
0,Skazani na Shawshank,The Shawshank Redemption,1994,876,The Shawshank Redemption,Frank Darabont,"Stephen King, Frank Darabont",9.3,Drama
1,Nietykalni,Intouchables,2011,861,Intouchables,"Olivier Nakache, Éric Toledano","Olivier Nakache, Philippe Pozzo di Borgo, Éric...",8.5,"Biography, Comedy, Drama"
2,Zielona mila,The Green Mile,1999,860,The Green Mile,Frank Darabont,"Stephen King, Frank Darabont",8.6,"Crime, Drama, Fantasy, Mystery"
3,Ojciec chrzestny,The Godfather,1972,859,The Godfather,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola, Mario Puzo",9.2,"Crime, Drama"
4,Dwunastu gniewnych ludzi,12 Angry Men,1957,856,12 Angry Men,Sidney Lumet,"Reginald Rose, Reginald Rose",9.0,"Crime, Drama"


In [16]:
fw_imdb_merge.rename(columns={'rating_x': 'rating_fw', 'rating_y': 'rating_imdb'}, inplace=True)
fw_imdb_merge.drop('title', 1, inplace=True)
fw_imdb_merge.head()

Unnamed: 0,polish_title,original_title,year,rating_fw,direction,screenplay,rating_imdb,genres
0,Skazani na Shawshank,The Shawshank Redemption,1994,876,Frank Darabont,"Stephen King, Frank Darabont",9.3,Drama
1,Nietykalni,Intouchables,2011,861,"Olivier Nakache, Éric Toledano","Olivier Nakache, Philippe Pozzo di Borgo, Éric...",8.5,"Biography, Comedy, Drama"
2,Zielona mila,The Green Mile,1999,860,Frank Darabont,"Stephen King, Frank Darabont",8.6,"Crime, Drama, Fantasy, Mystery"
3,Ojciec chrzestny,The Godfather,1972,859,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola, Mario Puzo",9.2,"Crime, Drama"
4,Dwunastu gniewnych ludzi,12 Angry Men,1957,856,Sidney Lumet,"Reginald Rose, Reginald Rose",9.0,"Crime, Drama"


In [17]:
fw_imdb_merge.to_csv(
    fr'{Paths.data}\movies_data.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    index=False
)