In [27]:
import glob
import os
import time

import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

## Scraping Filmweb

In [14]:
def get_movies_data(url, length, max_stalled_scrolls=75):
    """Scrape one ranking page and return its rows as a list of dicts.

    Opens ``url`` in Chrome, clicks away the two elements that are dismissed
    on load, presses PAGE_DOWN until the last ranking position shown equals
    ``length``, and then reads four text fields from every ranking row.

    Parameters
    ----------
    url : str
        Address of the ranking page to open.
    length : str or int
        Position number expected on the last row once the ranking is fully
        loaded (e.g. ``'100'``). It is compared as text against the label of
        the last position element, so an int is converted with ``str``.
    max_stalled_scrolls : int, optional
        How many consecutive PAGE_DOWN presses may pass without a new
        position element appearing before the scrape is abandoned.

    Returns
    -------
    list of dict
        One dict per ranking row with the keys ``polish_title``,
        ``original_title``, ``year`` and ``rating``; all values are the
        element texts as strings.

    Raises
    ------
    TimeoutError
        If the ranking stops growing before position ``length`` is reached.
    selenium.common.exceptions.NoSuchElementException
        If one of the expected page elements cannot be found.
    """
    target_position = str(length)
    movies = []
    browser = Chrome(r'\Users\micha\project_filmweb_copy\drivers\chromedriver')

    # Everything after the browser starts runs under try/finally so that a
    # missing element or a stalled page cannot leave a Chrome instance behind.
    try:
        browser.get(f'{url}')

        html = browser.find_element(By.TAG_NAME, 'html')
        time.sleep(3)
        # Presumably an ad "skip" button followed by the cookie-consent
        # button -- TODO confirm against the live page.
        html.find_element(By.CLASS_NAME, 'ws__skipButton').click()
        time.sleep(1)
        html.find_element(By.ID, 'didomi-notice-agree-button').click()

        # NOTE(review): the value holds two class names joined by a dot, the
        # same lookup the original find_element_by_class_name call performed.
        list_container = html.find_element(
            By.CLASS_NAME, 'page__container.rankingTypeSection__container'
        )

        loaded_rows = 0
        stalled_scrolls = 0
        while True:
            rank_num = list_container.find_elements(By.CLASS_NAME, 'rankingType__position')
            if rank_num and rank_num[-1].text == target_position:
                break

            # Count scrolls that reveal nothing new so that a page which never
            # reaches the requested position fails instead of looping forever.
            if len(rank_num) > loaded_rows:
                loaded_rows = len(rank_num)
                stalled_scrolls = 0
            else:
                stalled_scrolls += 1
                if stalled_scrolls >= max_stalled_scrolls:
                    raise TimeoutError(
                        f'Ranking at {url} stopped loading after {loaded_rows} rows; '
                        f'position {target_position} was never reached'
                    )

            html.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.4)

        for row in list_container.find_elements(By.CLASS_NAME, 'rankingType'):
            movies.append({
                'polish_title': row.find_element(By.CLASS_NAME, 'rankingType__title').text,
                # Drops the last five characters of the element text --
                # presumably a trailing " YYYY" year suffix; TODO confirm.
                'original_title': row.find_element(By.CLASS_NAME, 'rankingType__originalTitle').text[:-5],
                'year': row.find_element(By.CLASS_NAME, 'rankingType__year').text,
                'rating': row.find_element(By.CLASS_NAME, 'rankingType__rate--value').text,
            })
    finally:
        # quit() ends the whole driver session, not just the current window.
        browser.quit()

    return movies

#### List of web pages

In [20]:
# One tuple per ranking page: (page address, genre label, expected last position).
_ranking_pages = [
    ('https://www.filmweb.pl/ranking/film', 'top_500', '500'),
    ('https://www.filmweb.pl/ranking/film/Komedia/13', 'comedy', '100'),
    ('https://www.filmweb.pl/ranking/film/Horror/12', 'horror', '100'),
    ('https://www.filmweb.pl/ranking/film/Thriller/24', 'thriller', '100'),
    ('https://www.filmweb.pl/ranking/film/Komedia+rom./30', 'rom_com', '100'),
    ('https://www.filmweb.pl/ranking/film/Animacja/2', 'animation', '100'),
    ('https://www.filmweb.pl/ranking/film/Wojenny/26', 'war', '100'),
    ('https://www.filmweb.pl/ranking/film/Sci-Fi/33', 'sci-fi', '100'),
    ('https://www.filmweb.pl/ranking/film/Akcja/28', 'action', '100'),
    ('https://www.filmweb.pl/ranking/film/Dla+m%C5%82odzie%C5%BCy/41', 'kids', '100'),
    ('https://www.filmweb.pl/ranking/film/Krymina%C5%82/15', 'crime', '100'),
    ('https://www.filmweb.pl/ranking/film/Fantasy/9', 'fantasy', '100'),
    ('https://www.filmweb.pl/ranking/film/%C5%9Awi%C4%85teczny/78', 'christmas', '100'),
    ('https://www.filmweb.pl/ranking/film/Biograficzny/3', 'biopic', '100'),
]

# Expanded into the dict records consumed by the scraping loop.
urls = [
    {'url': page_url, 'genre': genre_label, 'length': last_position}
    for page_url, genre_label, last_position in _ranking_pages
]


#### Saving data 

In [17]:
def save_data(data, genre, out_dir=r'\Users\micha\project_filmweb_copy\data\filmweb_data'):
    """Write scraped ranking rows to ``fw_data_<genre>.csv`` inside ``out_dir``.

    Parameters
    ----------
    data : list of dict
        Row records; each dict becomes one CSV row and its keys become the
        column names.
    genre : str
        Label inserted into the output file name.
    out_dir : str, optional
        Directory that receives the file. Defaults to the project's data
        directory and is created if it does not exist yet.

    Returns
    -------
    None
        The function only writes the file.
    """
    # Create the target directory up front so a fresh checkout does not fail
    # on the very first write.
    os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(data)
    df.to_csv(
        os.path.join(out_dir, f'fw_data_{genre}.csv'),
        sep=',',
        decimal='.',
        encoding='utf-8',
        index=False,
    )

In [21]:
# Scrape every configured ranking page and write each result to its own CSV file.
for url in urls:
    page_address, last_position, genre_label = url['url'], url['length'], url['genre']
    data_dict = get_movies_data(page_address, last_position)
    save_data(data_dict, genre_label)

  browser = Chrome(r'\Users\micha\project_filmweb_copy\drivers\chromedriver')
  html = browser.find_element_by_tag_name('html')


## Concatenating, filtering

In [30]:
# Directory holding the per-ranking CSV files written by the scraping step.
filmweb_data_dir = r'\Users\micha\project_filmweb_copy\data\filmweb_data'
top_500_path = fr'{filmweb_data_dir}\fw_data_top_500.csv'

# The top-500 file is read first and therefore left out of the genre list.
# fw_data_concat.csv is left out as well: the notebook's final cell writes it
# into this same directory and it matches the fw_data_*.csv pattern, so a
# second run would otherwise feed the previous output back into the input.
excluded_names = ('fw_data_top_500.csv', 'fw_data_concat.csv')

# Sorted because glob returns matches in no guaranteed order, and the order
# decides which row survives the later drop_duplicates(keep='first').
files = sorted(
    path
    for path in glob.glob(fr'{filmweb_data_dir}\fw_data_*.csv')
    if not path.endswith(excluded_names)
)

# Read every file once and concatenate in a single call instead of growing
# the frame inside the loop, which recopies all earlier rows on each pass.
df_new = pd.concat(
    [
        pd.read_csv(path, sep=',', decimal='.', encoding='utf-8')
        for path in [top_500_path, *files]
    ],
    ignore_index=True,
    axis=0,
)

In [33]:
# Display the concatenated frame: top-500 rows first, then the per-genre rankings.
df_new

Unnamed: 0,polish_title,original_title,year,rating
0,Skazani na Shawshank,The Shawshank Redemption,1994,876
1,Nietykalni,Intouchables,2011,861
2,Zielona mila,The Green Mile,1999,860
3,Ojciec chrzestny,The Godfather,1972,859
4,Dwunastu gniewnych ludzi,12 Angry Men,1957,856
...,...,...,...,...
1795,Orzeł,,1958,743
1796,Pomiędzy niebem a ziemią,Heaven & Earth,1993,742
1797,Tak tu cicho o zmierzchu,A zori zdes tikhie,1972,741
1798,Pearl Harbor,,2001,741


In [36]:
# Where no original title was recorded, fall back to the Polish title of the same row.
df_new['original_title'] = df_new['original_title'].fillna(df_new['polish_title'])
df_new

Unnamed: 0,polish_title,original_title,year,rating
0,Skazani na Shawshank,The Shawshank Redemption,1994,876
1,Nietykalni,Intouchables,2011,861
2,Zielona mila,The Green Mile,1999,860
3,Ojciec chrzestny,The Godfather,1972,859
4,Dwunastu gniewnych ludzi,12 Angry Men,1957,856
...,...,...,...,...
1795,Orzeł,Orzeł,1958,743
1796,Pomiędzy niebem a ziemią,Heaven & Earth,1993,742
1797,Tak tu cicho o zmierzchu,A zori zdes tikhie,1972,741
1798,Pearl Harbor,Pearl Harbor,2001,741


In [40]:
# Report the frame size before and after removing rows that repeat an
# (original_title, year) pair; the first occurrence is the one kept.
print(f'Shape: {df_new.shape}')
df_new = df_new.drop_duplicates(
    subset=['original_title', 'year'],
    keep='first',
    ignore_index=True,
)
print(f'After dropping duplicates: {df_new.shape}')

Shape: (1800, 4)
After dropping duplicates: (1345, 4)


In [42]:
# Write the deduplicated frame to a single CSV file, without the index column.
concat_csv_path = r'\Users\micha\project_filmweb_copy\data\filmweb_data\fw_data_concat.csv'
df_new.to_csv(concat_csv_path, sep=',', decimal='.', encoding='utf-8', index=False)