In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import pandas as pd
import re
import geopandas as gpd
import folium

In [2]:
# driver
# Listing page that gets scraped. The percent-encoded last path segment decodes
# to a Persian phrase meaning "comedy theatre".
url = 'https://cinematicket.org/movie/more/3/%D8%AA%D8%A6%D8%A7%D8%AA%D8%B1%20%DA%A9%D9%85%D8%AF%DB%8C'
# Module-level Firefox session. get_movies() and get_movie_detail() read this
# `driver` global directly instead of receiving it as an argument.
driver = webdriver.Firefox()
# Open the listing page so that get_movies() can wait for and scrape its cards.
driver.get(url)

In [3]:
# func
def get_movies(time_out=2, scroll_pause=1):
    """Scrape the movie cards of the listing page currently open in ``driver``.

    The page is scrolled to the bottom repeatedly until its height stops
    growing, so that lazily loaded cards are present before parsing.

    Parameters
    ----------
    time_out : int or float, default 2
        Seconds to wait for the first ``detailBox`` element to appear.
    scroll_pause : int or float, default 1
        Seconds to sleep between two scroll attempts (and once more before
        the page source is parsed).

    Returns
    -------
    pandas.DataFrame
        One row per card with the columns ``movie_id``, ``movie_title``,
        ``rate``, ``rate_count``, ``movie_link`` and ``etl_date``. All scraped
        values are kept as text; ``etl_date`` is today's date (``YYYY-MM-DD``).

    Raises
    ------
    ValueError
        Raised by pandas when the selectors return lists of different lengths.
    """
    # Scrolls to the bottom and reports the resulting document height.
    scroll_js = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

    WebDriverWait(driver, time_out).until(EC.presence_of_element_located((By.CLASS_NAME, 'detailBox')))

    page_height = driver.execute_script(scroll_js)
    while True:
        previous_height = page_height
        time.sleep(scroll_pause)
        page_height = driver.execute_script(scroll_js)
        # An unchanged height means the last scroll loaded nothing new.
        if page_height == previous_height:
            break
    time.sleep(scroll_pause)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, which makes results machine-dependent and warns.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Both the id and the absolute link come from the same anchors.
    hrefs = [anchor['href'] for anchor in soup.select('#plate a')]
    movies = pd.DataFrame({
        'movie_id': [href.replace('/movie/detail/', '') for href in hrefs],
        'movie_title': [tag.text for tag in soup.select('#plate h3')],
        'rate': [tag.text for tag in soup.select('.rate')],
        'rate_count': [tag.text for tag in soup.select('.duration')],
        'movie_link': ['https://cinematicket.org' + href for href in hrefs],
    })
    movies['etl_date'] = datetime.now().strftime('%Y-%m-%d')

    return movies

def get_movies_detail(movies, persian_year='1402'):
    """Collect and normalise the showtimes of every movie in ``movies``.

    Each ``movie_link`` is passed to ``get_movie_detail``; the per-movie frames
    are stacked, rows marked ``'Close'`` are dropped and the raw text columns
    are cleaned.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must contain a ``movie_link`` column. Its index may be arbitrary.
    persian_year : str or int, default '1402'
        Year prefixed to every ``persian_date``; the scraped day label carries
        only a day number and a month name.

    Returns
    -------
    pandas.DataFrame
        Columns ``cinema_title``, ``cinema_address``, ``prices_day`` (int64,
        first run of digits of the price text), ``times_day`` (text with the
        word 'سانس' removed), ``movie_id`` (int64, last path segment of the
        detail URL) and ``persian_date`` (``year/MM/day``, day not padded).
        An empty frame with these columns is returned when nothing was scraped.

    Raises
    ------
    KeyError
        When a day label does not reduce to one of the known month names.
    """
    months = {'فروردین':'01', 'اردیبهشت':'02', 'خرداد':'03', 'تیر':'04', 'مرداد':'05', 'شهریور':'06', 'مهر':'07', 'آبان':'08', 'آذر':'09', 'دی':'10', 'بهمن':'11', 'اسفند':'12'}
    output_columns = ['cinema_title', 'cinema_address', 'prices_day', 'times_day', 'movie_id', 'persian_date']

    def _to_persian_date(day_label):
        # NOTE(review): the month key is built from *every* character in
        # U+0600-U+06FF, so the label must hold no other Arabic-script text
        # (weekday names, Persian digits) - TODO confirm against the live page.
        month_name = ''.join(re.findall(r'[\u0600-\u06FF]', day_label))
        day_number = re.findall(r'\d+', day_label)[0]
        return str(persian_year) + '/' + months[month_name] + '/' + day_number

    # Iterate over the links themselves (not over .loc[0..n-1]) so a filtered
    # or re-indexed `movies` frame works, and collect frames for one concat.
    frames = []
    for position, movie_link in enumerate(movies['movie_link']):
        frames.append(get_movie_detail(movie_link))
        print(position, 'Done')

    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=output_columns)

    detail_df = pd.concat(frames, ignore_index=True)
    # .copy() so the column assignments below write to an independent frame.
    detail_df = detail_df[detail_df['times_day']!='Close'].copy()
    detail_df['times_day'] = detail_df['times_day'].apply(lambda x: x.replace('سانس',''))
    detail_df['movie_id'] = detail_df['url'].apply(lambda x: x.split('/')[-1])
    detail_df['prices_day'] = detail_df['prices_day'].apply(lambda x: re.search(r'\d+', x).group())
    detail_df['persian_date'] = detail_df['day'].apply(_to_persian_date)
    detail_df = detail_df.astype({'movie_id':'int64', 'prices_day':'int64'})
    detail_df = detail_df.drop(['url', 'day'], axis=1)
    return detail_df.reset_index(drop=True)

def get_movie_detail(url:str):
    """Scrape the per-day showtimes of one movie detail page.

    Loads ``url`` in the module-level ``driver``, dismisses the city-selection
    dialog when it is shown, then walks through the date tabs and records the
    cinema, prices and start times displayed for each day.

    Parameters
    ----------
    url : str
        Absolute URL of the movie detail page.

    Returns
    -------
    pandas.DataFrame
        One row per showtime with the columns ``day``, ``cinema_title``,
        ``cinema_address``, ``prices_day``, ``times_day`` and ``url``. A day
        without a cinema box yields a single row whose ``prices_day`` and
        ``times_day`` are ``'Close'``. An empty frame with the same columns is
        returned when the page lists no days.

    Raises
    ------
    ValueError
        Raised by pandas when a day shows a different number of prices and
        start times.
    """
    detail_columns = ['day', 'cinema_title', 'cinema_address', 'prices_day', 'times_day', 'url']

    print(url)
    driver.get(url)
    time.sleep(5)

    # find_elements returns an empty list instead of raising, so a page that
    # does not show the city dialog no longer aborts the whole scrape.
    city_buttons = driver.find_elements(By.XPATH, '//*[@id="mat-mdc-dialog-0"]/div/div/app-choose-city-dialog/div/div/div[1]/button/span[3]')
    if city_buttons:
        city_buttons[0].click()

    driver.execute_script("window.scrollTo(0, 700);")
    time.sleep(2)
    # Parse only after the scroll and wait, so the snapshot reflects what the
    # page shows once the dialog is gone.
    soup_detail = BeautifulSoup(driver.page_source, 'html.parser')

    day_tags = soup_detail.select('.subtitle')
    if not day_tags or day_tags[0].text == '':
        # No date tabs: previously this path fell through to an unbound
        # `detail_df_i` (or an IndexError); return a well-formed empty frame.
        return pd.DataFrame(columns=detail_columns)

    day_labels = [tag.text for tag in day_tags]
    day_frames = []
    for position, day_label in enumerate(day_labels):
        cinema_titles = soup_detail.select('.cinemaBox .title')
        if len(cinema_titles) == 0:
            cinema_title = ''
            cinema_address = ''
            prices = ['Close']
            times = ['Close']
        else:
            # Only the first cinema box is read for the day.
            cinema_title = cinema_titles[0].text
            cinema_address = driver.find_element(By.XPATH, '//div[1]/div[1]/div[2]/div[2]/span').text
            prices = [p.text for p in soup_detail.select('#cinemas .amount.ng-star-inserted')]
            times = [t.text for t in soup_detail.select('.cinemaBox .start')]
        if len(prices) == 0:
            # Fall back to the broader selector when the first one matches nothing.
            prices = [p.text for p in soup_detail.select('.cinemaBox .amount')]

        # Scalars are broadcast over the rows defined by the two lists.
        day_frames.append(pd.DataFrame({
            'day': day_label,
            'cinema_title': cinema_title,
            'cinema_address': cinema_address,
            'prices_day': prices,
            'times_day': times,
        }))

        if position != len(day_labels) - 1:
            # Date tabs are 1-based in the XPath, so the next tab is position + 2.
            driver.find_element(By.XPATH, '//*[@id="sessionDateList"]/div[{}]/span[1]'.format(position+2)).click()
            time.sleep(2)
            soup_detail = BeautifulSoup(driver.page_source, 'html.parser')

    detail_df_i = pd.concat(day_frames).reset_index(drop=True)
    detail_df_i['url'] = url
    return detail_df_i

def _write_table_sheet(excel_writer, frame, sheet_name):
    """Write ``frame`` to ``sheet_name`` as a centred, auto-fitted Excel table."""
    frame.style.set_properties(**{'text-align': 'center'}).to_excel(excel_writer, sheet_name=sheet_name, index=False)
    # Fetch the sheet pandas just created rather than pre-creating it on the
    # workbook; the latter depends on how the installed pandas tracks sheets.
    worksheet = excel_writer.sheets[sheet_name]
    # A table needs the header row plus at least one data row, so keep the
    # range valid for an empty frame.
    last_row = max(frame.shape[0], 1)
    worksheet.add_table(0, 0, last_row, frame.shape[1] - 1, {'columns': [{'header': header} for header in frame.columns]})
    worksheet.autofit()


def load_to_excel(movies_df, movies_detail_df, path='./theatres.xlsx'):
    """Export both frames to one Excel workbook, each as a formatted table.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Written to the sheet ``theatres``.
    movies_detail_df : pandas.DataFrame
        Written to the sheet ``theatres_times``.
    path : str, default './theatres.xlsx'
        Destination workbook; an existing file is overwritten.

    Returns
    -------
    None
    """
    with pd.ExcelWriter(path, engine='xlsxwriter') as excel_writer:
        _write_table_sheet(excel_writer, movies_df, 'theatres')
        _write_table_sheet(excel_writer, movies_detail_df, 'theatres_times')

def get_cinemas_location():
    """Return the hand-maintained coordinates of the known cinemas.

    Returns
    -------
    pandas.DataFrame
        Columns ``cinema_title`` and ``location``, one row per cinema, in the
        order listed below. ``location`` is the text ``'POINT (x, y)'`` exactly
        as written here (note the comma between the two numbers).
    """
    # The original table listed 'سینما صحرا تهران' twice; a dict keeps only the
    # last value for a repeated key, so only that effective entry is kept.
    static_cinema_points = {
        'سینما حافظ تهران' :                    'POINT (51.42616268140596, 35.69254602974092)',
        'سینما دهکده المپیک تهران' :           'POINT (51.26397804703257, 35.76151901644865)',
        'سینما صحرا تهران' :                    'POINT (51.43452755470618, 35.707904180924736)',
        'سینما جوان تهران' :                    'POINT (51.43628641238085, 35.78549376021636)',
        'سینما تئاتر گلریز' :                   'POINT (51.40609599703529, 35.733149254730705)',
        'سینما قدس تهران' :                     'POINT (51.40658396819857, 35.71129464828836)',
        'سالن سوده' :                            'POINT (51.33683942772283, 35.746943539521574)',
        'سینما میلاد تهران' :                    'POINT (51.447510470048854, 35.6893792640906)',
        'پردیس سینمایی صبا مال' :               'POINT (51.39311596819754, 35.684372475577405)',
        'سینما ایران تهران' :                   'POINT (51.44012879268799, 35.71623747234535)',
        'پردیس سینمایی نارسیس' :                'POINT (51.53558865470763, 35.74582981132521)',
        'پردیس سینمایی پرده طلایی' :             'POINT (51.34338595470332, 35.63574446606407)',
        'سالن تئاتر دماوند' :                   'POINT (51.45684391052673, 35.70502778909879)',
        'تماشاخانه سبحان.' :                    'POINT (51.4615254547084, 35.768029157020024)',
        'سالن همایش\u200cهای امام علی' :        'POINT (51.31425628661024, 35.746620906041265)',
        'پردیس سینمایی هروی سنتر' :             'POINT (51.47907749703658, 35.76680089171342)',
        'سالن نمایش خانه مشق' :                 'POINT (51.46255005958102, 35.59208255349042)',
        'سرای محله زعفرانیه' :                  'POINT (51.408809268202475, 35.81154918389405)',
        'سینما تیراژه ۲' :                      'POINT (51.451366481691046, 35.70897350166831)',
        'سینما سپیده تهران' :                   'POINT (51.39951932587001, 35.70154341021152)',
        'پردیس سینمایی شهرک' :                  'POINT (51.365321012379546, 35.75239731238989)',
        'سالن جام جم همت' :                     'POINT (51.32603288169255, 35.753945929499366)',
        'سینما سروش تهران' :                    'POINT (51.435884641213796, 35.709644787660125)',
        'سالن ایوانک غربی' :                    'POINT (51.3445377700516, 35.7564111036055)',
        'سالن همایش و نمایش طهران' :            'POINT (51.32798945470792, 35.75326684502728)',
        'پردیس سینمایی لوتوس مال' :             'POINT (51.4050380565549, 35.63938137873617)',
        'پردیس تئاتر و موسیقی باغ کتاب' :       'POINT (51.43262425470769, 35.75107087243446)',
        'پردیس سینمایی زندگی' :                 'POINT (51.32641091422993, 35.73369329856013)',
        'سینما دزاشیب تهران' :                  'POINT (51.450480124022924, 35.81015328454983)',
        'پردیس سینمایی مگامال' :                'POINT (51.30797795470598, 35.70512447664897)',
        'پردیس سینمایی معین مال' :              'POINT (51.34768841237744, 35.69757453628004)',
        'سینما ماندانا تهران' :                 'POINT (51.491220412378205, 35.71481149460921)',
        'پردیس سینمایی کورش' :                  'POINT (51.3137274277225, 35.73865169351137)'
    }
    # Build the two-column frame directly from the (title, point) pairs.
    cinema_points_df = pd.DataFrame(list(static_cinema_points.items()), columns=['cinema_title', 'location'])

    return cinema_points_df


In [4]:
# get_movies
# Scrape the listing page currently open in `driver` into a DataFrame of movies.
movies_df = get_movies()

In [None]:
# get_movies_detail
# Visit every movie_link of movies_df and collect its showtimes.
# NOTE: slow - get_movie_detail() sleeps for several seconds on each page it opens.
movies_detail_df = get_movies_detail(movies_df)

In [None]:
# init df from excel
#movies_detail_df = pd.read_excel('./theatres.xlsx', sheet_name='theatres_times')
#movies_detail_df.drop(['location','location_x', 'location_y'], axis=1, inplace=True)
#movies_df = pd.read_excel('./theatres.xlsx')[['movie_id', 'movie_title', 'rate', 'rate_count', 'movie_link', 'etl_date']]

In [19]:
# load_to_excel
# Derive new, stage-named frames instead of overwriting movies_df and
# movies_detail_df. Overwriting them made this cell fail on a second run,
# because the already-merged frames were merged again and pandas suffixed the
# repeated columns (location_x / location_y, ...).

# Cast the scraped text columns and add the derived score.
# satisfied_rate_count scales rate_count by rate / 5, truncated to an integer
# (rate looks like a 0-5 score - confirm against the site).
movies_scored_df = (
    movies_df
    .astype({'movie_id':'int64', 'rate':'float', 'rate_count':'int64'})
    .assign(satisfied_rate_count=lambda df: (df['rate_count'] * df['rate'] / 5).astype('int64'))
)

# Attach the static coordinates; cinemas missing from the table get an empty location.
cinema_points_df = get_cinemas_location()
movies_detail_located_df = movies_detail_df.merge(cinema_points_df, how='left', on='cinema_title')

# One row per (movie, cinema) pair for the summary sheet.
movie_cinemas_df = (
    movies_detail_located_df[['movie_id', 'cinema_title', 'cinema_address', 'location']]
    .drop_duplicates()
    .reset_index(drop=True)
)
movies_export_df = movies_scored_df.merge(movie_cinemas_df, how='left', on='movie_id')

load_to_excel(movies_export_df, movies_detail_located_df)

In [None]:
# geodf
from shapely.wkt import loads

# Read back the first sheet of the exported workbook.
location_df = pd.read_excel('./theatres.xlsx')[['movie_title', 'satisfied_rate_count', 'cinema_title', 'location']]

# Rows whose cinema is missing from the static coordinate table have an empty
# location after the left merge; they cannot be parsed as WKT, so drop them
# instead of letting `loads` fail on a NaN.
location_df = location_df.dropna(subset=['location']).reset_index(drop=True)

# The stored text is 'POINT (x, y)'; the comma is removed before parsing.
# regex=False makes the replacement a literal one on every pandas version.
location_df['geometry'] = location_df['location'].str.replace(',', '', regex=False).apply(loads)

gdfs = gpd.GeoDataFrame(location_df, crs='epsg:4326')
gdfs.head()

In [79]:
# folium
# Circle radius bounds: the largest satisfied_rate_count maps to `upper`,
# a count of zero maps to `lower`.
upper = 2500
lower = 100

counts = gdfs['satisfied_rate_count']
# Compute the maximum once instead of once per row.
max_count = counts.max()
# Guard against an empty frame or an all-zero column (no division by zero).
share = counts / max_count if max_count > 0 else counts * 0.0
# Use `lower` (not a literal 100) so changing the bound actually takes effect.
gdfs['radius'] = (lower + (upper - lower) * share).astype('int64')

my_map = folium.Map(prefer_canvas=True)
folium.GeoJson(
    gdfs,
    marker=folium.Circle(fill_color="blue", color="black", weight=1),
    tooltip=folium.GeoJsonTooltip(fields=["movie_title", "satisfied_rate_count", "cinema_title"]),
    popup=folium.GeoJsonPopup(fields=["movie_title", "satisfied_rate_count", "cinema_title"]),
    # Each feature carries its own radius in its properties.
    style_function=lambda x: {"radius": (x['properties']['radius'])},
    highlight_function=lambda x: {"fillOpacity": 0.4}
).add_to(my_map)
my_map.fit_bounds(my_map.get_bounds())
# Last expression: returns the map so the notebook renders it.
my_map.add_child(folium.map.LayerControl())

In [None]:
# comment
# Parse the movie detail page currently open in `driver`. `soup_detail` is
# otherwise only a local variable inside get_movie_detail(), so this cell
# raised NameError when run on a fresh kernel.
soup_detail = BeautifulSoup(driver.page_source, 'html.parser')
comment_user_name = [i.text for i in soup_detail.select('.profileBox .name')]
comment = [i.text for i in soup_detail.select('.messageBox')]
comment_date = [i.text for i in soup_detail.select('.date')]
# TODO: click the "more" button so the remaining comments are loaded before parsing
# click more button

In [6]:
# Display the movies table.
movies_df

Unnamed: 0,movie_id,movie_title,rate,rate_count,movie_link,etl_date
0,6019,کمدی موزیکال هوو,4.4,860,https://cinematicket.org/movie/detail/6019,2023-12-10
1,6080,فیتیله پیچ,3.6,159,https://cinematicket.org/movie/detail/6080,2023-12-10
2,6087,جزیره ۱۵۰۰ (معین شو ۷),4.1,178,https://cinematicket.org/movie/detail/6087,2023-12-10
3,5920,اگه بشه چی میشه,4.5,688,https://cinematicket.org/movie/detail/5920,2023-12-10
4,6085,پارازیت,3.9,303,https://cinematicket.org/movie/detail/6085,2023-12-10
5,6051,اول آشناییمون,4.3,350,https://cinematicket.org/movie/detail/6051,2023-12-10
6,6049,تهران دهه ۷۰,3.9,281,https://cinematicket.org/movie/detail/6049,2023-12-10
7,6060,تئاتر گاوچرون,3.7,222,https://cinematicket.org/movie/detail/6060,2023-12-10
8,6083,دریاکنار,4.2,210,https://cinematicket.org/movie/detail/6083,2023-12-10
9,6070,جاده چالوس,4.3,393,https://cinematicket.org/movie/detail/6070,2023-12-10
