**Goal of this notebook for the ML project:** scrape movie data from Rotten Tomatoes and IMDb.

## Scraping Rotten Tomatoes
<p><img style="float: right; margin:5px 20px 5px 1px; width:30%" src="rottentomatoes.jpg"></p>
<p></p>
<p>Rotten Tomatoes aggregates movie and TV show reviews from critics using a tomatometer score. Based on the percentage of positive reviews, a movie is labelled Fresh or Rotten. The tomatometer score represents the opinion of hundreds of film and television critics and is trusted by millions of fans.</p>

In this notebook, we scrape data from the Rotten Tomatoes website and save it to a `csv` file. The attributes obtained for each movie include `movieTitle`, `tMeterScore`, `releaseDate`, `link`, `actors`, `boxOffice`, `directedBy`, `genre`, `inTheaters`, `onDiscStreaming`, `rating`, `runtime`, `studio`, `writtenBy`

| Field | Field Description |
| -------------:|:-------------|
| `movieTitle`      |Title of movie |
| `tMeterScore`     | Tomatometer score (in percentage)      |
| `link` | Hyperlink to movie on Rotten Tomatoes     |
| `actors` | Names of major actors in movie      |
| `boxOffice` | Box Office      |
| `directedBy` | Movie director(s)     |
| `genre` | Genre of movie      |
| `inTheaters` | Date in theaters      |
| `onDiscStreaming` | Date on disc or streaming platforms      |
| `rating` | Movie rating     |
| `runtime` | Runtime      |
| `studio` | Studio which produced movie      |
| `writtenBy` | Movie writer(s)      |

In [None]:
#Import required libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request
import json
import csv
import requests

In [None]:
import time
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

In [None]:
#URL
pageLength = 1 #Number of pages to scrape
url = "https://www.rottentomatoes.com/browse/dvd-streaming-all/"

# create a new Chrome session
driver = webdriver.Chrome()
try:
    # Implicit wait: element lookups keep retrying for up to 30 seconds
    driver.implicitly_wait(30)
    driver.get(url)

    #Open movies in list mode
    first_button = driver.find_element(By.CLASS_NAME, 'icon-list')
    first_button.click()

    #Click "Show More" once per requested page to reveal more movies
    for _ in range(pageLength):
        try:
            show_more_button = driver.find_element(By.XPATH, "//button[contains(.,'Show More')]")
            show_more_button.click()
        except WebDriverException:
            # Button missing or not clickable: stop expanding and use what is shown.
            # Only Selenium errors are caught so Ctrl-C and real bugs still surface.
            break

    #Create soup from the expanded page
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # Always close the browser, even when scraping fails part-way
    driver.quit()

In [None]:
#Extract movie title, tomatometer score, movie release date and hyperlink

def _class_text(tag, class_name, default=None):
    """Return the text of the first element inside `tag` whose class is `class_name`.

    Returns `default` when no such element exists, so a movie that lacks a
    field (for example, no tomatometer score yet) does not abort the loop.
    """
    found = tag.find(attrs={'class': class_name})
    return default if found is None else found.text


d = {}

#Obtain details of all movies from soup
soup_ = soup.find(attrs={'class': 'row'})
for i, item in enumerate(soup_.find_all(attrs={'class': 'mb-movie'})):

    print(f"Doing movie number {i}.....", end='\r')

    # One fresh record per movie, keyed by its position in the listing
    d[i] = {
        # Kept as a one-element list, with the string 'None' as the
        # placeholder, because a later cell flattens this column
        'actors': [_class_text(item, 'actors', default='None')],
        'movieTitle': _class_text(item, 'movieTitle'),
        'tMeterScore': _class_text(item, 'tMeterScore'),
        'releaseDate': _class_text(item, 'release-date'),
        'link': item.a['href'],
    }

Doing movie number 0.....Doing movie number 1.....Doing movie number 2.....Doing movie number 3.....Doing movie number 4.....Doing movie number 5.....Doing movie number 6.....Doing movie number 7.....Doing movie number 8.....Doing movie number 9.....Doing movie number 10.....Doing movie number 11.....Doing movie number 12.....Doing movie number 13.....Doing movie number 14.....Doing movie number 15.....Doing movie number 16.....Doing movie number 17.....Doing movie number 18.....Doing movie number 19.....Doing movie number 20.....Doing movie number 21.....Doing movie number 22.....Doing movie number 23.....Doing movie number 24.....Doing movie number 25.....Doing movie number 26.....Doing movie number 27.....Doing movie number 28.....Doing movie number 29.....Doing movie number 30.....Doing movie number 31.....Doing movie number 32.....Doing movie number 33.....Doing movie number 34.....Doing movie number 35.....Doing movie number 36.....Doing movie

In [None]:
#Pretty-print the first scraped movie record, keys in alphabetical order
first_movie = d[0]
print(json.dumps(first_movie, indent=4, sort_keys=True))

{
    "actors": [
        "Sam Earle, Victoria Diamond"
    ],
    "link": "https://www.rottentomatoes.com/m/game_of_death_2020",
    "movieTitle": "Game of Death",
    "releaseDate": "Available Jul 14",
    "tMeterScore": "63%"
}


In [None]:
#Save as json file
# with open('data1.json', 'w') as f:
#     json.dump(d, f)

In [None]:
#View the scraped records as a dataframe: the dict keys become the row index
dfMovies = pd.DataFrame(d).transpose()
dfMovies.head(5)

Unnamed: 0,actors,link,movieTitle,releaseDate,tMeterScore
0,"[Sam Earle, Victoria Diamond]",https://www.rottentomatoes.com/m/game_of_death...,Game of Death,Available Jul 14,63%
1,"[Axelle Laffont, Virginie Ledoyen]",https://www.rottentomatoes.com/m/milf_2018,MILF,Available Jul 16,17%
2,"[Tom Hanks, Stephen Graham]",https://www.rottentomatoes.com/m/greyhound,Greyhound,Available Jul 10,79%
3,"[Keita Ninomiya, Satoshi Mizuno]",https://www.rottentomatoes.com/m/we_are_little...,We Are Little Zombies,Available Jul 10,93%
4,"[Jenny Slate, Alex Sharp]",https://www.rottentomatoes.com/m/the_sunlit_night,The Sunlit Night,Available Jul 17,34%


In [None]:
# Number of movies scraped from the listing page
len(dfMovies)

64

In [None]:
#Extract other details about movie including boxoffice, movie writers, producers, etc.

d = {}
for r, link in enumerate(dfMovies.link):
    start_time = time.time()

    # Timeout so that one stalled request cannot hang the whole run
    response = requests.get(link, timeout=30)
    # Explicit parser, the same one used for the listing page
    bsObj = BeautifulSoup(response.content, 'lxml')

    # Pair every meta label with its value by position on the page
    details = {}
    for value_tag, label_tag in zip(bsObj.find_all(attrs={'class': 'meta-value'}),
                                    bsObj.find_all(attrs={'class': 'meta-label'})):
        # Drop newlines/tabs, then collapse remaining whitespace runs to one space
        words = value_tag.text.strip().replace('\n','').replace('\t','').split()
        details[label_tag.text.replace(':','')] = ' '.join(words)

    # Store only this movie's fields: merging into a running dict would copy
    # the previous movie's values into any field this movie does not have
    d[r] = details
    end_time = time.time()

    print(f"Doing movie number {r}.....in {end_time-start_time}", end='\r')

    # Checkpoint after every movie so an interrupted run keeps its progress
    with open('data2.json', 'w') as f:
        json.dump(d, f)

Doing movie number 63.....in 0.76030325889587484

In [None]:
#Inspect the details scraped for the first movie
first_details = d[0]
print(first_details)

{'Rating': 'NR', 'Genre': 'Horror, Mystery & Suspense', 'Directed By': 'Sébastien Landry, Laurence Morais-Lagace', 'Written By': 'Edouard H. Bond, Philip Kalin-Hajdu, Laurence Morais-Lagace, Sébastien Landry', 'In Theaters': 'Jul 14, 2020 limited', 'On Disc/Streaming': 'Jul 14, 2020', 'Runtime': '73 minutes', 'Studio': 'Cleopatra Entertainment'}


In [None]:
# Build the details frame straight from the dict, the same way dfMovies is built.
# This avoids serialising to a JSON string only to parse it back with read_json.
dfGenres = pd.DataFrame(d).T
dfGenres.head()

Unnamed: 0,Directed By,Genre,In Theaters,On Disc/Streaming,Rating,Runtime,Studio,Written By
0,"Sébastien Landry, Laurence Morais-Lagace","Horror, Mystery & Suspense","Jul 14, 2020 limited","Jul 14, 2020",NR,73 minutes,Cleopatra Entertainment,"Edouard H. Bond, Philip Kalin-Hajdu, Laurence ..."
1,Axelle Laffont,"Art House & International, Comedy","Jul 14, 2020 limited","Jul 16, 2020",NR,101 minutes,Netflix,"Jerome L'Hotsky, Stéphane Kramer, Axelle Laffo..."
2,Aaron Schneider,"Action & Adventure, Drama","Jun 12, 2020 wide","Jul 10, 2020",PG-13 (for war-related action/violence and bri...,101 minutes,Apple TV+,Tom Hanks
3,Makoto Nagahisa,Drama,"Jul 10, 2020 limited","Jul 10, 2020",NR,120 minutes,Oscilloscope Laboratories,Makoto Nagahisa
4,David Wnendt,"Drama, Romance","Jul 10, 2020 limited","Jul 17, 2020",NR,106 minutes,Quiver Distribution,Rebecca Dinerstein


In [None]:
# Each `actors` cell holds a list of strings; turn it into one plain string.
# Joining the list is used instead of slicing its repr because the repr switches
# to double quotes when a name contains an apostrophe, which broke the old split.
# Cells that are not lists (e.g. when this cell is re-run) are left unchanged.
dfMovies["actors"] = dfMovies["actors"].apply(
    lambda names: ", ".join(names) if isinstance(names, list) else names
)

In [None]:
# First five rows, now with `actors` as plain strings
dfMovies.head(5)

Unnamed: 0,actors,link,movieTitle,releaseDate,tMeterScore
0,"Sam Earle, Victoria Diamond",https://www.rottentomatoes.com/m/game_of_death...,Game of Death,Available Jul 14,63%
1,"Axelle Laffont, Virginie Ledoyen",https://www.rottentomatoes.com/m/milf_2018,MILF,Available Jul 16,17%
2,"Tom Hanks, Stephen Graham",https://www.rottentomatoes.com/m/greyhound,Greyhound,Available Jul 10,79%
3,"Keita Ninomiya, Satoshi Mizuno",https://www.rottentomatoes.com/m/we_are_little...,We Are Little Zombies,Available Jul 10,93%
4,"Jenny Slate, Alex Sharp",https://www.rottentomatoes.com/m/the_sunlit_night,The Sunlit Night,Available Jul 17,34%


In [None]:
# Put the listing fields and the per-movie details side by side, aligned on the row index
frames = [dfMovies, dfGenres]
dff = pd.concat(frames, axis="columns")
dff.head(5)

Unnamed: 0,actors,link,movieTitle,releaseDate,tMeterScore,Directed By,Genre,In Theaters,On Disc/Streaming,Rating,Runtime,Studio,Written By
0,"Sam Earle, Victoria Diamond",https://www.rottentomatoes.com/m/game_of_death...,Game of Death,Available Jul 14,63%,"Sébastien Landry, Laurence Morais-Lagace","Horror, Mystery & Suspense","Jul 14, 2020 limited","Jul 14, 2020",NR,73 minutes,Cleopatra Entertainment,"Edouard H. Bond, Philip Kalin-Hajdu, Laurence ..."
1,"Axelle Laffont, Virginie Ledoyen",https://www.rottentomatoes.com/m/milf_2018,MILF,Available Jul 16,17%,Axelle Laffont,"Art House & International, Comedy","Jul 14, 2020 limited","Jul 16, 2020",NR,101 minutes,Netflix,"Jerome L'Hotsky, Stéphane Kramer, Axelle Laffo..."
2,"Tom Hanks, Stephen Graham",https://www.rottentomatoes.com/m/greyhound,Greyhound,Available Jul 10,79%,Aaron Schneider,"Action & Adventure, Drama","Jun 12, 2020 wide","Jul 10, 2020",PG-13 (for war-related action/violence and bri...,101 minutes,Apple TV+,Tom Hanks
3,"Keita Ninomiya, Satoshi Mizuno",https://www.rottentomatoes.com/m/we_are_little...,We Are Little Zombies,Available Jul 10,93%,Makoto Nagahisa,Drama,"Jul 10, 2020 limited","Jul 10, 2020",NR,120 minutes,Oscilloscope Laboratories,Makoto Nagahisa
4,"Jenny Slate, Alex Sharp",https://www.rottentomatoes.com/m/the_sunlit_night,The Sunlit Night,Available Jul 17,34%,David Wnendt,"Drama, Romance","Jul 10, 2020 limited","Jul 17, 2020",NR,106 minutes,Quiver Distribution,Rebecca Dinerstein


In [None]:
# Scraped page labels -> camelCase column names used in the final table
mapper = {'movieTitle': 'movieTitle',
         'tMeterScore': 'tMeterScore',
         'link': 'link',
         'actors': 'actors',
          'releaseDate': 'releaseDate',
         'Box Office': 'boxOffice',
         'Directed By': 'directedBy',
         'Genre': 'genre',
         'In Theaters': 'inTheaters',
         'On Disc/Streaming': 'onDiscStreaming',
         'Rating': 'rating',
         'Runtime': 'runtime',
         'Studio': 'studio',
         'Written By': 'writtenBy'}

# Column order for the final table
cols = ['movieTitle', 'tMeterScore', 'releaseDate', 'actors',
       'link', 'genre', 'directedBy', 'inTheaters', 'onDiscStreaming',
       'rating', 'runtime', 'studio', 'writtenBy']

# Rebind instead of renaming in place, so the cell builds its result from its input
dff = dff.rename(columns=mapper)

# `boxOffice` is mapped above but only exists when a scraped page had a
# "Box Office" entry, so it is selected only when present
if 'boxOffice' in dff.columns:
    cols.append('boxOffice')

In [None]:
# Preview the first two rows in the final column order
dff.loc[:, cols].head(2)

Unnamed: 0,movieTitle,tMeterScore,releaseDate,actors,link,genre,directedBy,inTheaters,onDiscStreaming,rating,runtime,studio,writtenBy
0,Game of Death,63%,Available Jul 14,"Sam Earle, Victoria Diamond",https://www.rottentomatoes.com/m/game_of_death...,"Horror, Mystery & Suspense","Sébastien Landry, Laurence Morais-Lagace","Jul 14, 2020 limited","Jul 14, 2020",NR,73 minutes,Cleopatra Entertainment,"Edouard H. Bond, Philip Kalin-Hajdu, Laurence ..."
1,MILF,17%,Available Jul 16,"Axelle Laffont, Virginie Ledoyen",https://www.rottentomatoes.com/m/milf_2018,"Art House & International, Comedy",Axelle Laffont,"Jul 14, 2020 limited","Jul 16, 2020",NR,101 minutes,Netflix,"Jerome L'Hotsky, Stéphane Kramer, Axelle Laffo..."


In [None]:
#Save in csv file
# dff.to_csv('data.csv')

Rotten Tomatoes maintains a tomatometer score based on the percentage of positive reviews for a movie or TV show. When at least 60% of reviews for a movie or show are positive, a red tomato is displayed to indicate its fresh status. Otherwise, a green splat is displayed to indicate its rotten status. When the title hasn't been released or there are not enough ratings to generate a score, no tomatometer score is available. In the generated dataframe the tomatometer score is given as `tMeterScore`. An average rating out of 10 is also available alongside `tMeterScore`. This can be obtained by running the function `getScores` for each link. It is expected that `tMeterScore` correlates with this average rating, so there may be no need to include the average rating in the dataframe.

In [None]:
#Get average tomato rating over 10
def getScores(link):
    """Return the average-rating text shown in a movie page's score details.

    Opens `link` in a new Chrome session, clicks the "See score details"
    button and reads the text of the first `js-tomatometer-score-info`
    element inside the first `score-details__stats` element.

    Parameters
    ----------
    link : str
        URL of the movie page.

    Returns
    -------
    str
        The rating text as displayed on the page, e.g. '6.14/10'.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        If the page cannot be loaded or the button cannot be found or clicked.
    IndexError, AttributeError
        If the expected score elements are not present in the page.
    """
    # create a new Chrome session
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(30)
        driver.get(link)

        details_button = driver.find_element(By.XPATH, "//button[contains(.,'See score details')]")
        details_button.click()

        # Take the HTML snapshot while the session is still open
        soup_level4 = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Quit the session even when a lookup fails, so no browser is left running
        driver.quit()

    stats = soup_level4.find_all(attrs={'class': 'score-details__stats'})[0]
    avgRating = stats.find(attrs={'class': 'js-tomatometer-score-info'}).text

    return avgRating

In [None]:
# Average rating for the first scraped movie
getScores(dfMovies.iloc[0].link)

'6.14/10'

Next, we collect the IMDb data set.

# IMDb data set
URL: https://www.imdb.com/search/title/?release_date=2006-01-01,2020-07-25&count=250



In [None]:
# If any of the import  is not working , please remove ## and run the pip install command 
##!pip install requests
##!pip install bs4


from requests import get
import pandas as pd
from bs4 import BeautifulSoup


In [None]:
#date range for release_date starts at 2006-01-01 and ends at 2020-07-25
url = 'https://www.imdb.com/search/title/?release_date=2006-01-01,2020-07-25&count=250'
# Timeout so that a stalled connection cannot hang the notebook
rankpage1 = get(url, timeout=30)
# Stop here with a clear HTTP error instead of failing later while parsing an error page
rankpage1.raise_for_status()

In [None]:
soup = BeautifulSoup(rankpage1.text, 'html.parser')

# Text of the span inside the first 'desc' div; the total is the token right after "of"
# (presumably something like "1-250 of 3,860,379 titles." — confirm against the live page)
desc_text = soup.find_all('div', class_='desc')[0].span.text
count_token = desc_text.split("of")[1].split()[0]
total_titles = int(count_token.replace(',', ''))
total_titles

3860379

In [None]:
# Collect every 'lister-item mode-advanced' div on the first results page
movie_containers = soup.find_all('div', class_='lister-item mode-advanced')
for summary in (type(movie_containers), len(movie_containers)):
    print(summary)


<class 'bs4.element.ResultSet'>
250


In [None]:
# This cell takes a long time to run (about 7 minutes in the author's runs).
# It walks the remaining result pages, 250 titles per page, and appends their containers.
PAGE_SIZE = 250

# Plain list that is extended in place; re-concatenating the whole list on
# every page copies it each time
movie_containers = list(movie_containers)

# The upper bound is the total parsed from the first page, not a hard-coded number
for start in range(PAGE_SIZE + 1, total_titles + 1, PAGE_SIZE):

    next_page_url='https://www.imdb.com/search/title/?release_date=2006-01-01,2020-07-25&count=250&start='+str(start)+'&ref_=adv_nxt'
    rankpage2 = get(next_page_url, timeout=30)
    soup1 = BeautifulSoup(rankpage2.text, 'html.parser')
    page_containers = soup1.find_all('div', class_ = 'lister-item mode-advanced')
    if not page_containers:
        # No titles on this page: results have run out, so stop requesting more
        break
    movie_containers.extend(page_containers)
print(type(movie_containers))
print(len(movie_containers))

Possible solutions if this cell needs to run faster:

> 1) Reduce the time frame (currently 01-Jan-2006 to 25-Jul-2020). This is the more feasible option, given that both the movie containers and the final dataframe have to be kept in memory.

> 2) Break the loop into multiple smaller loops and free memory after each one.


In [None]:
# Show the rating text of the eighth container, or "Hello" when it has no <strong> tag
rating_tag = movie_containers[7].find('strong')
print("Hello" if rating_tag is None else rating_tag.text)

3.3


In [None]:
# Lists to store the scraped data
names, years, imdb_ratings = [], [], []

# Extract name, year label and rating from each movie container
for container in movie_containers:
    header = container.h3

    # The name is the text of the link inside the <h3> header
    names.append(header.a.text)

    # The year label, exactly as displayed in the header
    years.append(header.find('span', class_='lister-item-year').text)

    # The IMDB rating: text of the first <strong> tag, or 0 when there is none
    rating_tag = container.find('strong')
    imdb_ratings.append(0 if rating_tag is None else rating_tag.text)

In [None]:
import pandas as pd

# One row per scraped container, columns in the order movie, year, imdb_ratings
movie_imdb_df = pd.DataFrame(
    dict(movie=names, year=years, imdb_ratings=imdb_ratings)
)

In [None]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes
movie_imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33500 entries, 0 to 33499
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie         33500 non-null  object
 1   year          33500 non-null  object
 2   imdb_ratings  33500 non-null  object
dtypes: object(3)
memory usage: 785.3+ KB


In [None]:
# First five scraped rows
movie_imdb_df.head(5)

Unnamed: 0,movie,year,imdb_ratings
0,Cursed,(2020– ),5.7
1,The Old Guard,(2020),6.7
2,Greyhound,(2020),7.1
3,Palm Springs,(2020),7.5
4,Hamilton,(2020),8.8


In [None]:
# Last five scraped rows
movie_imdb_df.tail(5)

Unnamed: 0,movie,year,imdb_ratings
33495,Devil's Gate,(2017),5.1
33496,Death in Paradise,(2011– ),7.8
33497,The Americans,(2013–2018),8.4
33498,Bombshell,(2019),6.8
33499,Marcella,(2016– ),7.4


In [None]:
# Summary statistics for the columns of the scraped IMDb frame
movie_imdb_df.describe()

Unnamed: 0,movie,year,imdb_ratings
count,33500,33500,33500.0
unique,9707,273,89.0
top,Snowpiercer,(2019),8.4
freq,190,3783,1814.0


In [None]:
 # Order rows by the first 4-digit year found in the `year` label. The labels
 # carry extra text such as "(2006 TV Movie)", "(2013–2018)" or "(VIII) (2018)",
 # so sorting the raw strings orders them alphabetically rather than by year.
 # Labels without a 4-digit year become NaN and are placed last.
 release_year = pd.to_numeric(
     movie_imdb_df['year'].str.extract(r'(\d{4})', expand=False),
     errors='coerce',
 )
 # mergesort is stable, so rows from the same year keep their scraped order
 movie_imdb_df.loc[release_year.sort_values(kind='mergesort').index]

Unnamed: 0,movie,year,imdb_ratings
6659,The IT Crowd,(2006 TV Movie),6.4
8557,Read It and Weep,(2006 TV Movie),5.5
8239,The Librarian: Return to King Solomon's Mines,(2006 TV Movie),6.3
8999,The Cheetah Girls 2,(2006 TV Movie),5.0
1181,High School Musical,(2006 TV Movie),5.4
...,...,...,...
1187,The Gift,(VI) (2015),7.0
7038,Anna,(VII) (2013),6.5
6508,Sacrifice,(VII) (2019),4.3
7528,Still,(VIII) (2018),5.1
