<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Loading-Libraries" data-toc-modified-id="Loading-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Loading Libraries</a></span></li><li><span><a href="#Reading-URL" data-toc-modified-id="Reading-URL-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Reading URL</a></span></li><li><span><a href="#Extracting-useful-information" data-toc-modified-id="Extracting-useful-information-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Extracting useful information</a></span></li></ul></div>

## Loading Libraries

In [11]:
library("rvest")
library("XML")
library("xml2")

## Reading URL

In [2]:
# IMDb Top 250 chart: fetch the page, then select every <a> element that
# sits inside an element with class "titleColumn".
url <- "http://www.imdb.com/chart/top"
page <- read_html(url)
movie.nodes <- page %>% html_nodes(".titleColumn a")
movie.nodes

{xml_nodeset (250)}
 [1] <a href="/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [2] <a href="/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [3] <a href="/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [4] <a href="/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [5] <a href="/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [6] <a href="/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [7] <a href="/title/tt0167260/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [8] <a href="/title/tt0110912/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
 [9] <a href="/title/tt0060196/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
[10] <a href="/title/tt0137523/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
[11] <a href="/title/tt0120737/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 ...
[12] <a href="/title/tt0109830/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-3 

## Extracting useful information

In [3]:
# Generating each movie link
# html_attr() returns one string per node (NA when the attribute is absent),
# so the result is always a character vector; the previous
# sapply(html_attrs(...), `[[`, "href") errored on a missing attribute and
# could change its return type depending on the input.
movie.link <- html_attr(movie.nodes, "href")
# The hrefs are site-relative ("/title/..."), so prepend the host. Only the
# non-missing entries are rewritten, so a missing href stays NA instead of
# becoming the bogus URL "http://www.imdb.comNA".
movie.link[!is.na(movie.link)] <- paste0(
  "http://www.imdb.com",
  movie.link[!is.na(movie.link)]
)

In [4]:
# Extracting movie cast information
# The text is read from each title anchor's `title` attribute. html_attr()
# always yields a character vector and gives NA for an anchor without the
# attribute, whereas sapply(html_attrs(...), `[[`, "title") stopped with a
# "subscript out of bounds" error in that case.
movie.cast <- html_attr(movie.nodes, "title")

In [5]:
# Movie names are the text content of each title anchor
movie.name <- movie.nodes %>% html_text()

In [6]:
# Extracting movie year
secInfo <- html_nodes(page, ".secondaryInfo")

# The node text wraps the year in parentheses, e.g. "(1994)". Strip both
# parentheses in a single pass with a bracket expression, rather than nesting
# two gsub() calls and relying on a bare, unmatched ")" being accepted as a
# regular expression. Anything that is not a number after stripping becomes NA
# (with a warning) via as.numeric().
year <- as.numeric(gsub("[()]", "", html_text(secInfo)))

In [7]:
# Movie rating: text of the <strong> element inside each ".imdbRating"
# element, converted to a number.
rating.nodes <- page %>% html_nodes(".imdbRating strong")
rating <- rating.nodes %>%
  html_text() %>%
  as.numeric()

In [8]:
# Extracting movie votes
# The vote count is embedded in each rating node's `title` attribute,
# presumably of the form "9.2 based on 2,008,855 user ratings" (format
# inferred from the patterns below -- confirm against the live page).
# html_attr() gives NA for a node without the attribute instead of erroring
# the way sapply(html_attrs(...), `[[`, "title") did.
votes <- html_attr(rating.nodes, "title")
# Drop everything up to and including "based on ".
votes <- gsub(".*?based on ", "", votes)
# Drop the trailing label; fixed = TRUE because this is a literal string.
votes <- gsub(" user ratings", "", votes, fixed = TRUE)
# Remove thousands separators, then convert to a number.
votes <- as.numeric(gsub(",", "", votes, fixed = TRUE))

In [9]:
# Tabulating collected data
# One row per movie; column names are taken from the variable names.
# stringsAsFactors = FALSE keeps the name, cast and link columns as character
# on every R version (before R 4.0 the default converted them to factors).
top250 <- data.frame(
  movie.name, movie.cast, movie.link, year, votes, rating,
  stringsAsFactors = FALSE
)
top250

movie.name,movie.cast,movie.link,year,votes,rating
The Shawshank Redemption,"Frank Darabont (dir.), Tim Robbins, Morgan Freeman",http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1,1994,2008855,9.2
The Godfather,"Francis Ford Coppola (dir.), Marlon Brando, Al Pacino",http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2,1972,1376327,9.2
The Godfather: Part II,"Francis Ford Coppola (dir.), Al Pacino, Robert De Niro",http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3,1974,952901,9.0
The Dark Knight,"Christopher Nolan (dir.), Christian Bale, Heath Ledger",http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4,2008,1977848,9.0
12 Angry Men,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5,1957,562897,8.9
Schindler's List,"Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes",http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6,1993,1036777,8.9
The Lord of the Rings: The Return of the King,"Peter Jackson (dir.), Elijah Wood, Viggo Mortensen",http://www.imdb.com/title/tt0167260/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_7,2003,1430037,8.9
Pulp Fiction,"Quentin Tarantino (dir.), John Travolta, Uma Thurman",http://www.imdb.com/title/tt0110912/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_8,1994,1568080,8.9
"Il buono, il brutto, il cattivo","Sergio Leone (dir.), Clint Eastwood, Eli Wallach",http://www.imdb.com/title/tt0060196/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_9,1966,595362,8.8
Fight Club,"David Fincher (dir.), Brad Pitt, Edward Norton",http://www.imdb.com/title/tt0137523/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=6KVACQXHD7YMV9ZXKWSF&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_10,1999,1607341,8.8


In [10]:
# Finding movies from a particular period: 1996 to 1998, both ends included.
# which() drops rows whose year is NA, matching what subset() does.
ConditionalIMDB <- top250[
  with(top250, which(year >= 1996 & year <= 1998)),
]
# Renumber the rows from 1 so the original chart positions are not shown.
rownames(ConditionalIMDB) <- NULL
ConditionalIMDB[c("movie.name", "year")]

movie.name,year
La vita è bella,1997
Saving Private Ryan,1998
American History X,1998
Mononoke-hime,1997
Good Will Hunting,1997
L.A. Confidential,1997
Bacheha-Ye aseman,1997
"Lock, Stock and Two Smoking Barrels",1998
Trainspotting,1996
Fargo,1996
