-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-movie-url.R
43 lines (30 loc) · 1.39 KB
/
get-movie-url.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
##
# The script takes a list of movie names, does a google search and retrieves the URLs
# which contain 'imdb' in it. The goal is to eventually get the imdb IDs of the movies
# Some of the code is taken from this stackoverflow answerhttps://stackoverflow.com/questions/32889136/how-to-get-google-search-results/40181564
##
# Code for getting the IMDb Url for the movies
library(urltools)
library(rvest)
library(curl)
#load data
df<-read.csv("nominated_movies.csv", header = TRUE, sep = ";", stringsAsFactors = FALSE)
movie_list<-as.character(paste0(df$Film, "(", df$Year, ")"))
# Function for getting website.
getWebsite <- function(name){
url = URLencode(paste0("https://www.google.com/search?q=",name))
#page <- read_html(url)
page<-read_html(curl(url, handle = curl::new_handle("useragent"="Chrome")))
results <- page %>%
html_nodes("cite") %>% # Get all notes of type cite. You can change this to grab other node types.
html_text()
result<-results[grep("^....imdb.*", results)]
# Return results if you want to see them all.
return(result)
}
URL<-sapply(movie_list, getWebsite) # depending on the network you are on, you will most likely face a HTTP error 503
# add to dataframe
df$URL<-URL
# Write to CSV
df<-apply(df, 2, as.character)
write.csv(df, file = "MovieSite.csv", sep = ',', row.names = FALSE, append = T) # The resulting data frame will need further preprocessing