-
Notifications
You must be signed in to change notification settings - Fork 3
/
Scrape Tutorial.R
104 lines (86 loc) · 2.99 KB
/
Scrape Tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Point R at the tutorial folder so that relative output files written later
# in this script (e.g. "Dates.rds", "Author.rds") land there. The absolute
# path is specific to one machine and has to be edited to run elsewhere.
setwd("D:/Dropbox/Public/Sean's Stuff/Grad School/Research/Scrape Tutorial") #set WD
# rvest supplies the HTML helpers used below: read_html(), html_nodes(),
# html_text() and html_attr().
library("rvest")
library(purrr)
# plyr is attached before dplyr, so dplyr's functions mask plyr's where
# the two packages share a name.
library(plyr)
library(dplyr) #loading packages
##### Create Page Holder
# Build the URLs of the paginated index pages: <base>/1, <base>/2, ...,
# <base>/n_pages. paste0() puts the "/" in place directly, so no regex
# clean-up of a space separator is needed afterwards.
n_pages <- 10
page_holder <- paste0(
  "https://www.politico.com/news/2020-elections/",
  seq_len(n_pages)
)
####### Create Functions ###
# Download the page at `url` and return the `href` attribute of every node
# matching "#main h1 a" (the same selector HeadlineFunction reads its text
# from), as a character vector with one element per matched node.
# Waits one second before the request -- presumably to space out requests.
URLFunction <- function(url) {
  Sys.sleep(1)
  page <- read_html(url)
  headline_links <- html_nodes(page, "#main h1 a")
  html_attr(headline_links, 'href')
}
# Download the page at `url` and return the text of every node matching
# '#main h1 a' as a character vector, one element per matched node.
# Waits one second before the request -- presumably to space out requests.
HeadlineFunction <- function(url) {
  Sys.sleep(1)
  read_html(url) %>%
    html_nodes('#main h1 a') %>%
    html_text() %>%
    as.character()
}
# Download the article at `url` and return the text of every node matching
# '.story-meta__authors .vcard' as a character vector. The result has one
# element per matched node, so it may be empty or longer than one.
# Waits one second before the request -- presumably to space out requests.
AuthorFunction <- function(url) {
  Sys.sleep(1)
  article <- read_html(url)
  author_nodes <- html_nodes(article, '.story-meta__authors .vcard')
  as.character(html_text(author_nodes))
}
# Download the article at `url` and return the text of every <time> node
# as a character vector (one element per node found on the page).
# Waits one second before the request -- presumably to space out requests.
DateFunction <- function(url) {
  Sys.sleep(1)
  read_html(url) %>%
    html_nodes('time') %>%
    html_text() %>%
    as.character()
}
# Download the article at `url` and return the text of every node matching
# '.story-text__paragraph', one vector element per matched node.
# Waits one second before the request -- presumably to space out requests.
TextFunction <- function(url) {
  Sys.sleep(1)
  article <- read_html(url)
  paragraph_nodes <- html_nodes(article, '.story-text__paragraph')
  html_text(paragraph_nodes)
}
##### Scraping #####
# Collect article URLs and headlines from the first index page only. The
# all-pages variants are kept commented out: the URL one used to run and was
# then immediately overwritten, costing one request per index page for a
# result that was never used.
# PoliticoURL <- sapply(page_holder, URLFunction) #Apply URL Function, will take some time
PoliticoURL <- URLFunction(page_holder[1])
# PoliticoHeadline <- sapply(page_holder, HeadlineFunction) #Apply Headline function
# PoliticoHeadline <- as.character(PoliticoHeadline) #put into one column
PoliticoHeadline <- HeadlineFunction(page_holder[1])

# Both vectors come from the same selector on the same page and must pair up
# one-to-one; stop here rather than let a mismatch be silently recycled.
stopifnot(length(PoliticoHeadline) == length(PoliticoURL))

# stringsAsFactors = FALSE keeps both columns as character on every R
# version, so the URLs can be handed straight back to the scraping functions.
Database <- data.frame(
  PoliticoHeadline = PoliticoHeadline,
  PoliticoURL = PoliticoURL,
  stringsAsFactors = FALSE
)

# From here on the individual articles are fetched one at a time.
# seq_along() makes the loops a no-op when no articles were found.
Dates <- rep(NA_character_, nrow(Database))
for (i in seq_along(Dates)) {
  scraped_dates <- DateFunction(PoliticoURL[i])
  # A page may yield zero or several <time> nodes: keep the first one and
  # leave NA when there is none, instead of failing on the assignment.
  if (length(scraped_dates) > 0) {
    Dates[i] <- scraped_dates[1]
  }
  print(i)
}
saveRDS(Dates, "Dates.rds")

Author <- rep(NA_character_, nrow(Database))
for (i in seq_along(Author)) {
  scraped_authors <- AuthorFunction(PoliticoURL[i])
  # Articles with several bylines are stored as one "; "-separated string so
  # co-authors are not dropped; articles with no byline stay NA.
  if (length(scraped_authors) > 0) {
    Author[i] <- paste(scraped_authors, collapse = "; ")
  }
  print(i)
}
saveRDS(Author, "Author.rds")

Database$Date <- Dates
Database$Author <- Author
###### Scraping the Text ###
# Download the paragraphs of every article in Database and save them as
# "<row number>.txt" inside the Text folder.
setwd("D:/Dropbox/Public/Sean's Stuff/Grad School/Research/Scrape Tutorial/Text")
# seq_along() skips the loop entirely when Database has no rows.
for (i in seq_along(Database$PoliticoURL)) {
  # as.character() is a no-op on a character column and protects against the
  # URL column having been stored as a factor.
  article_text <- TextFunction(as.character(Database$PoliticoURL[i]))
  output <- paste0(i, ".txt") #create a name for each
  # NOTE(review): write.csv() stores the paragraphs in CSV layout (header,
  # row numbers, quoted strings) despite the .txt extension. Left unchanged
  # so newly written files match any that were exported earlier.
  write.csv(article_text, output) #saves file
  print(i)
}
#### Bringing Text Back In ###
library(readr)
setwd("D:/Dropbox/Public/Sean's Stuff/Grad School/Research/Scrape Tutorial/Text")

# One exported file per row of Database, named "1.txt", "2.txt", ... by the
# export loop. Deriving the count from Database (rather than a fixed 20)
# keeps the final column assignment valid for any number of articles.
file_list <- paste0(seq_len(nrow(Database)), ".txt")

# Read every file into a single string. Building the vector in one call
# (instead of appending to a global that may survive from an earlier run)
# makes the result the same on every run and gives a plain character vector.
dataset <- vapply(
  file_list,
  function(path) {
    print(path) # progress indicator
    read_file(path)
  },
  character(1),
  USE.NAMES = FALSE
)

Database$Text <- dataset