nw_en_scrape.py — 105 lines (85 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import nltk
nltk.download('punkt')
import newspaper
import pandas as pd
import csv
from newspaper import Article
from newspaper import fulltext
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import csv
from random import randint
import os
import time
from time import sleep
# Load the master sheet and pull the list of article URLs to scrape.
# NOTE(review): the path is a placeholder — point it at the real Covid_Master.csv.
# ISO-8859-1 is used because the sheet is not UTF-8 clean; the python engine
# tolerates ragged rows better than the default C engine.
dfmain = pd.read_csv(r'/XXXXXXX/Covid_Master.csv', encoding="ISO-8859-1", engine='python')
# Fix: the original built this list twice on consecutive identical lines.
list_of_urls = dfmain['link'].tolist()
rows = []  # one dict per URL, accumulated for the results DataFrame
# Download and parse every article with newspaper3k.  Best-effort: a failed
# URL gets an N/A placeholder row so the row count always matches the URL
# list and the later merge on 'url' stays aligned with the master sheet.
for link in list_of_urls:
    try:
        # Fix: the original wrapped the link in a pointless '%s' % (link)
        # format — the URL can be passed directly.
        article = Article(url=link, language='en')
        article.download()
        article.parse()
        rows.append({'url': link,
                     'author': article.authors,   # list of author names
                     'text': article.text,        # extracted article body
                     'title': article.title})
    except Exception as e:
        # Deliberate broad catch: any download/parse failure is logged and
        # recorded as N/A rather than aborting the whole batch.
        print(e)
        rows.append({'url': link,
                     'author': 'N/A',
                     'text': 'N/A',
                     'title': 'N/A'})
# Collect the per-URL scrape results into a frame and persist a raw backup,
# then join the scraped fields back onto the master sheet on the URL column.
# NOTE(review): the 'russian' filename looks like a copy-paste leftover
# (language is 'en') — kept as-is to avoid breaking downstream readers.
df_v1 = pd.DataFrame(rows)
df_v1.to_csv('my_scraped_articles_raw_russian.csv')
dfmaster = pd.merge(dfmain, df_v1, left_on='link', right_on='url')
dfmaster.to_csv('my_scraped_articles_master_v1.csv')
### Re-run the rows newspaper3k failed on with requests + Beautiful Soup.
df_na = df_v1.loc[df_v1['text'] == 'N/A']
list_of_urls = df_na['url'].tolist()
rows = []
# Tags whose text nodes are page chrome/markup rather than article content.
# Fix: hoisted out of the loop (the original rebuilt this list every
# iteration) and made a set for O(1) membership tests.
BLACKLIST = {
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
}
for link in list_of_urls:
    try:
        sleep(randint(3, 7))  # polite random delay between requests
        # Fix: timeout so one unresponsive host cannot hang the whole run.
        r = requests.get(link, timeout=30)
        # Fix: an HTTP error page (404/500) is not article text — raise so
        # the row is recorded as N/A instead of scraping the error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html.parser")
        text = soup.find_all(text=True)  # every text node in the document
        # Concatenate only text whose parent tag is real content; the
        # trailing space per node matches the original '{} '.format(t) join.
        output = ''.join('{} '.format(t) for t in text
                         if t.parent.name not in BLACKLIST)
        rows.append({'link': link,
                     'soup': soup,
                     'text': text,
                     'output': output})
    except Exception as e:
        # Fix: the original captured e but never reported it; log the
        # failure (consistent with the first scrape loop) before the
        # N/A placeholder row.
        print(e)
        rows.append({'link': link,
                     'soup': 'N/A',
                     'text': 'N/A',
                     'output': 'N/A'})
# Persist the fallback-scrape results as a backup, then fold them into the
# master frame.  A left join keeps every master row even when no fallback
# row exists for its link.
df_na_scraped = pd.DataFrame(rows)
##save as backup
df_na_scraped.to_csv('my_scraped_articles_v2.csv')
dfmaster2 = pd.merge(dfmaster, df_na_scraped, how='left', on='link')
dfmaster2.to_csv('my_scraped_articles_master_FV.csv')