/
webscraper.py
41 lines (30 loc) · 1.16 KB
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup
import time
from requests import get
import csv
import pandas as pd
class NasaFunction:
    """Scraper for the NASA site-search results pages (nasasearch.nasa.gov)."""

    def NasaScraper(self, q, size, pages=1):
        """Return up to ``size`` result links for the search query ``q``.

        Each results page is fetched, every
        ``<div class="content-block-item result">`` is inspected, and the
        ``href`` of the anchor inside its ``<h4>`` heading is collected.
        Links containing the substring ``'images'`` are dropped before the
        list is truncated to ``size`` entries.

        Args:
            q: Search query text. It is URL-encoded here, so pass it raw.
            size: Maximum number of links to return. It is used as a slice
                bound, so ``None`` returns every link found.
            pages: Number of result pages to fetch, starting at page 1.

        Returns:
            A list of link strings in the order they appear in the results.

        Raises:
            Whatever ``requests.get`` raises on connection failure or when
            the request exceeds the timeout.
        """
        # Nothing can be returned, so skip the network round-trip entirely.
        if size == 0:
            return []

        # Local import keeps the block self-contained without touching the
        # file-level import list.
        from urllib.parse import quote_plus

        # quote_plus turns spaces into '+' (as the previous manual replace
        # did) and also escapes characters such as '&' or '#' that would
        # otherwise corrupt the query string.
        query = quote_plus(str(q))

        links = []
        for page in range(1, pages + 1):
            print("Page", page)
            response = get(
                'https://nasasearch.nasa.gov/search?affiliate=nasa&page='
                + str(page) + '&query=' + query + '&utf8=%E2%9C%93',
                timeout=30,  # never hang forever on an unresponsive server
            )
            # Pause after every request; presumably a politeness delay
            # towards the search server -- kept so repeated calls stay
            # rate limited.
            time.sleep(2)
            html_soup = BeautifulSoup(response.text, 'html.parser')
            result_blocks = html_soup.find_all(
                'div', class_='content-block-item result')
            for block in result_blocks:
                # A result block without an <h4> heading yields None here;
                # guard so one malformed block does not abort the scrape.
                heading = block.h4
                anchor = heading.a if heading is not None else None
                if anchor is not None and 'href' in anchor.attrs:
                    links.append(anchor.get('href'))

        # Drop image links first, then cap the result count.
        return [link for link in links if 'images' not in link][:size]
# Example usage (performs a live request against nasasearch.nasa.gov):
#   test = NasaFunction().NasaScraper('acid', 5)
#   print(test)