import requests
import csv
import time
from bs4 import BeautifulSoup


class HaikuScraper:
    """Scrape haikus (short Japanese poems) from the r/haiku subreddit on old Reddit."""

    def __init__(self, url: str, headers: dict):
        self.url = url
        self.headers = headers

    def make_request(self):
        # Pause before each request so we don't hammer Reddit's servers.
        time.sleep(3)
        page = requests.get(self.url, headers=self.headers)
        soup = BeautifulSoup(page.text, 'html.parser')
        return soup

    def get_next_page(self, soup: BeautifulSoup):
        # The "next" button on old.reddit.com links to the following page of posts.
        time.sleep(3)
        next_button = soup.find('span', class_='next-button')
        if next_button is None:
            # No next button means we're on the last page.
            return None
        return next_button.find("a").attrs['href']

    def get_haikus(self, soup: BeautifulSoup):
        # Each post title on old Reddit is a link carrying the "title may-blank" classes.
        haikus = [title.text for title in soup.find_all("a", class_="title may-blank ")]
        return haikus

    def write_haikus_to_csv(self, haikus: list):
        # Append each haiku as its own row; the `with` block closes the file for us.
        with open('scraped_haikus_v2.txt', 'a', newline='') as f:
            writer = csv.writer(f)
            for haiku in haikus:
                writer.writerow([haiku])


url = "https://old.reddit.com/r/haiku/"
# Headers to mimic a browser visit
headers = {'User-Agent': 'Mozilla/5.0'}

scraper = HaikuScraper(url, headers)
soup = scraper.make_request()

# Scrape the first page, then follow "next" links in the loop below.
haikus = scraper.get_haikus(soup)
scraper.write_haikus_to_csv(haikus)

counter = 1

while counter <= 2500:
    time.sleep(2)
    link = scraper.get_next_page(soup)
    if link is None:
        # Stop when there is no next page to follow.
        break
    print(f"Page {counter + 1}. Link {link}.")
    scraper = HaikuScraper(link, headers)
    soup = scraper.make_request()
    haikus = scraper.get_haikus(soup)
    scraper.write_haikus_to_csv(haikus)
    counter += 1
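
# Rough pacing estimate (an approximation, not from the original script): each
# iteration sleeps 2 s in the loop, 3 s in get_next_page and 3 s in make_request,
# so a full run of 2500 pages spends at least 2500 * 8 s (about 5.5 hours)
# sleeping, plus network time.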