In [82]:
#!pip install --user --upgrade requests beautifulsoup4 lxml html5lib==1.0b8

import os
import json
import requests
from uuid import uuid4
from bs4 import BeautifulSoup


class PttCrawler:
    """Crawl post links and post pages from a PTT web board.

    The crawler starts at the newest index page of ``board`` and walks
    backwards through up to ``page`` older index pages, collecting every
    link found in the listing container, then downloads each linked post.

    Attributes:
        ptt_url: Base URL prepended to every relative link.
        board: Board name used to build the index URL (``/bbs/<board>``).
        page: Number of *additional* older index pages to visit after the
            first one. This value is only read, never modified.
        session: ``requests.Session`` carrying the ``over18=1`` cookie.
        write: Stored as given; not used by any method in this class.
    """

    def __init__(self, board, page, write=False):
        """Store the crawl parameters and prepare the HTTP session."""
        self.ptt_url = 'https://www.ptt.cc'
        self.board = board
        self.page = page

        self.session = requests.Session()
        self.session.cookies.update({
            'over18': '1'
        })

        self.write = write

    def run(self):
        """Fetch every post reachable from the board's index pages.

        Returns:
            A list of dicts as produced by :meth:`fetchPost`, in the order
            the links were collected (newest index page first).
        """
        url = self.ptt_url + '/bbs/' + self.board
        post_link_list = self.fetchPostLinkList(url)
        return [self.fetchPost(post_link) for post_link in post_link_list]

    def _fetchSoup(self, url):
        """GET ``url`` through the session and parse the body with lxml."""
        resp = self.session.get(url)
        # ``resp.text`` is already decoded; hand it to BeautifulSoup as-is
        # instead of re-encoding it to bytes only to have it decoded again.
        return BeautifulSoup(resp.text, "lxml")

    @staticmethod
    def _metaValue(metadata, index):
        """Return the text of the ``index``-th meta line, or ``None``.

        ``IndexError`` covers a missing meta line and ``AttributeError``
        covers a meta line without an ``article-meta-value`` span
        (``find`` returned ``None``).
        """
        try:
            return metadata[index].find(
                'span', {'class': 'article-meta-value'}
            ).text
        except (IndexError, AttributeError):
            return None

    def fetchPostLinkList(self, url):
        """Collect post links from ``url`` and from older index pages.

        Starting at ``url``, the listing container of each index page is
        scanned for anchors; then the second anchor of the paging button
        group is followed to the previous page. At most ``self.page``
        previous pages are followed.

        The walk stops early (returning what was collected so far) when a
        page has no listing container, no paging group, or no usable
        "previous" link, instead of raising on the missing element.

        Args:
            url: Absolute URL of the first index page to read.

        Returns:
            A list of relative ``href`` strings, first page first.
        """
        link_list = []
        # Local countdown: ``self.page`` must stay intact so that calling
        # ``run()`` again on the same instance crawls the same depth.
        remaining = self.page

        while True:
            soup = self._fetchSoup(url)

            post_list = soup.find(
                'div',
                {'class': 'r-list-container action-bar-margin bbs-screen'},
            )
            if post_list is None:
                break

            # Skip anchors without an href: ``fetchPost`` concatenates the
            # link onto the base URL and would fail on ``None``.
            hrefs = (tag.get('href') for tag in post_list.find_all('a'))
            link_list.extend(href for href in hrefs if href)

            if remaining <= 0:
                break

            paging_group = soup.find(
                'div', {'class': 'btn-group btn-group-paging'}
            )
            if paging_group is None:
                break

            paging_links = paging_group.find_all('a')
            if len(paging_links) < 2:
                break

            # The second button in the paging group is the link to the
            # previous (older) index page; it may carry no href at all.
            previous_href = paging_links[1].get('href')
            if not previous_href:
                break

            url = self.ptt_url + previous_href
            remaining -= 1

        return link_list

    def fetchPost(self, url):
        """Download one post and extract its metadata and leading content.

        Args:
            url: Link relative to ``self.ptt_url`` (as returned by
                :meth:`fetchPostLinkList`).

        Returns:
            A dict with keys ``'title'``, ``'author'``, ``'date'`` and
            ``'content'``. Any field that cannot be located is ``None``.
        """
        soup = self._fetchSoup(self.ptt_url + url)

        # Meta lines are read positionally: 0 = author, 1 = title, 2 = date.
        metadata = soup.find_all('div', {'class': 'article-metaline'})

        try:
            # NOTE(review): ``next_sibling`` yields only the single node that
            # directly follows the last meta line, so the content ends at
            # the first nested tag -- confirm whether the full body is wanted.
            content = metadata[-1].next_sibling
        except IndexError:
            content = None

        return {
            'title': self._metaValue(metadata, 1),
            'author': self._metaValue(metadata, 0),
            'date': self._metaValue(metadata, 2),
            'content': content,
        }



In [86]:
# Crawl the 'Gossiping' board: the newest index page plus up to 3 older
# index pages (the second argument is the number of previous pages to follow).
instance = PttCrawler('Gossiping', 3)
# run() performs live HTTP requests: one per index page and one per post link.
post_list = instance.run()
# Each entry is a dict with 'title', 'author', 'date' and 'content' keys.
for post in post_list:
    print(post)

{'date': 'Thu Dec 21 15:05:27 2017', 'title': '[ＦＢ] 柯文哲 一起來玩「奔跑吧！台北」', 'author': 'Philethan (Ethan)', 'content': '\nＦＢ卦點說明：（'}
{'date': 'Thu Dec 21 15:05:42 2017', 'title': '[新聞] 搶救空污 政院宣布2030市區公車全面電動化', 'author': 'paetix (少冰微糖)', 'content': '\n\n1.媒體來源:\n'}
{'date': 'Thu Dec 21 15:06:03 2017', 'title': '[問卦] 女森是不是無法下半身思考所以輸男森', 'author': 'laser789 (南崁彭于晏)', 'content': '\n\n很多人都說男森下半身思考\n\n這應該是優點吧\n\n代表除了大腦還多一個下體來思考\n\n雙核心處理器的概念\n\n所以女森被罵沒大腦 沒邏輯是不是這個原因\n\n有沒有八卦？\n\n--\n'}
{'date': 'Thu Dec 21 15:06:15 2017', 'title': 'Re: [問卦] 有沒有下床需要30分鐘的八卦', 'author': 'blackzero1 (瘦宅)', 'content': '\n'}
{'date': 'Thu Dec 21 15:08:21 2017', 'title': '[問卦] 有沒有邏輯很好數學不好的', 'author': 'david190 (david)', 'content': '\n很多法官 邏輯很好\n\n大多數 不是台大畢業的\n\n一些甚至是同等學力考上的\n\n這些人邏輯超強\n\n\n數學不太好\n\n\n所以說 邏輯好 不等於數學好\n\n數學好 也不等於 邏輯好\n\n因為很多數學好的科學家 都很跳 跳躍式的講話思維 直觀超強 不管過程的也有\n\n所以說 邏輯 不等於 數學 反之亦然\n\n\n以後大家別用 數學好 等於 邏輯好 在那邊文字獄文科們了好嗎? 懂~就叫一聲聽聽~\n\n--\n'}
{'date': 'Thu Dec 21 15:09:28 2017', 'title': '[問卦] 有人還記得Kony 2012嗎', 'aut