In [1]:
# Host name used as the default connection target in parsePage and as the
# prefix that scrapeUrls puts in front of site-relative links.
startPage = "twitter.com"
import http.client as hc
import re
from bs4 import BeautifulSoup
from collections import deque
import json
import time
import csv

# Shared JSON decoder; scrapeTweet uses it to decode the
# "data-reply-to-users-json" attribute of each tweet container.
jsonDecoder = json.JSONDecoder()

# Substrings/patterns that disqualify a link from being crawled.
# NOTE(review): these look like the Disallow rules from the site's robots.txt
# (plus a language filter) -- confirm against the current robots.txt.
# Compiled once at import time instead of on every call.
_DISALLOWED_HREF_PATTERNS = tuple(re.compile(pattern) for pattern in (
    "/search/realtime",
    "/search/users",
    r"/search/.+/grid",

    r"/.+/followers",
    r"/.+/following",

    "/oauth",
    "/1/oauth",

    "/i/streams",
    "/i/hello",

    "/account/deactivated",
    "/settings/deactivated",
    # Blocking out non-english
    "lang=",
))

# The dot is escaped so that only a literal "twitter.com" matches; the
# unescaped form also accepted strings such as "twitterxcom".
_TWITTER_HOST_PATTERN = re.compile(r"twitter\.com")


def robot_valid_href(href):
    """Return True when ``href`` is a link the crawler is allowed to follow.

    Intended for use as a BeautifulSoup attribute filter
    (``soup.find_all(href=robot_valid_href)``), so ``href`` may be ``None``
    for tags without the attribute.

    A link is accepted when it is non-empty, either contains the literal text
    "twitter.com" or starts with "/", and matches none of the patterns in
    ``_DISALLOWED_HREF_PATTERNS``.

    Args:
        href: The attribute value to test (``str`` or ``None``).

    Returns:
        bool: ``True`` if the link may be crawled, ``False`` otherwise.
    """
    if not href:
        return False
    # Note still must abide by crawl-delay = 1
    # NOTE(review): "twitter.com" is matched anywhere in the string, so a
    # foreign URL that merely mentions it (e.g. in a query string) still passes.
    if not (_TWITTER_HOST_PATTERN.search(href) or href[0] == "/"):
        return False
    return not any(pattern.search(href) for pattern in _DISALLOWED_HREF_PATTERNS)

In [2]:
def parsePage(url, basePage=startPage):
    """Fetch ``url`` from ``basePage`` over HTTPS and parse the body as HTML.

    Args:
        url: Request target passed straight to ``HTTPSConnection.request``.
        basePage: Host to connect to; defaults to ``startPage``.

    Returns:
        BeautifulSoup: The response body parsed with ``"html.parser"``.

    Note:
        The HTTP status code is not inspected, so error pages and redirects
        are parsed like any other response.
    """
    conn1 = hc.HTTPSConnection(basePage)
    try:
        conn1.set_debuglevel(0)
        conn1.request("GET", url)
        resp = conn1.getresponse()
        # The body is read in full here, before ``finally`` closes the socket.
        return BeautifulSoup(resp.read(), "html.parser")
    finally:
        # Always release the socket; previously one connection leaked per page.
        conn1.close()

In [3]:
def scrapeUrls(soup):
    """Collect every crawlable link on a parsed page as a list of strings.

    Tags are selected with ``robot_valid_href`` as the ``href`` filter.
    Links that start with "/" are prefixed with ``"https://" + startPage``;
    all other links are returned unchanged, in document order.
    """
    hrefs = [tag["href"] for tag in soup.find_all(href=robot_valid_href)]
    return [
        ("https://" + startPage + href) if href[0] == "/" else href
        for href in hrefs
    ]

In [26]:
def tweetFilter(tag):
    """Tag filter: True only for tags whose ``data-item-type`` is "tweet"."""
    if not tag.has_attr("data-item-type"):
        return False
    return tag["data-item-type"] == "tweet"

def scrapeTweet(soup):
    """Extract tweets from a parsed page.

    Every tag accepted by ``tweetFilter`` is inspected. The first ``<div>``
    inside it is expected to carry the ``data-reply-to-users-json`` and
    ``data-permalink-path`` attributes and to contain a ``<p>`` whose class
    matches "tweet-text".

    Items that do not have this shape (missing div, attribute or text
    paragraph, or user JSON that is malformed, empty or lacks the expected
    keys) are skipped rather than aborting the whole page.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        list[tuple]: One ``(id_str, screen_name, name, permalink_path, text)``
        tuple per tweet. The user fields come from the first entry of the
        decoded ``data-reply-to-users-json`` list, and newlines, tabs and
        carriage returns in the text are replaced by spaces.
    """
    tweetTextClass = re.compile("tweet-text")  # hoisted: same pattern for every tag
    tweets = []
    for tag in soup.find_all(tweetFilter):
        # Get immediate subclass with tweet data
        dataTag = tag.find("div")
        if dataTag is None:
            continue
        if not dataTag.has_attr("data-reply-to-users-json"):
            continue
        if not dataTag.has_attr("data-permalink-path"):
            continue

        tweetTag = dataTag.find("p", class_=tweetTextClass)
        if tweetTag is None:
            continue

        try:
            userJsonData = jsonDecoder.decode(dataTag["data-reply-to-users-json"])
            primaryUser = userJsonData[0]
            userFields = (
                primaryUser["id_str"],
                primaryUser["screen_name"],
                primaryUser["name"],
            )
        except (ValueError, LookupError, TypeError):
            # ValueError: invalid JSON; LookupError: empty list or missing key;
            # TypeError: decoded value is not a list of dicts.
            continue

        permaPath = dataTag["data-permalink-path"]
        tweetText = re.sub(r"(\n|\t|\r)", " ", tweetTag.get_text())
        tweets.append(userFields + (permaPath, tweetText))
    return tweets
        
    

In [27]:
def processUrl(url):
    """Download one page and return ``(links, tweets)`` scraped from it.

    The page is fetched once with ``parsePage``; links come from
    ``scrapeUrls`` and tweets from ``scrapeTweet``, in that order.
    """
    page = parsePage(url)
    foundUrls = scrapeUrls(page)
    foundTweets = scrapeTweet(page)
    return foundUrls, foundTweets

In [28]:
def _writeLinesAtomically(path, lines):
    """Replace the file at ``path`` with one line per item of ``lines``.

    The content is first written to ``path + ".tmp"`` and then moved over the
    target with ``os.replace``, so an interruption during the write leaves the
    previous file intact instead of a truncated one.
    """
    import os  # local import: the notebook's import cell does not import os

    tmpPath = path + ".tmp"
    with open(tmpPath, "w") as tmpFile:
        tmpFile.writelines(line + "\n" for line in lines)
    os.replace(tmpPath, path)


def saveOutData(tweetList, urlQueue : deque, repoSet):
    """Persist a crawl checkpoint under ``crawlData/``.

    Args:
        tweetList: Iterable of tweet rows; each row is appended to
            ``crawlData/output.csv`` with every field quoted.
        urlQueue: URLs still to be crawled; overwrites
            ``crawlData/urlFrontier.txt`` (one URL per line, queue order).
        repoSet: URLs already seen; overwrites ``crawlData/urlRepo.txt``
            (one URL per line).

    The ``crawlData`` directory must already exist. None of the arguments is
    modified.
    """
    # Write out output file. newline="" is what the csv module requires so the
    # writer controls line endings itself (avoids blank rows on Windows).
    with open("crawlData/output.csv", "a", encoding="utf-8", newline="") as outFile:
        csv_writer = csv.writer(outFile, quoting=csv.QUOTE_ALL)
        csv_writer.writerows(tweetList)

    # Write out queue (overwrite)
    _writeLinesAtomically("crawlData/urlFrontier.txt", urlQueue)
    # Write out repo (overwrite)
    _writeLinesAtomically("crawlData/urlRepo.txt", repoSet)

def crawlAround(saveEvery=5, crawlDelay=1):
    """Crawl breadth-first from the saved frontier until it is exhausted.

    State is loaded from ``crawlData/urlFrontier.txt`` (URLs still to visit,
    one per line) and ``crawlData/urlRepo.txt`` (URLs already seen). Each
    iteration waits ``crawlDelay`` seconds, processes the URL at the front of
    the queue with ``processUrl``, and enqueues every returned link that is
    not yet in the repo set. Progress is checkpointed with ``saveOutData``
    every ``saveEvery`` pages and once more when the frontier runs empty.

    Args:
        saveEvery: Number of pages processed between checkpoints.
        crawlDelay: Seconds to sleep before each request.

    If the crawl is interrupted between checkpoints, the frontier file on disk
    is the one from the last checkpoint, so it still contains the pages
    fetched since then; their tweets were not yet written to the CSV.
    """
    # Load the urlFrontier into a queue (blank lines are ignored so that an
    # empty string is never requested as a URL).
    with open("crawlData/urlFrontier.txt", "r") as urlFrontier:
        urlFrontierQueue = deque(
            line.rstrip("\n") for line in urlFrontier if line.strip()
        )
    # Load the urlRepo into a set; the newline must be trimmed because set
    # membership is an exact string comparison.
    with open("crawlData/urlRepo.txt", "r") as urlRepo:
        urlRepoSet = {line.rstrip("\n") for line in urlRepo if line.strip()}

    # Start crawling process
    pagesSinceSave = 0
    runningTweetList = []
    # Stop when the frontier is empty instead of raising IndexError on popleft.
    while urlFrontierQueue:
        time.sleep(crawlDelay)
        topUrl = urlFrontierQueue.popleft()
        # Remember the page being crawled so later links to it are not re-queued.
        urlRepoSet.add(topUrl)
        urls, tweetLists = processUrl(topUrl)
        runningTweetList.extend(tweetLists)
        for url in urls:
            if url not in urlRepoSet:
                # We only add ones not already done
                urlFrontierQueue.append(url)  # Adds to the right
                urlRepoSet.add(url)
        pagesSinceSave += 1
        if pagesSinceSave >= saveEvery:
            # Save out queue, repo, and output every once in a while.
            # saveOutData only iterates its arguments, so no copies are needed.
            saveOutData(runningTweetList, urlFrontierQueue, urlRepoSet)
            pagesSinceSave = 0
            runningTweetList = []  # Reset tweet list so no duplicates

    # Frontier exhausted: flush whatever was gathered since the last checkpoint.
    saveOutData(runningTweetList, urlFrontierQueue, urlRepoSet)


In [29]:
# Start the crawl from the frontier/repo files stored under crawlData/.
crawlAround()

KeyboardInterrupt: 