In [1]:

import requests
import urllib.request
import time, json, os, traceback
from json import JSONDecodeError
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
from collections import deque

class StockTwitsAPIScraper:
    """Scrape a StockTwits symbol stream backwards and dump it to JSON files.

    Starting from ``maxId`` the scraper repeatedly requests
    ``.../streams/symbol/<symbol>.json?max=<maxId>``, keeps the id, body,
    creation time and basic sentiment of every message, and writes each page
    to ``stocks/<symbol>/<id of the last tweet on the page>.json``. Paging
    stops at the first message created before ``date``.

    Requests are throttled to ``size`` requests per ``duration`` seconds
    (see ``setLimits``); until ``setLimits`` is called the limit is
    200 requests per 3600 seconds.
    """

    # Seconds to wait for an HTTP response before the request is abandoned.
    REQUEST_TIMEOUT = 30
    # Pause after the API answered with a non-200 status, before retrying.
    ERROR_WAIT_SECONDS = 60
    # Consecutive non-200 answers tolerated before the crawl is aborted.
    MAX_CONSECUTIVE_ERRORS = 60

    def __init__(self, symbol, date, maxId):
        """Prepare a scraper and create its output directory.

        Args:
            symbol: ticker symbol inserted into the stream URL.
            date: naive ``datetime``; messages created before it end the crawl.
            maxId: tweet ID used as the ``max`` parameter of the first request.
        """
        self.symbol = symbol
        self.link = "https://api.stocktwits.com/api/2/streams/symbol/{}.json?".format(symbol)
        self.targetDate = date
        self.tweets = []
        # (sic) the misspelled attribute name is kept so existing code that
        # touches it keeps working.
        self.reqeustQueue = deque()
        self.maxId = maxId
        # ID of the last tweet already written to disk (None before the first write).
        self.lastSavedId = None
        # Number of non-200 answers received in a row.
        self.errorCount = 0
        # Usable defaults so the scraper works even if setLimits() is never called.
        self.setLimits(200, 3600)
        self.initDir()

    def setLimits(self, size, duration):
        """Allow at most ``size`` requests per ``duration`` seconds.

        Also derives ``requestInterval``, the pause between two requests:
        ``duration / size`` rounded up.

        Raises:
            ValueError: if ``size`` is not positive.
        """
        if size <= 0:
            raise ValueError("size must be a positive number of requests.")
        self.size = size
        self.duration = duration
        quotient, remainder = divmod(duration, size)
        self.requestInterval = quotient + 1 if remainder else quotient

    def initDir(self):
        """Create ``stocks/<symbol>`` if it does not exist yet."""
        os.makedirs(os.path.join("stocks", self.symbol), exist_ok=True)

    def writeJson(self):
        """Write the buffered tweets to disk and advance ``maxId``.

        The file is named after the ID of the last buffered tweet, so the
        newest file name is also the ID to resume from after a crash.
        Does nothing when the buffer is empty.
        """
        if not self.tweets:
            return
        self.maxId = self.tweets[-1]["id"]
        self.lastSavedId = self.maxId
        fileName = "stocks/{}/{}.json".format(self.symbol, self.maxId)
        with open(fileName, "w") as f:
            json.dump(self.tweets, f)

    def getCurrentUrl(self):
        """Return the stream URL for the page starting at the current ``maxId``."""
        return self.link + "max={}".format(self.maxId)

    def requestManager(self):
        """Block until another request fits into the rate-limit window.

        ``reqeustQueue`` holds the timestamps of the most recent requests.
        Once ``size`` of them are recorded, the oldest one is removed and, if it
        is still younger than ``duration`` seconds, the call sleeps until it has
        aged out (plus one second of slack).
        """
        # ">=" rather than "==": the queue can be longer than size when
        # setLimits() lowered the limit after requests were already recorded.
        if len(self.reqeustQueue) < self.size:
            return
        while len(self.reqeustQueue) > self.size:
            self.reqeustQueue.popleft()
        firstRequest = self.reqeustQueue.popleft()
        windowEnd = firstRequest + timedelta(seconds=self.duration)
        now = datetime.now()
        if now < windowEnd:
            waitTime = (windowEnd - now).total_seconds() + 1
            print("Reach request limit, wait for {} seconds.".format(waitTime))
            sleep(waitTime)

    def getMessages(self, url):
        """Fetch one page and buffer its messages in ``self.tweets``.

        Returns:
            True if the caller should request another page (this includes
            "retry later" after a Bad Gateway or a non-200 API status),
            False if the crawl is finished: the target date was reached or the
            page contained nothing new.

        Raises:
            Exception: if the body is not JSON (and not a Bad Gateway page), or
                after ``MAX_CONSECUTIVE_ERRORS`` non-200 answers in a row.
        """
        self.requestManager()

        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        self.reqeustQueue.append(datetime.now())
        try:
            data = json.loads(response.text)
        except JSONDecodeError:
            if "Bad Gateway" in response.text:
                print("Just a Bad Gateway, wait for 1 minute.")
                sleep(60)
                return True
            print(len(self.reqeustQueue))
            print(self.reqeustQueue[0], datetime.now())
            print(url)
            print(response.text)
            print(traceback.format_exc())
            raise Exception("Something worong with the response.")

        status = None
        if isinstance(data, dict):
            status = (data.get("response") or {}).get("status")
        if status != 200:
            print(response.text)
            # Retry after a pause, but not forever: a permanent error would
            # otherwise re-request the same URL endlessly.
            self.errorCount += 1
            if self.errorCount >= self.MAX_CONSECUTIVE_ERRORS:
                raise Exception("Too many consecutive error responses for {}.".format(url))
            sleep(self.ERROR_WAIT_SECONDS)
            return True

        self.errorCount = 0
        return self._storeMessages(data.get("messages") or [])

    def _storeMessages(self, messages):
        """Buffer the messages of one page; return False when the crawl should stop."""
        epoch = datetime(1970, 1, 1)
        added = 0
        for m in messages:
            createdAt = datetime.strptime(m["created_at"], "%Y-%m-%dT%H:%M:%SZ")
            # Naive comparison: targetDate is taken to be in the same terms as
            # created_at. NOTE(review): confirm callers pass a UTC date.
            if createdAt < self.targetDate:
                return False
            # Skip the tweet the previous page ended on, in case the API returns
            # the boundary message again.
            # NOTE(review): presumably `max` is inclusive -- confirm against the API docs.
            if m["id"] == self.lastSavedId:
                continue
            sentiment = (m.get("entities") or {}).get("sentiment") or {}
            self.tweets.append({
                "id": m["id"],
                "text": m["body"],
                # created_at ends in "Z" (UTC). datetime.timestamp() on a naive
                # value would assume the machine's local zone, so the epoch is
                # computed against a UTC origin instead.
                "time": (createdAt - epoch).total_seconds(),
                "sentiment": sentiment.get("basic", ""),
            })
            added += 1
        # Nothing new means maxId cannot advance; asking again would return the
        # same page, so stop.
        return added > 0

    def getTweetsAndWriteToFile(self):
        """Fetch one page, persist it, and report whether to keep going.

        Tweets buffered before the stop condition was hit are written as well,
        so the final partial page is not lost.
        """
        keepGoing = self.getMessages(self.getCurrentUrl())
        if self.tweets:
            self.writeJson()
            print("Scrap {} tweets starting from {}.".format(len(self.tweets), self.maxId))
            self.tweets.clear()
        if not keepGoing:
            return False
        sleep(self.requestInterval)
        return True

    def scrapTweets(self):
        """Page backwards until finished; print the traceback of any Exception."""
        try:
            while self.getTweetsAndWriteToFile():
                pass
        except Exception:
            print(traceback.format_exc())

# symbol = input("Enter stock symbol: ")
# print("This scraper scrapes tweets backward.\n\
# The ID you enter belongs to the most recent tweet you're going to scrape.\n\
# The scraper will then keep going backward to scrape older tweets.")
# maxId = input("Enter the starting tweet ID: ")
# targetDate = input("Enter the earliest date (mmddyyyy): ")
# print("You can only send 200 requests to StockTwits in an hour.")
# requestLimit = input("Enter the limit of number of requests within an hour: ")

# scraper = StockTwitsAPIScraper(symbol, datetime.strptime(targetDate, "%m%d%Y"), int(maxId))
# scraper.setLimits(int(requestLimit), 3600)
# scraper.scrapTweets()

In [2]:
# Scrape the MSFT stream backwards from tweet 455026845 until tweets older than
# 02/14/2022 show up, throttled to 200 requests per 3600 seconds.
# (Alternative starting ID kept for reference: 454712371.)
symbol, maxId, requestLimit = 'MSFT', 455026845, 200
targetDate = datetime.strptime('02142022', "%m%d%Y")
scraper = StockTwitsAPIScraper(symbol=symbol, date=targetDate, maxId=maxId)
scraper.setLimits(size=requestLimit, duration=3600)
scraper.scrapTweets()

Scrap 30 tweets starting from 455022642.
Scrap 30 tweets starting from 455019139.
Scrap 30 tweets starting from 455015171.
Scrap 30 tweets starting from 455011465.
Scrap 30 tweets starting from 455007977.
Scrap 30 tweets starting from 455005807.
Scrap 30 tweets starting from 455003853.
Scrap 30 tweets starting from 455001753.
Scrap 30 tweets starting from 454999540.
Scrap 30 tweets starting from 454997633.
Scrap 30 tweets starting from 454995984.
Scrap 30 tweets starting from 454994564.
Scrap 30 tweets starting from 454993571.
Scrap 30 tweets starting from 454992639.
Scrap 30 tweets starting from 454991947.
Scrap 30 tweets starting from 454991560.
Scrap 30 tweets starting from 454991145.
Scrap 30 tweets starting from 454990450.
Scrap 30 tweets starting from 454989880.
Scrap 30 tweets starting from 454989124.
Scrap 30 tweets starting from 454988701.
Scrap 30 tweets starting from 454988310.
Scrap 30 tweets starting from 454987962.
Scrap 30 tweets starting from 454987462.
Scrap 30 tweets 

Scrap 30 tweets starting from 451893253.
Scrap 30 tweets starting from 451860091.
Scrap 30 tweets starting from 451811317.
Scrap 30 tweets starting from 451790659.
Scrap 30 tweets starting from 451774356.
Scrap 30 tweets starting from 451749137.
Scrap 30 tweets starting from 451729353.
Scrap 30 tweets starting from 451718363.
Scrap 30 tweets starting from 451695227.
Scrap 30 tweets starting from 451673127.
Scrap 30 tweets starting from 451643780.
Scrap 30 tweets starting from 451611433.
Scrap 30 tweets starting from 451576111.
Scrap 30 tweets starting from 451537269.
Scrap 30 tweets starting from 451504730.
Scrap 30 tweets starting from 451479733.
Scrap 30 tweets starting from 451460886.
Scrap 30 tweets starting from 451437867.
Scrap 30 tweets starting from 451423771.
Scrap 30 tweets starting from 451411903.
Scrap 30 tweets starting from 451398534.
Scrap 30 tweets starting from 451387909.
Scrap 30 tweets starting from 451380727.
Scrap 30 tweets starting from 451374154.
Scrap 30 tweets 

Scrap 30 tweets starting from 441236552.
Scrap 30 tweets starting from 441174315.
Scrap 30 tweets starting from 441123945.
Scrap 30 tweets starting from 441058104.
Scrap 30 tweets starting from 440966547.
Scrap 30 tweets starting from 440909292.
Scrap 30 tweets starting from 440833694.
Scrap 30 tweets starting from 440782261.
Scrap 30 tweets starting from 440705252.
Scrap 30 tweets starting from 440635905.
Scrap 30 tweets starting from 440563268.
Scrap 30 tweets starting from 440454529.
Scrap 30 tweets starting from 440355935.
Scrap 30 tweets starting from 440281548.
Scrap 30 tweets starting from 440243822.
Scrap 30 tweets starting from 440221838.
Scrap 30 tweets starting from 440168767.
Scrap 30 tweets starting from 440078921.
Scrap 30 tweets starting from 440001043.
Scrap 30 tweets starting from 439932032.
Scrap 30 tweets starting from 439857608.
Scrap 30 tweets starting from 439808307.
Scrap 30 tweets starting from 439765380.
Scrap 30 tweets starting from 439722579.
Scrap 30 tweets 

In [13]:
# MSFT symbol-stream URL with max=454712371; fetched by hand in the next cell.
url = 'https://api.stocktwits.com/api/2/streams/symbol/MSFT.json?max=454712371'

In [14]:
# Fetch the URL; the timeout keeps a stalled connection from hanging the kernel.
response = requests.get(url, timeout=30)

In [15]:
# Parse the response body as JSON.
data = json.loads(response.text)

In [18]:
# Display the 'messages' list of the parsed response.
data['messages']

[{'id': 454711940,
  'body': '$DRIV Autonomous EV ETF bottom finally? Holdings like $GOOG $NVDA $MSFT $QCOM',
  'created_at': '2022-04-26T02:08:30Z',
  'user': {'id': 2262315,
   'username': 'MrBlueHorseshoe',
   'name': 'Blue Horseshoe',
   'avatar_url': 'https://avatars.stocktwits.com/production/2262315/thumb-1647715759.png',
   'avatar_url_ssl': 'https://avatars.stocktwits.com/production/2262315/thumb-1647715759.png',
   'join_date': '2019-08-17',
   'official': False,
   'identity': 'User',
   'classification': [],
   'followers': 170,
   'following': 6,
   'ideas': 3116,
   'watchlist_stocks_count': 82,
   'like_count': 973,
   'plus_tier': '',
   'premium_room': '',
   'trade_app': False,
   'trade_status': None},
  'source': {'id': 2269,
   'title': 'StockTwits Web',
   'url': 'https://stocktwits.com'},
  'symbols': [{'id': 1632,
    'symbol': 'DRIV',
    'title': 'Global X Funds - Global X Autonomous & Electric Vehicles ETF',
    'aliases': [],
    'is_following': False,
    'w

In [2]:
# NVDA run: page backwards from tweet 455279252 down to 02/14/2022.
# (Alternative starting ID kept for reference: 454712371.)
symbol = 'NVDA'
maxId = 455279252
requestLimit = 200  # requests allowed per 3600-second window
targetDate = datetime.strptime('02142022', "%m%d%Y")

scraper = StockTwitsAPIScraper(symbol, date=targetDate, maxId=maxId)
scraper.setLimits(requestLimit, duration=3600)
scraper.scrapTweets()

Scrap 30 tweets starting from 455258400.
Scrap 30 tweets starting from 455240541.
Scrap 30 tweets starting from 455227886.
Scrap 30 tweets starting from 455208690.
Scrap 30 tweets starting from 455193210.
Scrap 30 tweets starting from 455177564.
Scrap 30 tweets starting from 455162925.
Scrap 30 tweets starting from 455152402.
Scrap 30 tweets starting from 455138071.
Scrap 30 tweets starting from 455125614.
Scrap 30 tweets starting from 455117254.
Scrap 30 tweets starting from 455103316.
Scrap 30 tweets starting from 455080577.
Scrap 30 tweets starting from 455056430.
Scrap 30 tweets starting from 455039834.
Scrap 30 tweets starting from 455018814.
Scrap 30 tweets starting from 455004919.
Scrap 30 tweets starting from 454991723.
Scrap 30 tweets starting from 454985250.
Scrap 30 tweets starting from 454979947.
Scrap 30 tweets starting from 454974299.
Scrap 30 tweets starting from 454965403.
Scrap 30 tweets starting from 454956699.
Scrap 30 tweets starting from 454946004.
Scrap 30 tweets 

In [3]:
# Scrape several symbols one after another, all back to the same target date
# and under the same rate limit.
targetDate = datetime.strptime('02142022', "%m%d%Y")
requestLimit = 200

# (symbol, ID of the most recent tweet to start from, company name for the log line)
scrapeJobs = [
    ('AAPL', 455329745, "Apple"),
    ('AVGO', 454496190, "Broadcom"),
    ('CSCO', 454748269, "Cisco"),
    ('ORCL', 454749496, "Oracle"),
]

for symbol, maxId, company in scrapeJobs:
    scraper = StockTwitsAPIScraper(symbol, targetDate, maxId)
    scraper.setLimits(requestLimit, 3600)
    try:
        scraper.scrapTweets()
    except (Exception, KeyboardInterrupt):
        # scrapTweets() already catches and prints Exception itself, so in
        # practice this fires on a manual interrupt: the current symbol is
        # abandoned and the loop moves on to the next one. Unlike a bare
        # `except:`, SystemExit and GeneratorExit are no longer swallowed.
        print("Done with {}.".format(company))

Scrap 30 tweets starting from 455324182.
Scrap 30 tweets starting from 455318785.
Scrap 30 tweets starting from 455312099.
Scrap 30 tweets starting from 455306198.
Scrap 30 tweets starting from 455300202.
Scrap 30 tweets starting from 455292089.
Scrap 30 tweets starting from 455285557.
Scrap 30 tweets starting from 455278425.
Scrap 30 tweets starting from 455271908.
Scrap 30 tweets starting from 455264857.
Scrap 30 tweets starting from 455259142.
Scrap 30 tweets starting from 455252329.
Scrap 30 tweets starting from 455241642.
Scrap 30 tweets starting from 455232385.
Scrap 30 tweets starting from 455224897.
Scrap 30 tweets starting from 455214435.
Scrap 30 tweets starting from 455202739.
Scrap 30 tweets starting from 455193144.
Scrap 30 tweets starting from 455184329.
Scrap 30 tweets starting from 455178800.
Scrap 30 tweets starting from 455171601.
Scrap 30 tweets starting from 455161894.
Scrap 30 tweets starting from 455155327.
Scrap 30 tweets starting from 455149269.
Scrap 30 tweets 

Scrap 30 tweets starting from 453426147.
Scrap 30 tweets starting from 453415311.
Scrap 30 tweets starting from 453404984.
Scrap 30 tweets starting from 453394319.
Scrap 30 tweets starting from 453379768.
Scrap 30 tweets starting from 453371027.
Scrap 30 tweets starting from 453356133.
Scrap 30 tweets starting from 453344360.
Scrap 30 tweets starting from 453327750.
Scrap 30 tweets starting from 453308828.
Scrap 30 tweets starting from 453289214.
Scrap 30 tweets starting from 453280185.
Scrap 30 tweets starting from 453264723.
Scrap 30 tweets starting from 453252613.
Scrap 30 tweets starting from 453246686.
Scrap 30 tweets starting from 453237287.
Scrap 30 tweets starting from 453228571.
Scrap 30 tweets starting from 453218807.
Scrap 30 tweets starting from 453211109.
Scrap 30 tweets starting from 453193382.
Scrap 30 tweets starting from 453176397.
Scrap 30 tweets starting from 453162432.
Scrap 30 tweets starting from 453151188.
Scrap 30 tweets starting from 453137859.
Scrap 30 tweets 

Scrap 30 tweets starting from 441687287.
Scrap 30 tweets starting from 440830778.
Scrap 30 tweets starting from 439649452.
Scrap 30 tweets starting from 438601846.
Scrap 30 tweets starting from 437867415.
Scrap 30 tweets starting from 437178645.
Scrap 30 tweets starting from 436422512.


In [5]:
# Re-run the Oracle scrape from a different starting tweet ID.
# targetDate and requestLimit are not defined in this cell; they must already
# exist in the kernel from an earlier cell.
symbol = 'ORCL'
# maxId = 454749496
maxId = 454754930
scraper = StockTwitsAPIScraper(symbol, targetDate, maxId)
scraper.setLimits(requestLimit, 3600)
try:
    scraper.scrapTweets()
except (Exception, KeyboardInterrupt):
    # scrapTweets() handles Exception internally, so this mainly catches a
    # manual interrupt. Unlike a bare `except:`, SystemExit and GeneratorExit
    # are no longer swallowed.
    print("Done with Oracle.")

Scrap 30 tweets starting from 454218442.
Scrap 30 tweets starting from 453353537.
Scrap 30 tweets starting from 452890889.
Scrap 30 tweets starting from 451981465.
Scrap 30 tweets starting from 451505144.
Scrap 30 tweets starting from 450529123.
Scrap 30 tweets starting from 449825942.
Scrap 30 tweets starting from 449061875.
Scrap 30 tweets starting from 447935206.
Scrap 30 tweets starting from 446707699.
Scrap 30 tweets starting from 445702136.
Scrap 30 tweets starting from 444809140.
Scrap 30 tweets starting from 444448018.
Scrap 30 tweets starting from 444226059.
Scrap 30 tweets starting from 443816510.
Scrap 30 tweets starting from 443509539.
Scrap 30 tweets starting from 443399417.
Scrap 30 tweets starting from 443333990.
Scrap 30 tweets starting from 443255983.
Scrap 30 tweets starting from 443219238.
Scrap 30 tweets starting from 443176856.
Scrap 30 tweets starting from 443145847.
Scrap 30 tweets starting from 443128657.
Scrap 30 tweets starting from 443121876.
Scrap 30 tweets 

In [None]:
# Possible cause: the scraper does not sleep after hitting an exception; consider lowering the request rate.