## Data extraction and Preparation

In [3]:
import io, json, requests, time, os, os.path, math, urllib
from sys import stdout
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn import linear_model
from pandas_datareader.data import get_data_yahoo
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# returns python object representation of JSON in response
def get_response(symbol, older_than, retries=5):
    url = 'https://api.stocktwits.com/api/2/streams/symbol/%s.json?max=%d' % (symbol, older_than-1)
    for _ in range(retries):
        response = requests.get(url)
        if response.status_code == 200:
            return json.loads(response.content)
        elif response.status_code == 429:
            print(response.content)
            return None
        time.sleep(1.0)
    # couldn't get response
    return None

In [7]:
# extends the current dataset for a given symbol with more tweets
def get_older_tweets(symbol, num_queries):    
    path = './data/%s.json' % symbol
    if os.path.exists(path):
        # extending an existing json file
        with open(path, 'r') as f:
            data = json.load(f)
            if len(data) > 0:
                older_than = data[-1]['id']
            else:
                older_than = 1000000000000
    else:
        # creating a new json file
        data = []
        older_than = 1000000000000 # any huge number
    
    for i in range(num_queries):
        content = get_response(symbol, older_than)
        if content == None:
            print('Error, an API query timed out')
            break
        data.extend(content['messages'])
        older_than = data[-1]['id']
        stdout.write('\rSuccessfully made query %d' % (i+1))
        stdout.flush()
        # sleep to make sure we don't get throttled
        time.sleep(0.5)
        
    # write the new data to the JSON file
    with open(path, 'w') as f:
        json.dump(data, f)
    print
    print('Done')

In [18]:
# get some data
# apparently a client can only make 200 requests an hour, so we can't get all the data at once

# make data directory if needed
if not os.path.exists('./data'):
    os.mkdir('./data')
    
symbols = symbols = ['TATAMOTORS.NSE']
tweets_per_symbol = 3000
for symbol in symbols:
    path = './data/%s.json' % symbol
    if os.path.exists(path):
        with open(path, 'r') as f:
            num_tweets = len(json.load(f))
    else:
        num_tweets = 0
    num_queries = (tweets_per_symbol - num_tweets - 1)/30 + 1
    num_queries = int(num_queries)
    if num_queries > 0:
        print('Getting tweets for symbol %s'% symbol)
        get_older_tweets(symbol, num_queries)

Getting tweets for symbol TATAMOTORS.NSE
Successfully made query 100Done


In [21]:
# check that we're doing the querying and appending correctly without getting duplicates
# and that message IDs are in descending order
symbol = 'TATAMOTORS.NSE'
with open('./data/%s.json' % symbol, 'r') as f:
    data = json.load(f)
S = set()
old_id = 1000000000000
for message in data:
    message_id = message['id']
    assert message_id not in S
    assert message_id < old_id
    old_id = message_id
    S.add(message_id)
print('Passed')

Passed


In [23]:
# Function takes in a JSON and returns a Pandas DataFrame for easier operation. 
def stocktwits_json_to_df(data, verbose=False):
    #data = json.loads(results)
    columns = ['id','created_at','username','name','user_id','body','basic_sentiment','reshare_count']
    db = pd.DataFrame(index=range(len(data)),columns=columns)
    for i, message in enumerate(data):
        db.loc[i,'id'] = message['id']
        db.loc[i,'created_at'] = message['created_at']
        db.loc[i,'username'] = message['user']['username']
        db.loc[i,'name'] = message['user']['name']
        db.loc[i,'user_id'] = message['user']['id']
        db.loc[i,'body'] = message['body']
        #We'll classify bullish as +1 and bearish as -1 to make it ready for classification training
        try:
            if (message['entities']['sentiment']['basic'] == 'Bullish'):
                db.loc[i,'basic_sentiment'] = 1
            elif (message['entities']['sentiment']['basic'] == 'Bearish'):
                db.loc[i,'basic_sentiment'] = -1
            else:
                db.loc[i,'basic_sentiment'] = 0
        except:
                db.loc[i,'basic_sentiment'] = 0
        db.loc[i,'reshare_count'] = message['reshares']['reshared_count']
        for j, symbol in enumerate(message['symbols']):
                db.loc[i,'symbol'+str(j)] = symbol['symbol']
        if verbose:
            #print message
            print(db.loc[i,:])
    db['created_at'] = pd.to_datetime(db['created_at'])
    return db

In [33]:
# Load tweets for visualizing data
filename = 'TATAMOTORS.NSE'
path = "C:\Users\nanda\PISL project\data\TATAMOTORS.NSE.json" % filename
with open(path, 'r') as f:
    data = json.load(f)
db = stocktwits_json_to_df(data)
print('%d examples extracted ' % db.shape[0])

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (690928719.py, line 3)

In [29]:
import json
import csv

with open('C:\Users\nanda\PISL project\data\TATAMOTORS.NSE.json') as json_file:
	jsondata = json.load(json_file)

data_file = open('C:\Users\nanda\PISL project\data\TATAMOTORS.NSE.csv', 'w', newline='')
csv_writer = csv.writer(data_file)

count = 0
for data in jsondata:
	if count == 0:
		header = data.keys()
		csv_writer.writerow(header)
		count += 1
	csv_writer.writerow(data.values())

data_file.close()


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2877805897.py, line 4)