In [160]:
%matplotlib inline

import re
import json
import time
import random
import datetime
import warnings
import requests
import itertools
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [161]:
#Allows an application to request user authorization. 
def oauth_authorize():    
    url    = "https://api.stocktwits.com/api/2/oauth/authorize"                
    params = urllib.parse.urlencode({"client_id": "f453d9d2f6316db9", 
                                     "response_type": "token",
                                     "redirect_uri": "https://sites.google.com/site/noelnamai/",
                                     "scope": "read,watch_lists,publish_messages,publish_watch_lists,follow_users,follow_stocks"
                                    })    
    file   = urllib.request.urlopen(url, params.encode("UTF-8"))    
    return file.getcode()

In [162]:
#Returns the most recent 30 messages for the specified symbol. Includes symbol object in response.
def stream_symbol(symbol):
    url = "https://api.stocktwits.com/api/2/streams/symbol/" + str(symbol) + ".json"
    try:
        content = requests.get(url).json()
    except Exception as error:
        print(error)
    return content

In [163]:
#Returns the most recent 30 messages with trending symbols in the last 5 minutes.
def stream_trending():             
    url = "https://api.stocktwits.com/api/2/streams/trending.json"
    payload = {"access_token": "e6f40cf7aad1e2aa5dcb14ca6003968495cf8bb3"}
    try:    
        content = requests.get(url, params=payload).json()
    except Exception as error:
        print(error) 
    return content

In [164]:
#Creates a dataframe from JSON data returned by the API.
def create_dataframe(data):    
    code = data["response"]["status"] #Seek code=200        
    response = []                                 
    if code == 200:         
        for tweet in data["messages"]:
            for symbol in tweet["symbols"]:
                row = {"symbol": symbol["symbol"],
                       "title": symbol["title"],
                       "tweet_id": tweet["id"],
                       "text": tweet["body"],
                       "UTC": tweet["created_at"],
                       "official": tweet["user"]["official"],
                       "name": tweet["user"]["name"],
                       "user_id": tweet["user"]["id"],
                       "user_name": tweet["user"]["username"]}
                response.append(row) 
    else: 
        print(code)
        pass   
    df = pd.DataFrame(response)
    return code, df

In [165]:
#Clean the data frame and fill in the missing data
def clean_dataframe(df):
    df.sort_values(by="UTC")
    df = df.drop_duplicates()    
    df = df.dropna()                
    df["index"] = range(len(df))
    df = df.set_index("index")
    df["count"] = df.groupby(["symbol"])["user_name"].transform("count")
    df.to_csv("stocktwits_df.csv", index=False, encoding="utf-8")
    return df

In [166]:
#Get trending symbols and start building a dataframe from them.
def do_stuff(code, t):
    start_time = time.time()
    while code == 200 and time.time() - start_time <= t:
        data = stream_trending()
        code, df = create_dataframe(data)
        try:
            saved_df = pd.read_csv("stocktwits_df.csv", encoding="utf8")
            saved_df = saved_df[["tweet_id","text","UTC","user_id","user_name","name","symbol","title","official"]]
            df = pd.concat([df, saved_df]) 
        except:
            pass        
        time.sleep(6)
        dfx = clean_dataframe(df)     
    return code, dfx

In [167]:
#Call the do_stuff()
t = 3
run = True
code = oauth_authorize()
try:
    code, df = do_stuff(code, t)
except Exception as error:
    print(error) 

In [168]:
#Fill in the data "sector" and "industry" to the main dataframe.
def fill_dataframe(x):   
    symbol = list(set(x["symbol"].values))[0]
    sector = symbol_title["sector"][symbol_title["symbol"] == symbol].values
    industry = symbol_title["industry"][symbol_title["symbol"] == symbol].values
    x["sector"] = sector[0] if len(sector) == 1 else np.nan
    x["industry"] = industry[0] if len(industry) == 1 else np.nan    
    return x

In [169]:
symbol_title = pd.read_csv("symbol_title.csv", index_col=2, sep=",")
symbol_title = symbol_title[["symbol","sector","industry"]]
df = df.groupby("symbol").apply(fill_dataframe); df.head()

Unnamed: 0_level_0,UTC,name,official,symbol,text,title,tweet_id,user_id,user_name,count,sector,industry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2016-09-29T19:12:12Z,Nick Leach,False,CATB,$CATB http://www.streetinsider.com/Analyst+Com...,Catabasis Pharmaceuticals,63704751,760797,nickleach22,191,,
1,2016-09-29T19:12:10Z,SG,False,SPY,"$SPY I trade stocks over the market, setups an...",SPDR S&P 500,63704749,433942,SG7416,536,Financial,Exchange Traded Fund
2,2016-09-29T19:12:10Z,Evora,False,SPY,$SPY will it close under 215 today?,SPDR S&P 500,63704748,569812,Daily_Investor,536,Financial,Exchange Traded Fund
3,2016-09-29T19:12:09Z,San Rensho,False,DB,"$SPY $DB yep, i was just starting 2nd grade",Deutsche Bank AG,63704743,560497,Sanrensho,386,Financial,Foreign Regional Banks
4,2016-09-29T19:12:09Z,San Rensho,False,SPY,"$SPY $DB yep, i was just starting 2nd grade",SPDR S&P 500,63704743,560497,Sanrensho,536,Financial,Exchange Traded Fund


In [183]:
{"df_size": len(df.index), 
 "users": len(list(set(df["user_id"]))), 
 "df_symbols": len(list(set(df["symbol"]))), 
 "all_symbols": len(symbol_title)}

{'all_symbols': 11718, 'df_size': 8876, 'df_symbols': 610, 'users': 1938}

In [184]:
df.UTC[0]

'2016-09-29T19:12:12Z'

In [171]:
network = nx.Graph()

#create a dictionary of {tweet_id : [symbols]}
dict1 = {}
rdf = df.groupby("tweet_id")
for tweet_id in list(set(df.tweet_id.values)):
    dict1[tweet_id] = list(rdf.get_group(tweet_id).title.values) 

#Create and save the weighted graph for use in Gephi
for key, value in dict1.items():
    for tup in list(itertools.combinations(value, 2)):        
        source = tup[0]
        target = tup[1] 
        if source != target:
            if network.has_edge(source, target) == True:            
                network[source][target]["weight"] += 1
                pass
            else:
                network.add_nodes_from([source, target])
                network.add_edge(source, target, weight=1)
                
nx.write_gexf(network, "symbol_graph.gexf")

In [172]:
#save the unweighted graph for use in R
node_list = list(network.nodes())    
data = [{"source":node_list.index(source), "target":node_list.index(target), "value":1.0} for (source, target) in network.edges()] 

links = pd.DataFrame(data)
links.to_csv("Rlinks.csv", index=False, encoding="utf-8")

nodes = pd.DataFrame(node_list, columns=["name"])
nodes.to_csv("Rnodes.csv", index=False, encoding="utf-8")