In [109]:
%matplotlib inline

import re
import json
import time
import random
import os.path
import requests
import warnings
import itertools
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from datetime import datetime

In [110]:
#Load the symbol reference table; the third CSV column becomes the row index
#and only the symbol/sector/industry columns are kept.
symbol_title = pd.read_csv("symbol_title.csv", sep=",", index_col=2)
symbol_title = symbol_title.loc[:, ["symbol", "sector", "industry"]]
symbol_title.head()

Unnamed: 0_level_0,symbol,sector,industry
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAC Holdings,AAC,Conglomerates,Conglomerates
Almaden Minerals Ltd.,AAU,Basic Materials,Industrial Metals & Minerals
American Biltrite Inc.,ABL,Consumer Goods,Rubber & Plastics
Acme United Corp.,ACU,Consumer Goods,Office Supplies
AeroCentury Corp.,ACY,Services,Rental & Leasing Services


In [111]:
#API credentials table; other cells read security["client_id"][0] and security["access_token"][0].
security = pd.read_csv(filepath_or_buffer="security.csv", encoding="utf8")

In [112]:
#Allows an application to request user authorization.
def oauth_authorize():
    """Send the OAuth authorize request and return the open response object.

    The client id is read from the first row of the module-level ``security``
    table. The form fields are url-encoded, UTF-8 encoded and handed to
    ``urlopen`` as the request body.

    Returns:
        The response object produced by ``urllib.request.urlopen``; the
        caller is responsible for reading/closing it.
    """
    endpoint = "https://api.stocktwits.com/api/2/oauth/authorize"
    form_fields = {
        "client_id": security["client_id"][0],
        "response_type": "token",
        "redirect_uri": "https://sites.google.com/site/noelnamai/",
        "scope": "read,watch_lists,publish_messages,publish_watch_lists,follow_users,follow_stocks",
    }
    request_body = urllib.parse.urlencode(form_fields).encode("UTF-8")
    return urllib.request.urlopen(endpoint, request_body)

In [113]:
#Issue the authorization request and keep the response object returned by urlopen.
oauth = oauth_authorize()

In [114]:
#Returns the most recent 30 messages for the specified symbol. Includes symbol object in response.
def stream_symbol(symbol, timeout=30):
    """Fetch the StockTwits message stream for one symbol as parsed JSON.

    Args:
        symbol: Ticker symbol; converted with ``str()`` and placed in the URL path.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the body is not valid JSON. The
            original error is chained as ``__cause__``.
    """
    url = "https://api.stocktwits.com/api/2/streams/symbol/" + str(symbol) + ".json"
    try:
        content = requests.get(url, timeout=timeout).json()
    except Exception as error:
        # str(error) is required: concatenating the exception object itself
        # raises TypeError and hides the real failure.
        raise Exception("Method: stream_symbol " + "Error: " + str(error)) from error
    return content

In [115]:
#Returns the most recent 30 messages with trending symbols in the last 5 minutes.
def stream_trending(timeout=30):
    """Fetch the StockTwits trending stream as parsed JSON.

    The access token is read from the first row of the module-level
    ``security`` table and sent as the ``access_token`` query parameter.

    Args:
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the body is not valid JSON. The
            original error is chained as ``__cause__``.
    """
    url = "https://api.stocktwits.com/api/2/streams/trending.json"
    payload = {"access_token": security["access_token"][0]}
    try:
        content = requests.get(url, params=payload, timeout=timeout).json()
    except Exception as error:
        # str(error) is required: concatenating the exception object itself
        # raises TypeError and hides the real failure.
        raise Exception("Method: stream_trending " + "Error: " + str(error)) from error
    return content

In [116]:
#Returns the status code and the symbols mentioned in the current trending stream.
def get_symbols():
    """Collect every symbol referenced by the trending-stream messages.

    Returns:
        tuple: ``(code, symbols)`` where ``code`` is ``response.status`` from
        the payload and ``symbols`` is a list of ticker strings in message
        order. A symbol mentioned by several messages appears several times.
    """
    payload = stream_trending()
    status = payload["response"]["status"]
    mentioned = [
        entry["symbol"]
        for message in payload["messages"]
        for entry in message["symbols"]
    ]
    return status, mentioned

In [117]:
#Creates a dataframe from JSON data returned by the API.
def create_dataframe(data):
    """Flatten an API payload into one row per (message, symbol) pair.

    Args:
        data: Decoded JSON payload. Must contain ``response.status``; on
            status 200 it must also contain ``messages``.

    Returns:
        tuple: ``(code, df)`` where ``code`` is the payload status and ``df``
        has the columns symbol, title, tweet_id, text, date (``dd-mm-YYYY``),
        time (``HH:MM:SS``), name, user_id and user_name. ``df`` is empty
        (no columns) when no message references a symbol.

    Raises:
        Exception: If the status is not 200. The text carries the API error
            message when the payload has one, otherwise the status code.
    """
    code = data["response"]["status"]
    if code != 200:
        # The error payload is not guaranteed to be present or well formed;
        # fall back to the status code rather than failing with a KeyError.
        errors = data.get("errors") or [{}]
        first_error = errors[0] if isinstance(errors[0], dict) else {}
        message = first_error.get("message", "status " + str(code))
        raise Exception("Method: create_dataframe " + "Error: " + str(message))

    response = []
    for tweet in data["messages"]:
        # Messages that reference no symbol contribute no rows.
        tweet_symbols = tweet.get("symbols") or []
        if not tweet_symbols:
            continue
        # Parse the timestamp once per message instead of twice per row.
        created = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%SZ")
        date_text = created.strftime("%d-%m-%Y")
        time_text = created.strftime("%H:%M:%S")
        user = tweet["user"]
        for symbol in tweet_symbols:
            response.append({"symbol": symbol["symbol"],
                             "title": symbol["title"],
                             "tweet_id": tweet["id"],
                             "text": tweet["body"],
                             "date": date_text,
                             "time": time_text,
                             "name": user["name"],
                             "user_id": str(user["id"]),
                             "user_name": str(user["username"])})
    df = pd.DataFrame(response)
    return code, df

In [118]:
#Clean the data frame: drop duplicate and incomplete rows, then renumber.
def clean_dataframe(df):
    """Return a de-duplicated, NaN-free copy of ``df`` with a fresh index.

    Rows that are exact duplicates or contain any missing value are removed,
    ``tweet_id`` is cast to ``int``, and the rows are renumbered 0..n-1 on an
    index named ``"index"``. The input frame is not modified.
    """
    kept = df.drop_duplicates().dropna()
    return (
        kept
        .assign(index=range(len(kept)))
        .astype({"tweet_id": "int"})
        .set_index("index")
    )

In [119]:
#Fill in the data "sector" and "industry" to the main dataframe.
def fill_dataframe(x):
    """Attach ``sector`` and ``industry`` columns to one symbol's rows.

    The symbol is taken from the ``symbol`` column of ``x`` and looked up in
    the module-level ``symbol_title`` table. Each value is filled in only
    when exactly one reference row matches; otherwise the column is NaN.
    ``x`` is modified in place and also returned.
    """
    symbol = list(set(x["symbol"].values))[0]
    is_match = symbol_title["symbol"] == symbol
    sector_hits = symbol_title.loc[is_match, "sector"].values
    industry_hits = symbol_title.loc[is_match, "industry"].values

    if len(sector_hits) == 1:
        x["sector"] = sector_hits[0]
    else:
        x["sector"] = np.nan

    if len(industry_hits) == 1:
        x["industry"] = industry_hits[0]
    else:
        x["industry"] = np.nan
    return x

In [120]:
#Read the accumulated data frame. Create a new, empty one if none exists yet.
if os.path.isfile("stocktwits_df.csv"):
    df = pd.read_csv("stocktwits_df.csv", encoding="utf8")
else:
    df = pd.DataFrame(columns=["date","name","symbol","text","time","title","tweet_id","user_id","user_name"])

#Get trending symbols and start building a dataframe from them.
code, symbols = get_symbols()
#A symbol mentioned by several trending messages is listed several times;
#fetch each stream only once (every fetch costs a request plus the pause below).
symbols = list(dict.fromkeys(symbols))
np.random.shuffle(symbols)
for symbol in symbols:
    data = stream_symbol(symbol)
    code, df2 = create_dataframe(data)
    #DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    df = pd.concat([df, df2])
    df = clean_dataframe(df)
    #Persist after every symbol so an interrupted run keeps what it has collected.
    df.to_csv("stocktwits_df.csv", index=False, encoding="utf-8")
    #Pause between requests to stay under the API rate limit.
    time.sleep(20)

In [121]:
#Add sector/industry per symbol. group_keys=False keeps the original row index:
#pandas >= 2.0 otherwise adds "symbol" as an index level, which makes the
#groupby on the "symbol" column below ambiguous.
df = df.groupby("symbol", group_keys=False).apply(fill_dataframe)
#Number of rows collected for each symbol, repeated on every row of that symbol.
df["count"] = df.groupby(["symbol"])["user_name"].transform("count")
df[["date","name","symbol","time","title","tweet_id","user_id","user_name"]].head()

Unnamed: 0_level_0,date,name,symbol,time,title,tweet_id,user_id,user_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,06-10-2016,Daniel Barrett,AUPH,19:47:40,Aurinia Pharmaceuticals Inc.,64199437,720914,DB868
1,06-10-2016,BryanU,AUPH,19:47:37,Aurinia Pharmaceuticals Inc.,64199433,849508,BearRugManufacturer
2,06-10-2016,Jon Lilie,AUPH,19:47:31,Aurinia Pharmaceuticals Inc.,64199413,768861,jonlilie
3,06-10-2016,101camshaft,AUPH,19:47:27,Aurinia Pharmaceuticals Inc.,64199398,834192,101camshaft
4,06-10-2016,Silkysmooth,AUPH,19:47:22,Aurinia Pharmaceuticals Inc.,64199383,841235,silkysmooth


In [122]:
#Summary counts: rows collected, distinct users, distinct symbols seen, and reference-table size.
{
    "df_size": len(df.index),
    "users": len(set(df["user_id"])),
    "df_symbols": len(set(df["symbol"])),
    "all_symbols": len(symbol_title),
}

{'all_symbols': 11718, 'df_size': 7780, 'df_symbols': 988, 'users': 1455}

In [127]:
network = nx.Graph()

#create a dictionary of {tweet_id : [symbols]} and, alongside it, the date of
#the first row of each tweet so edges do not need a full-frame scan per lookup
dict1 = {}
tweet_dates = {}
rdf = df.groupby("tweet_id")
for tweet_id in set(df.tweet_id.values):
    tweet_rows = rdf.get_group(tweet_id)
    dict1[tweet_id] = list(tweet_rows.symbol.values)
    tweet_dates[tweet_id] = tweet_rows["date"].values[0]

#Create the co-mention graph for use in Gephi: two symbols are linked when the
#same tweet mentions both. The first tweet seen for a pair supplies the edge's
#key/date; later tweets for the same pair are ignored, so every weight stays 1.
for key, value in dict1.items():
    for source, target in itertools.combinations(value, 2):
        if source == target or network.has_edge(source, target):
            continue
        #add_edge creates any missing endpoint nodes itself
        network.add_edge(source, target, weight=1, key=str(key), date=tweet_dates[key])

In [128]:
#Add node attributes: each symbol node gets the title of its first row in df
nodes  = []
titles = {}
for node in network.nodes():
    title = df["title"][df.symbol==node].values[0]
    titles[node] = title
    nodes.append({"name": node, "title": title})

#Keyword arguments work with both networkx 1.x (G, name, values) and
#2.x+ (G, values, name); the positional 1.x order is wrong on 2.x.
nx.set_node_attributes(network, name="Title", values=titles)
nx.write_gexf(network, "symbol_graph.gexf")

In [129]:
#save the unweighted graph for use in R
#Links refer to nodes by their position in node_list; `nodes` is the list of
#{"name", "title"} records built in the node-attribute cell.
data = []
node_list = list(network.nodes())
#Position lookup built once instead of a linear node_list.index() per edge endpoint
node_position = {node: position for position, node in enumerate(node_list)}

for (source, target) in network.edges():
    edge_attrs = network[source][target]
    data.append({"value": 1.0,
                 "id": edge_attrs["key"],
                 "date": edge_attrs["date"],
                 "source": node_position[source],
                 "target": node_position[target]
                })

links = pd.DataFrame(data)
nodes = pd.DataFrame(nodes)
links.to_csv("Rlinks.csv", index=False, encoding="utf-8")
nodes.to_csv("Rnodes.csv", index=False, encoding="utf-8")