# Streaming Data

In [7]:
import requests
import numpy as np
import pandas as pd
import gcsfs

In [25]:
def to_readable_datetime(x):
    from datetime import datetime
    return datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')

In [15]:
df = pd.read_csv("gcs://wagon-data-750-btc-sent-fc/raw_data/features_2016.csv",index_col = 0, parse_dates = True)

In [43]:
df.head()

Unnamed: 0,n-transactions-per-block,difficulty,utxo-count,mvrv,nvt,avg-block-size,n-transactions-excluding-popular,n-unique-addresses,median-confirmation-time,miners-revenue,...,S&P U.S. TIPS 7-10 Year Index (USD),S&P U.S. Treasury Bond 3-5 Year Index,S&P U.S. TIPS 3-5 Year Index (USD),S&P U.S. TIPS 1-3 Year Index (USD),S&P U.S. TIPS 5-7 Year Index (USD),S&P U.S. Treasury Bill 6-9 Month Index,tweets_sent,reddit_crypto_sent,reddit_econ_sent,volume_gross
2016-06-30,1546.4,209453200000.0,39737471.0,1.835217,7.901799,0.79751,205005.0,363625.0,8.344444,2633621.0,...,230.23,447.49,183.1,136.77,210.16,230.05,0.0,0.175696,-0.046658,10155650.0
2016-07-01,1414.325108,209453200000.0,39764401.0,1.896975,7.678641,0.736557,202414.0,356294.0,8.166667,2669238.0,...,231.32,447.52,183.45,136.87,210.81,230.06,-0.283333,0.102044,0.078542,9049903.0
2016-07-02,1282.250216,209488000000.0,39792758.0,1.890508,7.786408,0.675603,205482.0,348963.0,7.988889,2704855.0,...,231.6225,447.8175,183.5,136.885,210.94,230.06,0.059091,-0.0058,-0.034312,6774267.0
2016-07-03,1150.175325,209522800000.0,39821115.0,1.884042,7.894175,0.614649,208550.0,360712.666667,7.811111,2740471.0,...,231.925,448.115,183.55,136.9,211.07,230.06,0.0,0.015013,0.02094,8413963.0
2016-07-04,1277.952597,209557700000.0,39849472.0,1.877575,8.001942,0.675615,211618.0,372462.333333,7.633333,2625866.0,...,232.2275,448.4125,183.6,136.915,211.2,230.06,0.0,0.040985,0.0472,5634703.0


# Reddit Streaming

In [8]:
file_name = "../keys.json"
with open(file_name, "r") as key_file:
    keys = json.load(key_file)

In [9]:
# note that CLIENT_ID refers to 'personal use script' and SECRET_TOKEN to 'token'
auth = requests.auth.HTTPBasicAuth(keys['REDDIT_SCRIPT'], keys["REDDIT_TOKEN"])

# here we pass our login method (password), username, and password
data = {'grant_type': 'password',
        'username': keys['REDDIT_USERNAME'],
        'password': keys['REDDIT_PASSWORD']}

# setup our header info, which gives reddit a brief description of our app
headers = {'User-Agent': 'MyBot/0.0.1'}

# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

# convert response to JSON and pull access_token value
TOKEN = res.json()['access_token']

# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [None]:
crypto_url = "https://oauth.reddit.com/r/CryptoCurrency/top/?sort=top&t=day"

In [11]:
crypto_req = requests.get(crypto_url,
                   headers=headers)

In [37]:
reddit_crypto_df = pd.DataFrame()
for post in res.json()['data']['children']:
    reddit_crypto_df = reddit_crypto_df.append({
        'date': post["data"]["created_utc"],
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio': post['data']['upvote_ratio'],
        'ups': post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score']
    }, ignore_index=True)

In [38]:
reddit_crypto_df["date"] = reddit_crypto_df["date"].map(to_readable_datetime)

In [32]:
econ_url = "https://oauth.reddit.com/search?q=economy+OR+recession+OR+inflation+OR+shutdown+OR+infrastructure+OR+market+OR+retirement&restrict_sr=&sort=top&t=day"

In [34]:
res = requests.get(econ_url,
                   headers=headers)

reddit_df = pd.DataFrame()
for post in res.json()['data']['children']:
    reddit_df = reddit_df.append({
        'date': post["data"]["created_utc"],
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio': post['data']['upvote_ratio'],
        'ups': post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score']
    }, ignore_index=True)
reddit_econ_df["date"] = reddit_econ_df["date"]

In [35]:
reddit_df

Unnamed: 0,date,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,1638185000.0,CryptoCurrency,Congratulations on surviving the Omicron bear ...,Can I please get a big round of applause for a...,0.83,6722.0,0.0,6722.0
1,1638186000.0,TrueOffMyChest,I oversold my disabilities to the government s...,As the title really.\n\nI've got fairly comple...,0.95,6599.0,0.0,6599.0
2,1638193000.0,Superstonk,The Criand Connection and Credit Linked Notes,## Preface\n\nOver the past year I’ve spent co...,0.97,5821.0,0.0,5821.0
3,1638212000.0,Superstonk,What if a liquidation did happened today,I have a (hypothesis) that a margin call did h...,0.96,5382.0,0.0,5382.0
4,1638211000.0,CryptoCurrency,BTC is almost up 10% in last 24 hours. If you ...,BTC crashed -55% in May 2021 still made All ti...,0.83,3659.0,0.0,3659.0
5,1638198000.0,amcstock,I’m increasingly building hate for the stock m...,"We crashed the NFT website, largest public pen...",0.94,3262.0,0.0,3262.0
6,1638195000.0,Superstonk,Jerkin it with Gherkinit S12E7 Deferred Settle...,Good Morning Apes!\n\nAnother possible settlem...,0.9,3125.0,0.0,3125.0
7,1638198000.0,CryptoMoonShots,🔥 Wanamoon BSC Token | Cyber Monday Super Deal...,**WELCOME TO A NEW BSC MOONSHOT EXPERIENCE… ...,0.94,3020.0,0.0,3020.0
8,1638192000.0,Superstonk,🛑 Highest inflation rate in Germany since 1992...,,0.98,2544.0,0.0,2544.0
9,1638243000.0,wallstreetbets,"OK, RECESSION CANCELLED",,0.98,2271.0,0.0,2271.0


In [57]:
econ_bert = pd.read_csv("gs://wagon-data-750-btc-sent-fc/sent_processed/econ_bert.csv")[["date","positive","negative","neutral"]]

In [42]:
crypto_bert["compound"]

Unnamed: 0,date,positive,negative,neutral
0,2015-11-15,0.182860,0.196688,0.620452
1,2015-11-16,0.171378,0.128283,0.700339
2,2015-11-17,0.143792,0.076343,0.779864
3,2015-11-18,0.151705,0.070495,0.777800
4,2015-11-19,0.125419,0.112176,0.762404
...,...,...,...,...
2178,2021-11-18,0.145112,0.081628,0.773259
2179,2021-11-19,0.085474,0.135326,0.779201
2180,2021-11-20,0.085734,0.178020,0.736247
2181,2021-11-21,0.203692,0.095910,0.700399


In [58]:
econ_bert["compound"] = (econ_bert.positive - econ_bert.negative)/(econ_bert.positive + econ_bert.negative)

In [63]:
econ_bert.columns=["econ_pos", "econ_neg", "econ_neu", "econ_compound"]

In [60]:
econ_bert.set_index("date", inplace = True)

In [76]:
econ_bert.index = pd.to_datetime(econ_bert.index)

In [77]:
crypto_bert.index = pd.to_datetime(crypto_bert.index)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1972 entries, 2016-06-30 to 2021-11-22
Data columns (total 61 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   n-transactions-per-block                                   1972 non-null   float64
 1   difficulty                                                 1972 non-null   float64
 2   utxo-count                                                 1972 non-null   float64
 3   mvrv                                                       1972 non-null   float64
 4   nvt                                                        1972 non-null   float64
 5   avg-block-size                                             1972 non-null   float64
 6   n-transactions-excluding-popular                           1972 non-null   float64
 7   n-unique-addresses                                         1972 non-null   flo

In [82]:
df_merge_one = pd.merge(df, crypto_bert[["crypto_pos", "crypto_neg", "crypto_neu"]], how="left", left_index = True, right_index = True)

In [83]:
df_merge_two = pd.merge(df_merge_one, econ_bert[["econ_pos", "econ_neg", "econ_neu"]], how="left", left_index = True, right_index = True)

In [84]:
df_merge_two

Unnamed: 0,n-transactions-per-block,difficulty,utxo-count,mvrv,nvt,avg-block-size,n-transactions-excluding-popular,n-unique-addresses,median-confirmation-time,miners-revenue,...,tweets_sent,reddit_crypto_sent,reddit_econ_sent,volume_gross,crypto_pos,crypto_neg,crypto_neu,econ_pos,econ_neg,econ_neu
2016-06-30,1546.400000,2.094532e+11,3.973747e+07,1.835217,7.901799,0.797510,205005.000000,363625.000000,8.344444,2.633621e+06,...,0.000000,0.175696,-0.046658,1.015565e+07,0.167653,0.083368,0.748979,0.051655,0.204414,0.743931
2016-07-01,1414.325108,2.094532e+11,3.976440e+07,1.896975,7.678641,0.736557,202414.000000,356294.000000,8.166667,2.669238e+06,...,-0.283333,0.102044,0.078542,9.049903e+06,0.086713,0.103169,0.810118,0.140822,0.150504,0.708674
2016-07-02,1282.250216,2.094880e+11,3.979276e+07,1.890508,7.786408,0.675603,205482.000000,348963.000000,7.988889,2.704855e+06,...,0.059091,-0.005800,-0.034312,6.774267e+06,0.094833,0.084204,0.820962,0.085085,0.214815,0.700100
2016-07-03,1150.175325,2.095228e+11,3.982112e+07,1.884042,7.894175,0.614649,208550.000000,360712.666667,7.811111,2.740471e+06,...,0.000000,0.015013,0.020940,8.413963e+06,0.126766,0.109568,0.763666,0.064866,0.215263,0.719870
2016-07-04,1277.952597,2.095577e+11,3.984947e+07,1.877575,8.001942,0.675615,211618.000000,372462.333333,7.633333,2.625866e+06,...,0.000000,0.040985,0.047200,5.634703e+06,0.074235,0.126823,0.798941,0.060054,0.168777,0.771169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-18,2054.611695,2.267415e+13,7.682603e+07,2.437422,2.814664,1.270386,275592.333333,725020.000000,7.297222,5.265043e+07,...,-0.191667,0.129760,0.164820,3.137042e+09,0.145112,0.081628,0.773259,0.130400,0.148810,0.720790
2021-11-19,2078.240876,2.267415e+13,7.692242e+07,2.423386,3.531133,1.275837,262180.666667,678741.666667,7.027778,4.966713e+07,...,0.155357,0.058168,0.069516,2.119749e+09,0.085474,0.135326,0.779201,0.125548,0.184152,0.690301
2021-11-20,2078.240876,2.267415e+13,7.701882e+07,2.409351,4.247603,1.275837,248769.000000,632463.333333,6.758333,4.966713e+07,...,0.466667,0.083604,0.014180,8.896024e+08,0.085734,0.178020,0.736247,0.127368,0.070532,0.802100
2021-11-21,2078.240876,2.267415e+13,7.711521e+07,2.409351,4.247603,1.275837,248769.000000,586185.000000,6.758333,4.966713e+07,...,-0.022500,0.173188,0.206028,7.894335e+08,0.203692,0.095910,0.700399,0.195318,0.055419,0.749263


In [85]:
crypto_bert.to_csv("gs://wagon-data-750-btc-sent-fc/sent_processed/crypto_bert.csv")

In [86]:
econ_bert.to_csv("gs://wagon-data-750-btc-sent-fc/sent_processed/econ_bert.csv")

In [87]:
df_merge_two.to_csv("gs://wagon-750-btc-sent-fc/input_data/input_data_1.csv")

In [28]:
economy_keywords = ["economy",
                    "recession",
                    "inflation",
                    "shutdown",
                    "infrastructure",
                    "market",
                    "retirement"]

In [30]:
econ_string = "+OR+".join(economy_keywords)

In [31]:
econ_string

'economy+OR+recession+OR+inflation+OR+living cost+OR+the fed+OR+shutdown+OR+infrastructure+OR+market+OR+retirement'