# Streaming Data

In [1]:
%load_ext autoreload

In [2]:
import datetime as dt
import json

import gcsfs
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from google.cloud import storage

In [3]:
def to_readable_datetime(x):
    """Format a Unix timestamp as a ``'YYYY-MM-DD HH:MM:SS'`` string in UTC.

    Parameters
    ----------
    x : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The UTC date and time; any fractional second is not part of the
        output format.
    """
    from datetime import datetime, timezone

    # `datetime.utcfromtimestamp` is deprecated since Python 3.12; the
    # timezone-aware call produces the same UTC calendar fields.
    return datetime.fromtimestamp(x, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
df = pd.read_csv("gcs://wagon-data-750-btc-sent-fc/raw_data/features_2016.csv",
                 index_col=0,
                 parse_dates=True)

In [None]:
df.columns

# Reddit Streaming

In [None]:
# API credentials live in a JSON file outside the notebook.
file_name = "../keys.json"
# JSON text is UTF-8; be explicit instead of relying on the locale default.
with open(file_name, "r", encoding="utf-8") as key_file:
    keys = json.load(key_file)

In [None]:
# note that CLIENT_ID refers to 'personal use script' and SECRET_TOKEN to 'token'
auth = requests.auth.HTTPBasicAuth(keys['REDDIT_SCRIPT'], keys["REDDIT_TOKEN"])

# here we pass our login method (password), username, and password
data = {
    'grant_type': 'password',
    'username': keys['REDDIT_USERNAME'],
    'password': keys['REDDIT_PASSWORD']
}

# setup our header info, which gives reddit a brief description of our app
headers = {'User-Agent': 'MyBot/0.0.1'}

# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth,
                    data=data,
                    headers=headers)

# convert response to JSON and pull access_token value
TOKEN = res.json()['access_token']

# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

In [None]:
crypto_url = "https://oauth.reddit.com/r/CryptoCurrency/top/?sort=top&t=day"

In [None]:
crypto_req = requests.get(crypto_url, headers=headers)

In [None]:
reddit_crypto_df = pd.DataFrame()
for post in res.json()['data']['children']:
    reddit_crypto_df = reddit_crypto_df.append(
        {
            'date': post["data"]["created_utc"],
            'subreddit': post['data']['subreddit'],
            'title': post['data']['title'],
            'selftext': post['data']['selftext'],
            'upvote_ratio': post['data']['upvote_ratio'],
            'ups': post['data']['ups'],
            'downs': post['data']['downs'],
            'score': post['data']['score']
        },
        ignore_index=True)

In [None]:
reddit_crypto_df["date"] = reddit_crypto_df["date"].map(to_readable_datetime)

In [None]:
econ_url = "https://oauth.reddit.com/search?q=economy+OR+recession+OR+inflation+OR+shutdown+OR+infrastructure+OR+market+OR+retirement&restrict_sr=&sort=top&t=day"

In [None]:
res = requests.get(econ_url, headers=headers)

reddit_df = pd.DataFrame()
for post in res.json()['data']['children']:
    reddit_df = reddit_df.append(
        {
            'date': post["data"]["created_utc"],
            'subreddit': post['data']['subreddit'],
            'title': post['data']['title'],
            'selftext': post['data']['selftext'],
            'upvote_ratio': post['data']['upvote_ratio'],
            'ups': post['data']['ups'],
            'downs': post['data']['downs'],
            'score': post['data']['score']
        },
        ignore_index=True)
reddit_econ_df["date"] = reddit_econ_df["date"]

In [None]:
reddit_df

In [None]:
econ_bert = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/sent_processed/econ_bert.csv")[[
        "date", "positive", "negative", "neutral"
    ]]

In [None]:
crypto_bert["compound"]

In [None]:
econ_bert["compound"] = (econ_bert.positive - econ_bert.negative) / (
    econ_bert.positive + econ_bert.negative)

In [None]:
econ_bert.columns = ["econ_pos", "econ_neg", "econ_neu", "econ_compound"]

In [None]:
econ_bert.set_index("date", inplace=True)

In [None]:
econ_bert.index = pd.to_datetime(econ_bert.index)

In [None]:
crypto_bert.index = pd.to_datetime(crypto_bert.index)

In [None]:
df_merge_two.columns

In [None]:
df_merge_one = pd.merge(df,
                        crypto_bert[["crypto_pos", "crypto_neg",
                                     "crypto_neu"]],
                        how="left",
                        left_index=True,
                        right_index=True)

In [None]:
df_merge_two = pd.merge(df_merge_one,
                        econ_bert[["econ_pos", "econ_neg", "econ_neu"]],
                        how="left",
                        left_index=True,
                        right_index=True)

In [None]:
df_merge_two

In [None]:
# Write the crypto sentiment frame to the processed-sentiment folder on GCS.
crypto_bert.to_csv(
    "gs://wagon-data-750-btc-sent-fc/sent_processed/crypto_bert.csv")

In [None]:
# Write the economy sentiment frame to the same folder (this overwrites the
# file that was read to build `econ_bert`).
econ_bert.to_csv(
    "gs://wagon-data-750-btc-sent-fc/sent_processed/econ_bert.csv")

In [None]:
# NOTE(review): this targets bucket `wagon-750-btc-sent-fc`, whereas
# `input_data/input_data_1.csv` is later read back from
# `wagon-data-750-btc-sent-fc` - confirm which bucket is intended.
df_merge_two.to_csv("gs://wagon-750-btc-sent-fc/input_data/input_data_1.csv")

In [None]:
# Search terms for the economy-related Reddit query, one per line.
economy_keywords = [
    "economy",
    "recession",
    "inflation",
    "shutdown",
    "infrastructure",
    "market",
    "retirement",
]

In [None]:
# Join the keywords with "+OR+", the separator used in the `q=` parameter of
# the Reddit search URL.
econ_string = "+OR+".join(economy_keywords)

In [None]:
econ_string

In [None]:
# Re-load the processed economy sentiment from GCS, this time with the first
# CSV column as a date-parsed index.
econ_bert = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/sent_processed/econ_bert.csv",
    index_col=0,
    parse_dates=True)

# Twitter collating

In [None]:
storage_client = storage.Client()

# Note: Client.list_blobs requires at least package version 1.17.0.
blobs = storage_client.list_blobs(
    "wagon-data-750-btc-sent-fc",
    prefix="sent_data/tweet_inflation",
)
# Collect the object names in ascending (lexicographic) order.
blob_list = sorted(blob.name for blob in blobs)

In [None]:
inflation_bert = pd.DataFrame()
for blob in blob_list:
    temp_df = pd.read_csv(f"gcs://wagon-data-750-btc-sent-fc/{blob}",
                          index_col=0,
                          parse_dates=True)
    inflation_bert = inflation_bert.append(temp_df)

In [None]:
inflation_bert["date"] = pd.to_datetime(inflation_bert["date"]).dt.date

In [None]:
inflation_grouped = inflation_bert.groupby("date").mean()

In [None]:
inflation_grouped.columns = ["inflation_pos", "inflation_neg", "inflation_neu"]

In [None]:
df_merge_three = pd.merge(df_merge_two,
                          inflation_grouped,
                          how="left",
                          left_index=True,
                          right_index=True)

In [None]:
df_merge_three.to_csv("gs://wagon-750-btc-sent-fc/input_data/input_data_2.csv")

# Sentiment DF for Website

In [None]:
sent_df = df_merge_three[[
    "reddit_econ_sent", "reddit_crypto_sent", "tweets_sent"
]]

In [None]:
inflation_bert["compound"] = (inflation_bert.positive - inflation_bert.negative
                              ) / (inflation_bert.positive +
                                   inflation_bert.negative)

In [None]:
inflation_grouped = inflation_bert.groupby("date").mean()

In [None]:
inflation_grouped.columns = [
    "inflation_pos", "inflation_neg", "inflation_neu", "inflation_compound"
]

In [None]:
inflation_grouped.to_csv(
    "gs://wagon-data-750-btc-sent-fc/sent_processed/inflation_bert.csv")

In [None]:
sent_df_1 = sent_df.merge(inflation_grouped["inflation_compound"],
                          left_index=True,
                          right_index=True).merge(
                              econ_bert[["econ_compound"]],
                              left_index=True,
                              right_index=True).merge(
                                  crypto_bert[["crypto_compound"]],
                                  left_index=True,
                                  right_index=True)

In [None]:
sent_df_1.columns

In [None]:
sent_df_1.drop("reddit_econ_sent", axis=1, inplace=True)
sent_df_1.drop("reddit_crypto_sent", axis=1, inplace=True)

In [None]:
sent_df_1["econ_compound"] = (sent_df_1["tweets_sent"] +
                              sent_df_1["econ_compound"]) / 2

In [None]:
sent_df_1.drop("tweets_sent", axis=1, inplace=True)

In [None]:
sent_df_1.to_csv(
    "gcs://wagon-data-750-btc-sent-fc/website_data/sent_data_1.csv")

In [None]:
sent_df_1.merge(
    df_merge_two["volume_gross"], left_index=True, right_index=True).to_csv(
        "gcs://wagon-data-750-btc-sent-fc/website_data/chart_data_1.csv")

# Preproc Pipeline

In [4]:
from Main_package.RNN_model.data import clean_features, clean_test_features

2021-12-01 16:14:36.385608: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 16:14:36.385872: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
# Load the model input features from GCS with a date-parsed index taken from
# the first CSV column.
big_df = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/input_data/features_2016.csv",
    index_col=0,
    parse_dates=True)

In [None]:
# Run the project's `clean_features` on the full table. What it returns is
# not visible from this notebook - NOTE(review): confirm that `pipeline` is an
# appropriate name for the result.
pipeline = clean_features(big_df)

In [None]:
# NOTE(review): `columns` is not defined in any visible cell; this only works
# if the name is left over in the kernel from earlier experimentation.
type(columns)

In [None]:
from sklearn.pipeline import Pipeline


# Placeholder prediction array

In [None]:
url = "https://cloudsentiment-gijujv7fiq-ew.a.run.app/predict"

In [None]:
predict_placeholder = df_merge_three[["volume_gross"]].iloc[-30:]
predict_placeholder.columns = ["volume_actual"]

In [None]:
big_df = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/input_data/features_2016.csv",
    index_col=0,
    parse_dates=True)

In [None]:
for i in range(1, 31):
    big_df.iloc[-(89 + i):-i].to_csv(
        f"gcs://wagon-data-750-btc-sent-fc/input_data/fake_{i}")

In [None]:
# Open the serialized model straight from GCS and deserialize it.
fs = gcsfs.GCSFileSystem()
# NOTE: joblib.load unpickles its input, which can execute arbitrary code -
# only load artifacts from a bucket you trust.
with fs.open('wagon-data-750-btc-sent-fc/model/model_RNN_8.joblib') as f:
    model = joblib.load(f)
# Project-local preprocessing used by `predict` below.
from Main_package.RNN_model.data import clean_features

def predict(file_name="test_2021_11_22.csv", shape=(1, 89, 61)):
    """Run the module-level ``model`` on one input CSV and return a scalar.

    file_name is a string - the object name under
    ``gcs://wagon-data-750-btc-sent-fc/input_data/``
    shape is a 3 part tuple with the input dimensions of the model

    The CSV is passed through ``clean_features`` and placed in the first slot
    of a zero-filled batch of the given shape. The return value is ``np.exp``
    of element ``[0][0]`` of ``model.predict_on_batch``'s output.
    """
    batch = np.zeros(shape)
    features = pd.read_csv(
        f"gcs://wagon-data-750-btc-sent-fc/input_data/{file_name}",
        parse_dates=True,
        index_col=0,
    )
    # Only the first batch entry is filled; any others stay zero.
    batch[0] = clean_features(features)
    first_row = model.predict_on_batch(batch)[0]
    return np.exp(first_row[0])

In [None]:
predictions = []
for i in range(1, 31):
    predictions.append(predict(file_name=f"fake_{i}"))

In [None]:
predict_placeholder["volume_pred"] = predictions

In [None]:
predict_placeholder.loc[dt.datetime(2021,11,23)] = real_volume.loc[dt.datetime(2021,11,22)]["volume_gross"]

In [None]:
predict_placeholder.drop(dt.datetime(2021,10,24), inplace = True)

In [None]:
predict_placeholder.to_csv("gcs://wagon-data-750-btc-sent-fc/website_data/pred_temp.csv")

In [None]:
pred = (predict_placeholder["volume_pred"] - np.mean(predict_placeholder["volume_pred"]))/np.std(predict_placeholder["volume_pred"])

In [None]:
actual = (predict_placeholder["volume_actual"] - np.mean(predict_placeholder["volume_actual"]))/np.std(predict_placeholder["volume_actual"])

In [None]:
plt.plot(predict_placeholder["volume_actual"])
plt.plot(predict_placeholder["volume_pred"])
plt.show()

## Getting actual value for the 23rd

In [None]:
real_volume = pd.read_csv("../raw_data/coinbase_2021_11.csv")

In [None]:
real_volume.set_index("time", inplace = True)

In [None]:
real_volume.index = pd.to_datetime(real_volume.index)

In [None]:
real_volume.loc[dt.datetime(2021,11,22)]["volume_gross"]

In [None]:
temp = pd.read_csv("gcs://wagon-data-750-btc-sent-fc/input_data/input_data_1.csv", index_col = 0, parse_dates = True)

In [None]:
temp["volume_gross"][-1] = real_volume.loc[dt.datetime(2021,11,22)]["volume_gross"]

In [None]:
temp.to_csv("gcs://wagon-data-750-btc-sent-fc/input_data/input_data_1.csv")

# Text-box stuff

In [None]:
tweet_df = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/tweet_data/inflation_2021-11-22T00:00:00.000Z"
)

In [None]:
inflation_tweet_text = tweet_df["clean_tweet"]

In [None]:
inflation_tweet_text[2]

In [None]:
len(inflation_tweet_text)

In [None]:
crypto_reddit_df = pd.read_csv(
    "gs://wagon-data-750-btc-sent-fc/raw_data/crypto_reddit.csv")
crypto_reddit_df["date"] = pd.to_datetime(crypto_reddit_df["date"])

In [None]:
crypto_reddit_df[crypto_reddit_df["date"] > dt.datetime(2021, 11, 22)]["title"]

In [None]:
crypto_reddit_text = crypto_reddit_df[
    crypto_reddit_df["date"] > dt.datetime(2021, 11, 22)]["title"]
len(crypto_reddit_text)

In [None]:
econ_reddit_df = pd.read_csv(
    "gcs://wagon-data-750-btc-sent-fc/raw_data/reddit_econ_prelim.csv")

In [None]:
econ_reddit_df["date"] = econ_reddit_df["date"].map(to_readable_datetime)

In [None]:
econ_reddit_df["date"] = pd.to_datetime(econ_reddit_df["date"])

In [None]:
econ_reddit_text = econ_reddit_df[
    econ_reddit_df["date"] > dt.datetime(2021, 11, 22)]["title"]

In [None]:
len(econ_reddit_text)

In [None]:
econ_reddit_text[1]

In [None]:
text_string = " ".join(i for i in econ_reddit_text)

In [None]:
len(text_string)

In [None]:
text_string_1 = " ".join(i for i in crypto_reddit_text)

In [None]:
len(text_string_1)

In [None]:
text_string_3 = " ".join(i for i in inflation_tweet_text)

In [None]:
len(text_string_3)

In [None]:
text_string_4 = text_string_1 + text_string_3 + text_string

In [None]:
len(text_string_4)

In [None]:
text_string_4

In [None]:
text_list = text_string_4.split()

In [None]:
text_list

In [None]:
60 * 163

In [None]:
out_string = ", ".join(text_list)

In [None]:
out_string[3]

In [None]:
# Write the comma-separated word string to the website data folder on GCS
# (text mode).
fs = gcsfs.GCSFileSystem()
with fs.open(
        'wagon-data-750-btc-sent-fc/website_data/word_text_2021_11_22.txt',
        "w") as f:
    f.write(out_string)

In [None]:
from collections import Counter

In [None]:
# (word, count) pairs, most frequent first.
l_sorted = Counter(text_list).most_common()

In [None]:
l_sorted

In [None]:
# Read the file back as bytes and decode it to check the round trip.
with fs.open('wagon-data-750-btc-sent-fc/website_data/word_text_2021_11_22.txt', "rb") as f:
    in_string = f.read().decode()

In [None]:
# NOTE(review): `string` is not used in any visible cell.
import string

In [None]:
# Fourth character of the string that was read back.
in_string[3]

# BTC DATA

In [None]:
# Names of the Bitcoin metrics of interest, one per line.
bitcoin_list = [
    'n-transactions-per-block',
    'difficulty',
    'utxo-count',
    'mvrv',
    'nvt',
    'avg-block-size',
    'n-transactions-excluding-popular',
    'n-unique-addresses',
    'median-confirmation-time',
    'miners-revenue',
    'mempool-growth',
    'mempool-size',
    'blocks-size',
    'hash-rate',
    'n-transactions-total',
    'avg-confirmation-time',
    'nvts',
    'transaction-fees-usd',
    'active_account',
]

In [None]:
bitcoin_list

## Glassnode?

In [None]:
# Base URL for Glassnode metrics; not used by any visible cell yet.
glassnode_url = "https://api.glassnode.com/v2/metrics"

In [None]:
# Metric path fragment; NOTE(review): presumably meant to be appended to
# `glassnode_url` - nothing visible does so yet.
active_account = "addresses/active_count"

In [None]:
import tensorflow as tf

In [None]:
loaded_model = joblib.load(
    tf.io.gfile.GFile(
        "gs://wagon-data-750-btc-sent-fc/model/finbert_token.joblib", "rb"))

In [None]:
loaded_model

In [None]:
fs = gcsfs.GCSFileSystem()
with fs.open('wagon-data-750-btc-sent-fc/model/finbert_token.joblib') as f:
    model = joblib.load(f)

In [None]:
model

In [None]:
pd.read_csv("gs://wagon-data-750-btc-sent-fc/input_data/test_2021_11_22.csv")