# CryptoProphet
## Notebook's Goal
> Clean the tweet text, then extract text embeddings from the cleaned tweets using a BERT Transformer.

In [1]:
# import custom packages
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.text import clean_tweet

# import official packages
from sentence_transformers import SentenceTransformer
# `tqdm._tqdm_notebook` is a deprecated private module that warns on import
# ("Please use `tqdm.notebook.*` instead"); `tqdm.notebook` is the public
# location and exposes the same `tqdm_notebook` name.
from tqdm.notebook import tqdm_notebook
import pandas as pd
import swifter  # not referenced in the cells shown here; kept in case its import side effects are relied upon

# registers `progress_apply` on pandas objects so long-running applies show a progress bar
tqdm_notebook.pandas()

# inits BERT model (pre-trained sentence-embedding model, selected by name)
model = SentenceTransformer('stsb-mpnet-base-v2')

# loads tweets dataset (JSON keyed by row index, hence orient='index')
df_path = LOCAL_PROCESSED_DATA_PATH / 'tweets_with_mentioned_coins_20211013.json'
df = pd.read_json(df_path, orient='index')
print(df.shape)
# transposed single row: one line per column, easier to scan for a wide frame
df.head(1).T

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  import sys


(127642, 37)


Unnamed: 0,0
created_at,2021-10-13 14:00:11
id,1448287453043953668
id_str,1448287453043953664
full_text,@DocumentingBTC This is good for #bitcoin
truncated,False
display_text_range,"[16, 41]"
entities,"{'hashtags': [{'text': 'bitcoin', 'indices': [..."
source,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
in_reply_to_status_id,1.44828e+18
in_reply_to_status_id_str,1.44828e+18


In [118]:
# runs `clean_tweet` over every tweet, with a progress bar
# (BERT expects text that looks like plain written English)
df['full_text_BERT_cleaned'] = df['full_text'].progress_apply(clean_tweet)

  0%|          | 0/127642 [00:00<?, ?it/s]

In [120]:
# side-by-side view of the cleaned text and the original tweet text
df.loc[:, ['full_text_BERT_cleaned', 'full_text']]

Unnamed: 0,full_text_BERT_cleaned,full_text
0,DocumentingBTC This is good for bitcoin,@DocumentingBTC This is good for #bitcoin
1,Reshare DocumentingBTC: Data from Cambridge,RT @DocumentingBTC: Data from Cambridge \n\nht...
2,"APompliano Must listen, for eveybody!","@APompliano Must listen, for eveybody!"
3,Reshare APompliano: Yesterday we were erroneou...,RT @APompliano: Yesterday we were erroneously ...
4,Reshare PrestonPysh: I'm talking with 100trill...,RT @PrestonPysh: I'm talking with @100trillion...
...,...,...
127637,"police car light police car light 600 BTC (28,...","🚨 🚨 600 #BTC (28,310,853 USD) transferred fro..."
127638,"police car light police car light 20,450,358 U...","🚨 🚨 20,450,358 #USDT (20,450,358 USD) transfe..."
127639,"20,000,000 XLM (7,039,863 USD) transferred fro...","20,000,000 #XLM (7,039,863 USD) transferred fr..."
127640,"police car light police car light 25,640,000 X...","🚨 🚨 25,640,000 #XRP (29,004,583 USD) transfer..."


In [121]:
# extract text embeddings with BERT
# `model.encode` accepts a list of sentences and batches them internally, so the
# whole column is encoded in a single call instead of one model call per row.
# NOTE(review): batched inference can differ from per-row inference by
# floating-point noise — confirm that is acceptable downstream.
sentences = df['full_text_BERT_cleaned'].tolist()
embeddings = model.encode(sentences, show_progress_bar=True)

# one embedding vector per row, stored in an object column; the explicit index
# keeps the row alignment with `df` unambiguous
df['full_text_embedding'] = pd.Series(list(embeddings), index=df.index)

  0%|          | 0/127642 [00:00<?, ?it/s]

In [122]:
# floors each tweet timestamp to the start of its hour (join key for the BTC price data)
df['created_at_trunc_h'] = df['created_at'].dt.floor(freq='h')

In [123]:
# persists the dataframe (including the columns added above) as a pickle
# in the processed-data folder
df_path = LOCAL_PROCESSED_DATA_PATH / 'tweets_with_embeddings_20211013.pkl'
df.to_pickle(path=df_path)

# Conclusion
> Text embeddings were successfully extracted using BERT.