In [1]:
import pandas as pd
import numpy as np

In [35]:
# Variable for determining if we should postprocess tweets from scratch.
# Set to false because we already have the postprocessed file available for reading.
postProcessTweets = False

# Variable for determining if we need to compute embeddings by label.
computeEmbeddingsByLabel = False

# Variable for determining if we need to compute cosine similarities.
computeCosineSimilarities = False

In [3]:
cool_cats_tweets_df = pd.read_csv('../twitter/data/preprocessed/coolcats_07-10_2021.csv')  

In [4]:
cool_cats_tweets_df.head(10)

Unnamed: 0,id,author_id,created_at,text
0,1454598113499533314,1452399811219628034,2021-10-30T23:56:30.000Z,"Cool Cat just listed under floor, seller NGMI:..."
1,1454597918191603716,47390321,2021-10-30T23:55:43.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
2,1454596608411283458,1452399811219628034,2021-10-30T23:50:31.000Z,"Cool Cat just listed under floor, seller NGMI:..."
3,1454594614346883074,1371874606374588417,2021-10-30T23:42:35.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
4,1454593025720688649,1435045789756235778,2021-10-30T23:36:17.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
5,1454590584811925507,1422010380499488770,2021-10-30T23:26:35.000Z,CoolCats &amp; Goatz Mashup Halloween: #NewPro...
6,1454590075992477700,610878019,2021-10-30T23:24:33.000Z,RT @EthernalsNFT: How anxious are you to get o...
7,1454589750992687115,1452399811219628034,2021-10-30T23:23:16.000Z,Collections Floor Prices (live update):BAYC: 3...
8,1454588938585210888,1343771344274509824,2021-10-30T23:20:02.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
9,1454588617435791361,1416458460854722561,2021-10-30T23:18:46.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....


In [5]:
# Create clean_text column with cleaned up tweet text: 
# - Lowercasing.
# - Removing whitespace noise.
# - Removing URLs.
# - Removing stopwords.
# - Lemmatization.

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import re

# Clean up a specific tweet.
def getCleanTweet(tweet):
    
    # Lowercasing.
    tweet = tweet.lower()
    
    # Removing extra spaces.
    tweet = " ".join(tweet.split())
    
    # Remove URLs
    tweet = re.sub(r"http\S+", "", tweet)
    
    tokenized_tweet = word_tokenize(tweet)
    
    # Removing stopwords.
    tokenized_tweet = [w for w in tokenized_tweet if not w in set(stopwords.words('english'))]
 
    # Stemming.
    ps = PorterStemmer()
    
    tokenized_tweet = [ps.stem(w) for w in tokenized_tweet]
    
    return " ".join(tokenized_tweet)

# Example
print("Original tweet: " + cool_cats_tweets_df['text'][100])
print("Cleaned tweet: " + getCleanTweet(cool_cats_tweets_df['text'][100]))

Original tweet: Say hello to the new member of our family#nfts #nft #digitalart #art #ctyptoart   #erhereum #nftlink #cryptoartist #blockchain #nftcollector #supducks #modernart #artcollectoe  #boredapeyachtclub #coolcats  #cryptopunk  #rectanglecats #cats #NFTGiveaway #SquidGame https://t.co/JinAlIHL7c
Cleaned tweet: say hello new member famili # nft # nft # digitalart # art # ctyptoart # erhereum # nftlink # cryptoartist # blockchain # nftcollector # supduck # modernart # artcollecto # boredapeyachtclub # coolcat # cryptopunk # rectanglecat # cat # nftgiveaway # squidgam


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oswaldoolivo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oswaldoolivo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Clean all tweets.

if postProcessTweets:
    import time 

    startTime = time.time()

    cleanTweets = []

    for i in range(len(cool_cats_tweets_df['text'])):
        if i % 5000 == 0:
            print("Cleaned " +str(i) + " out of " + str(len(cool_cats_tweets_df['text'])) + " tweets.")
        
        tweet = cool_cats_tweets_df['text'][i]
        cleanTweet = getCleanTweet(tweet)
    
        cleanTweets.append(cleanTweet)
    
    endTime = time.time()

    print("Cleaned tweets in " + str(endTime - startTime) + " seconds")

    cleanTweets[:10]

In [7]:
if postProcessTweets:
    cool_cats_tweets_df['cleaned_text'] = cleanTweets

    cool_cats_tweets_df.head(20)

In [8]:
# Save cleaned tweets to file.
if postProcessTweets:
    cool_cats_tweets_df.to_csv('../twitter/data/postprocessed/coolcats_07-10_2021.csv', index = True)

In [9]:
cool_cats_tweets_df = pd.read_csv('../twitter/data/postprocessed/coolcats_07-10_2021.csv')  

In [10]:
# Set string types for 'text' and 'cleaned_text'
cool_cats_tweets_df['text'] = cool_cats_tweets_df['text'].astype('string')
cool_cats_tweets_df['cleaned_text'] = cool_cats_tweets_df['text'].astype('string')

cool_cats_tweets_df.dtypes

Unnamed: 0       int64
id               int64
author_id        int64
created_at      object
text            string
cleaned_text    string
dtype: object

In [11]:
cool_cats_tweets_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,author_id,created_at,text,cleaned_text
0,0,1454598113499533314,1452399811219628034,2021-10-30T23:56:30.000Z,"Cool Cat just listed under floor, seller NGMI:...","Cool Cat just listed under floor, seller NGMI:..."
1,1,1454597918191603716,47390321,2021-10-30T23:55:43.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
2,2,1454596608411283458,1452399811219628034,2021-10-30T23:50:31.000Z,"Cool Cat just listed under floor, seller NGMI:...","Cool Cat just listed under floor, seller NGMI:..."
3,3,1454594614346883074,1371874606374588417,2021-10-30T23:42:35.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
4,4,1454593025720688649,1435045789756235778,2021-10-30T23:36:17.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
5,5,1454590584811925507,1422010380499488770,2021-10-30T23:26:35.000Z,CoolCats &amp; Goatz Mashup Halloween: #NewPro...,CoolCats &amp; Goatz Mashup Halloween: #NewPro...
6,6,1454590075992477700,610878019,2021-10-30T23:24:33.000Z,RT @EthernalsNFT: How anxious are you to get o...,RT @EthernalsNFT: How anxious are you to get o...
7,7,1454589750992687115,1452399811219628034,2021-10-30T23:23:16.000Z,Collections Floor Prices (live update):BAYC: 3...,Collections Floor Prices (live update):BAYC: 3...
8,8,1454588938585210888,1343771344274509824,2021-10-30T23:20:02.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....
9,9,1454588617435791361,1416458460854722561,2021-10-30T23:18:46.000Z,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....,RT @rex2_16: CoolCats went from: 1.5 ETH to 0....


In [12]:
cool_cats_sale_events_df = pd.read_csv('../opensea_client/sale_events.txt')

In [13]:
cool_cats_sale_events_df.head(10)

Unnamed: 0,token_id,collection,event_type,auction_type,bid_amount,ending_price,created_date,total_price,quantity
0,9932,cool-cats-nft,successful,,,,2021-10-21T19:16:22.725846,6550000000000000000,1
1,9932,cool-cats-nft,successful,,,,2021-10-11T17:16:09.440387,10490000000000000000,1
2,9932,cool-cats-nft,successful,,,,2021-09-23T08:30:18.304058,6000000000000000000,1
3,9932,cool-cats-nft,successful,,,,2021-08-29T01:44:44.449125,5250000000000000000,1
4,9925,cool-cats-nft,successful,,,,2021-08-17T03:46:32.538134,1510000000000000000,1
5,9925,cool-cats-nft,successful,,,,2021-08-13T17:59:33.661011,1660000000000000000,1
6,9925,cool-cats-nft,successful,,,,2021-08-04T23:52:14.987776,1150000000000000000,1
7,9924,cool-cats-nft,successful,,,,2021-08-05T12:07:20.368464,1740000000000000000,1
8,9923,cool-cats-nft,successful,,,,2021-07-17T05:50:57.697687,690000000000000000,1
9,9921,cool-cats-nft,successful,,,,2021-08-22T14:51:26.933741,1649000000000000000,1


In [14]:
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

In [15]:
distilbertTokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbertModel = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [16]:
sentences=cool_cats_tweets_df['text']

In [17]:
distilbertTokenizer.tokenize(sentences[0])

['cool',
 'cat',
 'just',
 'listed',
 'under',
 'floor',
 ',',
 'seller',
 'ng',
 '##mi',
 ':',
 ':',
 '6',
 '.',
 '48',
 '##38',
 ':',
 'https',
 ':',
 '/',
 '/',
 't',
 '.',
 'co',
 '/',
 'x',
 '##ng',
 '##9',
 '##m',
 '##w',
 '##vu',
 '##as',
 '##bu',
 '##ying',
 'the',
 'dip',
 'an',
 '##on',
 '?',
 '#',
 'cool',
 '##cats',
 '#',
 'cool',
 '##cats',
 '##n',
 '##ft',
 '#',
 'opens',
 '##ean',
 '##ft',
 '#',
 'n',
 '##ft',
 '#',
 'opens',
 '##ea']

In [19]:
distilbertInput=distilbertTokenizer.encode_plus(sentences[0],add_special_tokens = True,pad_to_max_length = True,truncation=True)
distilbertInput=distilbertTokenizer.encode_plus(sentences[0],add_special_tokens = True)
distilbertInput



{'input_ids': [101, 4658, 4937, 2074, 3205, 2104, 2723, 1010, 14939, 12835, 4328, 1024, 1024, 1020, 1012, 4466, 22025, 1024, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1060, 3070, 2683, 2213, 2860, 19722, 3022, 8569, 14147, 1996, 16510, 2019, 2239, 1029, 1001, 4658, 19588, 1001, 4658, 19588, 2078, 6199, 1001, 7480, 11219, 6199, 1001, 1050, 6199, 1001, 7480, 5243, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
idInput=np.asarray(distilbertInput['input_ids'])
maskInput=np.asarray(distilbertInput['attention_mask'])

distilbertOutput=distilbertModel([idInput.reshape(1,-1),maskInput.reshape(1,-1)])

type(distilbertOutput),distilbertOutput

(transformers.modeling_tf_outputs.TFBaseModelOutput,
 TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 59, 768), dtype=float32, numpy=
 array([[[-0.11779414, -0.15132035,  0.02275763, ..., -0.04923558,
           0.32646263,  0.5366365 ],
         [-0.1085775 , -0.24667631,  0.66228324, ...,  0.08109096,
           0.43866563,  0.44020706],
         [-0.29714283, -0.31835958,  0.43593767, ...,  0.05396344,
           0.04600083,  0.77221096],
         ...,
         [-0.16104284,  0.2466539 ,  0.68685853, ..., -0.1420151 ,
           0.15295202,  0.2981444 ],
         [ 0.1825399 ,  0.21809575,  0.35958427, ..., -0.06513596,
          -0.21891102, -0.5222688 ],
         [ 0.8622943 ,  0.27708182, -0.10701422, ...,  0.1955083 ,
          -0.46374887, -0.16059066]]], dtype=float32)>, hidden_states=None, attentions=None))

In [21]:
distilbertOutput[0][:,0,:]

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-1.17794141e-01, -1.51320353e-01,  2.27576271e-02,
        -6.12904504e-02, -6.19337708e-03, -2.25467801e-01,
         1.87008709e-01,  3.91300321e-01, -2.64210403e-01,
        -2.36914694e-01, -2.90164769e-01, -1.79498136e-01,
        -4.65925299e-02,  2.84286916e-01,  6.11369126e-03,
         1.11567259e-01, -1.96444124e-01,  4.30956721e-01,
        -1.63892172e-02,  1.03265606e-02,  1.04786173e-01,
        -5.82086325e-01,  5.03869057e-02, -2.77416348e-01,
        -1.11984462e-02, -1.70876756e-01,  1.18852153e-01,
        -1.76714525e-01, -2.01792225e-01,  1.44718289e-01,
        -6.97725341e-02,  2.06656590e-01,  9.76492912e-02,
        -4.28009421e-01,  1.06794544e-01, -1.98422834e-01,
         7.99453855e-02, -1.26169384e-01,  9.21885148e-02,
         2.55157590e-01, -8.72819349e-02,  1.10985130e-01,
         2.69452929e-01, -2.82108877e-03,  1.56285092e-01,
        -1.01892076e-01, -2.75846863e+00, -4.64077741e-02,
      

In [22]:
cool_cats_sale_events_df.head(10)

Unnamed: 0,token_id,collection,event_type,auction_type,bid_amount,ending_price,created_date,total_price,quantity
0,9932,cool-cats-nft,successful,,,,2021-10-21T19:16:22.725846,6550000000000000000,1
1,9932,cool-cats-nft,successful,,,,2021-10-11T17:16:09.440387,10490000000000000000,1
2,9932,cool-cats-nft,successful,,,,2021-09-23T08:30:18.304058,6000000000000000000,1
3,9932,cool-cats-nft,successful,,,,2021-08-29T01:44:44.449125,5250000000000000000,1
4,9925,cool-cats-nft,successful,,,,2021-08-17T03:46:32.538134,1510000000000000000,1
5,9925,cool-cats-nft,successful,,,,2021-08-13T17:59:33.661011,1660000000000000000,1
6,9925,cool-cats-nft,successful,,,,2021-08-04T23:52:14.987776,1150000000000000000,1
7,9924,cool-cats-nft,successful,,,,2021-08-05T12:07:20.368464,1740000000000000000,1
8,9923,cool-cats-nft,successful,,,,2021-07-17T05:50:57.697687,690000000000000000,1
9,9921,cool-cats-nft,successful,,,,2021-08-22T14:51:26.933741,1649000000000000000,1


In [23]:
# Get all pairs of sequential sale events.
def getAllSaleEventPairs():
    from csv import reader
    
    saleEventPairs = []
    lastSalesByIds = dict()
    
    with open('../opensea_client/sale_events.txt', 'r') as sale_events_file:
        csvReader = reader(sale_events_file)
        currentRowNumber = 0
        
        for row in csvReader:
            
            if currentRowNumber == 0:
                currentRowNumber += 1
                continue
            
            # Construct an entry with the current row as the latest sale, and the entry in the 
            # dictionary as the previous sale
            # Id, Start Sale Date, Start Sale Price, End Sale Date, End Sale Price.
            if row[0] in lastSalesByIds:
                lastSale = lastSalesByIds[row[0]]
                saleEventPairs.append([row[0], row[6], float(row[7]), lastSale[6], float(lastSale[7])])
                
            lastSalesByIds[row[0]] = row
                         
            currentRowNumber += 1
                
        return saleEventPairs
            
saleEventPairs = getAllSaleEventPairs()
len(saleEventPairs)

1571

In [24]:
saleEventPairs[:5]

[['9932',
  '2021-10-11T17:16:09.440387',
  1.049e+19,
  '2021-10-21T19:16:22.725846',
  6.55e+18],
 ['9932',
  '2021-09-23T08:30:18.304058',
  6e+18,
  '2021-10-11T17:16:09.440387',
  1.049e+19],
 ['9932',
  '2021-08-29T01:44:44.449125',
  5.25e+18,
  '2021-09-23T08:30:18.304058',
  6e+18],
 ['9925',
  '2021-08-13T17:59:33.661011',
  1.66e+18,
  '2021-08-17T03:46:32.538134',
  1.51e+18],
 ['9925',
  '2021-08-04T23:52:14.987776',
  1.15e+18,
  '2021-08-13T17:59:33.661011',
  1.66e+18]]

In [25]:
# Get all the tweets between a start and end time.
def getTweetsInTimeInterval(tweetsDF, startTime, endTime):
    return tweetsDF[(startTime <= tweetsDF.created_at) & (tweetsDF.created_at <= endTime)]

print(saleEventPairs[0])
getTweetsInTimeInterval(cool_cats_tweets_df, saleEventPairs[0][1], saleEventPairs[0][3])    

['9932', '2021-10-11T17:16:09.440387', 1.049e+19, '2021-10-21T19:16:22.725846', 6.55e+18]


Unnamed: 0.1,Unnamed: 0,id,author_id,created_at,text,cleaned_text
973,973,1449024283166920706,2894816197,2021-10-15T14:48:05.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
974,974,1449024233858797575,1446221459647565824,2021-10-15T14:47:53.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
975,975,1449024171367747587,1435441708641181697,2021-10-15T14:47:38.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
976,976,1449024082771644459,1433619860970430464,2021-10-15T14:47:17.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
977,977,1449024075746193411,1566750793,2021-10-15T14:47:15.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
...,...,...,...,...,...,...
49575,49575,1449026142493609987,1448329451511681031,2021-10-15T14:55:28.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
49576,49576,1449025616158855171,1058626370765574144,2021-10-15T14:53:23.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
49577,49577,1449025541185499138,1316406360762793984,2021-10-15T14:53:05.000Z,RT @NFTNywIRA: Tonight it's been a month since...,RT @NFTNywIRA: Tonight it's been a month since...
49578,49578,1449024948052365367,1294698814683709446,2021-10-15T14:50:43.000Z,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...


In [26]:
# Measure time of computing all tweet intervals.
import time

startTime = time.time()

tweetsBetweenSaleEvents = \
[getTweetsInTimeInterval(cool_cats_tweets_df, saleEvent[1], saleEvent[3])['cleaned_text'] for saleEvent in saleEventPairs]
    
endTime = time.time()

print(endTime - startTime)

11.407142400741577


In [27]:
# Create data frame from sale pair events.
sale_pairs_df = pd.DataFrame(saleEventPairs)

sale_pairs_df.columns = ['id', 'start_sale_date', 'start_sale_price', 'end_sale_date', 'end_sale_price']
sale_pairs_df = sale_pairs_df.reset_index(drop=True)

sale_pairs_df

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price
0,9932,2021-10-11T17:16:09.440387,1.049000e+19,2021-10-21T19:16:22.725846,6.550000e+18
1,9932,2021-09-23T08:30:18.304058,6.000000e+18,2021-10-11T17:16:09.440387,1.049000e+19
2,9932,2021-08-29T01:44:44.449125,5.250000e+18,2021-09-23T08:30:18.304058,6.000000e+18
3,9925,2021-08-13T17:59:33.661011,1.660000e+18,2021-08-17T03:46:32.538134,1.510000e+18
4,9925,2021-08-04T23:52:14.987776,1.150000e+18,2021-08-13T17:59:33.661011,1.660000e+18
...,...,...,...,...,...
1566,8967,2021-07-17T16:20:05.458264,1.380000e+18,2021-09-17T21:16:43.614358,8.500000e+18
1567,8967,2021-07-09T01:40:21.637272,1.250000e+18,2021-07-17T16:20:05.458264,1.380000e+18
1568,8967,2021-07-08T13:00:04.209249,7.500000e+17,2021-07-09T01:40:21.637272,1.250000e+18
1569,8967,2021-07-08T07:34:39.258665,1.250000e+18,2021-07-08T13:00:04.209249,7.500000e+17


In [28]:
# Add percent change column.
sale_pairs_df['percent_change'] = \
    (sale_pairs_df['end_sale_price'] - sale_pairs_df['start_sale_price']) / sale_pairs_df['start_sale_price']

sale_pairs_df

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change
0,9932,2021-10-11T17:16:09.440387,1.049000e+19,2021-10-21T19:16:22.725846,6.550000e+18,-0.375596
1,9932,2021-09-23T08:30:18.304058,6.000000e+18,2021-10-11T17:16:09.440387,1.049000e+19,0.748333
2,9932,2021-08-29T01:44:44.449125,5.250000e+18,2021-09-23T08:30:18.304058,6.000000e+18,0.142857
3,9925,2021-08-13T17:59:33.661011,1.660000e+18,2021-08-17T03:46:32.538134,1.510000e+18,-0.090361
4,9925,2021-08-04T23:52:14.987776,1.150000e+18,2021-08-13T17:59:33.661011,1.660000e+18,0.443478
...,...,...,...,...,...,...
1566,8967,2021-07-17T16:20:05.458264,1.380000e+18,2021-09-17T21:16:43.614358,8.500000e+18,5.159420
1567,8967,2021-07-09T01:40:21.637272,1.250000e+18,2021-07-17T16:20:05.458264,1.380000e+18,0.104000
1568,8967,2021-07-08T13:00:04.209249,7.500000e+17,2021-07-09T01:40:21.637272,1.250000e+18,0.666667
1569,8967,2021-07-08T07:34:39.258665,1.250000e+18,2021-07-08T13:00:04.209249,7.500000e+17,-0.400000


In [29]:
# Adding momentum of 5 percent change.
# Change greater than 5% percent => UP.
# Change greater than -5% percent => DOWN.
# Change between -5% and 5% => FLAT.

sale_pairs_df.loc[sale_pairs_df["percent_change"] > 0.05, "five_percent_momentum"] = "UP"
sale_pairs_df.loc[sale_pairs_df["percent_change"] < -0.05, "five_percent_momentum"] = "DOWN"
sale_pairs_df.loc[(sale_pairs_df["percent_change"] >= -0.05) & (sale_pairs_df["percent_change"] <= 0.05),\
                  "five_percent_momentum"] = "FLAT"

sale_pairs_df

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum
0,9932,2021-10-11T17:16:09.440387,1.049000e+19,2021-10-21T19:16:22.725846,6.550000e+18,-0.375596,DOWN
1,9932,2021-09-23T08:30:18.304058,6.000000e+18,2021-10-11T17:16:09.440387,1.049000e+19,0.748333,UP
2,9932,2021-08-29T01:44:44.449125,5.250000e+18,2021-09-23T08:30:18.304058,6.000000e+18,0.142857,UP
3,9925,2021-08-13T17:59:33.661011,1.660000e+18,2021-08-17T03:46:32.538134,1.510000e+18,-0.090361,DOWN
4,9925,2021-08-04T23:52:14.987776,1.150000e+18,2021-08-13T17:59:33.661011,1.660000e+18,0.443478,UP
...,...,...,...,...,...,...,...
1566,8967,2021-07-17T16:20:05.458264,1.380000e+18,2021-09-17T21:16:43.614358,8.500000e+18,5.159420,UP
1567,8967,2021-07-09T01:40:21.637272,1.250000e+18,2021-07-17T16:20:05.458264,1.380000e+18,0.104000,UP
1568,8967,2021-07-08T13:00:04.209249,7.500000e+17,2021-07-09T01:40:21.637272,1.250000e+18,0.666667,UP
1569,8967,2021-07-08T07:34:39.258665,1.250000e+18,2021-07-08T13:00:04.209249,7.500000e+17,-0.400000,DOWN


In [30]:
# Concatenate all the tweets between sale events.

concatenatedTweetsBetweenSaleEvents = []

for i in range(len(tweetsBetweenSaleEvents)):
    tweetsBetweenSale = tweetsBetweenSaleEvents[i]
    
    concatenatedTweets = " ".join(tweetsBetweenSale)
    concatenatedTweetsBetweenSaleEvents.append(concatenatedTweets)
    
len(concatenatedTweetsBetweenSaleEvents)

1571

In [31]:
print(concatenatedTweetsBetweenSaleEvents[0][:100])
print(concatenatedTweetsBetweenSaleEvents[1][:100])

RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCatsNFT  Current price 9 #ETH To Enter: Follow me Like &am
Do you know why square cats are such good pets? You can hunts round dogsAccess to Future drops, give


In [32]:
# Add concatenated tweets to dataframe.
sale_pairs_df['all_tweets'] = concatenatedTweetsBetweenSaleEvents

sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat..."
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...


In [33]:
# Get the last hour of tweets between sale events.
from dateutil import parser
from datetime import datetime, timedelta

startTime = time.time()

lastHourTweets = []

for saleEvent in saleEventPairs:
    endDate = saleEvent[3]
    
    parsedEndDate = parser.parse(endDate)
    
    oneHourAgo = str(parsedEndDate - timedelta(hours=0, minutes=60))
    
    tweetsSinceLastHour = getTweetsInTimeInterval(cool_cats_tweets_df, oneHourAgo, endDate)['cleaned_text']
    
    concatenatedTweetsSinceLastHour = " ".join(tweetsSinceLastHour)
    
    lastHourTweets.append(concatenatedTweetsSinceLastHour)

endTime = time.time()

print(endTime - startTime)

10.842994451522827


In [34]:
# Add last hour tweets to the dataframe.
sale_pairs_df['last_hour_tweets'] = lastHourTweets

sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat..."
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...


In [59]:
# Get the embeddings for the different labels.

from numpy import dot
from numpy.linalg import norm

# Cosine similarity for comparing embeddings.
def cosineSimilarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

# Returns a Distilbert embedding for a given text as a vector.
def getDistilbertEmbeddingForText(text):
    distilbertInput=distilbertTokenizer.encode_plus(\
                                                    text,\
                                                    max_length = 512,\
                                                    add_special_tokens = True,\
                                                    pad_to_max_length = True,\
                                                    truncation=True)
    idInput=np.asarray(distilbertInput['input_ids'])
    maskInput=np.asarray(distilbertInput['attention_mask'])
    distilbertOutput=distilbertModel([idInput.reshape(1,-1),maskInput.reshape(1,-1)])
    
    #print(type(distilbertOutput),distilbertOutput)
    
    return np.array(distilbertOutput[0][0]).reshape(-1)

# Get the embedding of the concatenation of all tweets for a given Momentum label.
def getDistilbertEmbeddingsForMomentumLabel(df, targetLabel, tweetsColumn):
    tweetsByLabel = df.loc[df.five_percent_momentum == targetLabel, tweetsColumn]
    
    concatenatedTweets = " ".join(tweetsByLabel)
        
    return getDistilbertEmbeddingForText(concatenatedTweets)

# Get the embeddings for all labels.
def getDistilbertEmbeddingsForMomentumLabels(df, tweetsColumn):
    distilbertEmbeddingsByLabel = dict()
    labels = ["UP", "DOWN", "FLAT"]
    
    for label in labels:
        distilbertEmbeddingsByLabel[label] = getDistilbertEmbeddingsForMomentumLabel(df, label, tweetsColumn)
        
    return distilbertEmbeddingsByLabel

In [36]:
if computeEmbeddingsByLabel:
    
    startTime = time.time()

    # Getting memory error when concatenating all histories.
    #distilbertEmbeddingsByLabel = getDistilbertEmbeddingsForMomentumLabels(sale_pairs_df, 'all_tweets')

    # Get Distilbert embeddings for tweets during the last hour of a sale.
    lastHourDistilbertEmbeddingsByLabel = getDistilbertEmbeddingsForMomentumLabels(sale_pairs_df, 'last_hour_tweets')

    #distilbertOutputs = []

    #distilbertInput=distilbertTokenizer.encode_plus(concatenatedTweetsBetweenSaleEvents,add_special_tokens = True,pad_to_max_length = True,truncation=True)
    #idInput=np.asarray(distilbertInput['input_ids'])
    #maskInput=np.asarray(distilbertInput['attention_mask'])
    #distilbertOutput=distilbertModel([idInput.reshape(1,-1),maskInput.reshape(1,-1)])

    #print(type(distilbertOutput),distilbertOutput)

    #for i in range(len(concatenatedTweetsBetweenSaleEvents)):
    #    if i % 100 == 0:
    #        print("Computing Embedding for tweets " + str(i) + " out of " + str(len(concatenatedTweetsBetweenSaleEvents)))
    
    #    concatenatedTweets = concatenatedTweetsBetweenSaleEvents[i]
    #    distilbertInput=distilbertTokenizer.encode_plus(concatenatedTweets,add_special_tokens = True,pad_to_max_length = True,truncation=True)
    #    idInput=np.asarray(distilbertInput['input_ids'])
    #    maskInput=np.asarray(distilbertInput['attention_mask'])

    #    distilbertOutput=distilbertModel([idInput.reshape(1,-1),maskInput.reshape(1,-1)])
    
    #    distilbertOutputs.append(distilbertOutput)

    #type(distilbertOutput),distilbertOutput
    
    #print(getDistilbertEmbeddingForText(concatenatedTweetsBetweenSaleEvents[0]))
    
    endTime = time.time()
    print("Finished computed embeddings for labels in " + str(endTime - startTime) + " seconds.")

In [37]:
if computeEmbeddingsByLabel:
    last_hour_embeddings_per_label_df.to_csv('../twitter/data/postprocessed/last_hour_embeddings_per_label_coolcats_07-10_2021.csv', index = True)

In [39]:
last_hour_embeddings_per_label_df = pd.read_csv('../twitter/data/postprocessed/last_hour_embeddings_per_label_coolcats_07-10_2021.csv')

KeyboardInterrupt: 

In [40]:
# Compute cosine similarity against all labels.

def getCosineSimilaritiesPerLabel(df, tweetsColumn, embeddingsPerLabel, labels):
    cosineSimilarities = []
    
    for i in range(len(df[tweetsColumn])):
        tweets = df[tweetsColumn][i]
        
        if i % 100 == 0:
            print("Computing cosine similarities for tweets " + str(i) + " out of " + str(len(df[tweetsColumn])))
        
        tweetsEmbedding = getDistilbertEmbeddingForText(tweets)
        
        cosineSimilarityPerLabel = []
        for label in labels:
            cosineSimilarityPerLabel.append(cosineSimilarity(tweetsEmbedding, embeddingsPerLabel[label]))
            
        cosineSimilarities.append(cosineSimilarityPerLabel)
        
    return cosineSimilarities
        

# Compute cosine similarities only if we haven't computed them before.
if computeCosineSimilarities:
    startTime = time.time()

    cosineSimilarities = getCosineSimilaritiesPerLabel(sale_pairs_df, \
                                                   'last_hour_tweets',\
                                                   lastHourDistilbertEmbeddingsByLabel,\
                                                  ["UP", "DOWN", "FLAT"])

    endTime = time.time()

    print("Finished computing cosine similarities after " + str(endTime - startTime) + " seconds")

In [41]:
# Create last-hour cosine similarities dataframe
if computeCosineSimilarities:
    last_hour_cosine_similarities_df = pd.DataFrame(data=cosineSimilarities, columns=["UP", "DOWN", "FLAT"])

    last_hour_cosine_similarities_df.head(10)

In [42]:
#Store last-hour cosine similarities.
if computeCosineSimilarities:
    last_hour_cosine_similarities_df.to_csv('../twitter/data/postprocessed/last_hour_cosine_similarities_coolcats_07-10_2021.csv', index = True)

In [43]:
last_hour_cosine_similarities_df = pd.read_csv('../twitter/data/postprocessed/last_hour_cosine_similarities_coolcats_07-10_2021.csv')

In [44]:
sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat..."
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...


In [45]:
# Add the cosine similarities for the last hour of tweets to the sale_pairs_df.
sale_pairs_df["up_last_hour"] = last_hour_cosine_similarities_df["UP"]
sale_pairs_df["down_last_hour"] = last_hour_cosine_similarities_df["DOWN"]
sale_pairs_df["flat_last_hour"] = last_hour_cosine_similarities_df["FLAT"]
sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets,up_last_hour,down_last_hour,flat_last_hour
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...,0.387524,1.0,0.384733
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...,1.0,0.387524,0.396572
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat...",0.387349,0.38688,0.383842
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,0.397461,0.399703,0.399968
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...,0.380741,0.390007,0.377293
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...,0.396572,0.384733,1.0
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,0.402898,0.3824,0.390657
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,0.385264,0.394358,0.387613
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...,0.373806,0.36559,0.381575
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...,0.385294,0.403803,0.39045


In [46]:
# Add the label with maximum cosine similarity as a feature.
sale_pairs_df.loc[(sale_pairs_df['up_last_hour'] >= sale_pairs_df['down_last_hour']) \
             & (sale_pairs_df['up_last_hour'] >= sale_pairs_df['flat_last_hour']),\
                  'last_hour_closest_cosine_similarity_label'] = "UP"
sale_pairs_df.loc[(sale_pairs_df['down_last_hour'] >= sale_pairs_df['up_last_hour']) \
             & (sale_pairs_df['down_last_hour'] >= sale_pairs_df['flat_last_hour']),\
                  'last_hour_closest_cosine_similarity_label'] = "DOWN"
sale_pairs_df.loc[(sale_pairs_df['flat_last_hour'] >= sale_pairs_df['up_last_hour']) \
             & (sale_pairs_df['flat_last_hour'] >= sale_pairs_df['down_last_hour']),\
                  'last_hour_closest_cosine_similarity_label'] = "FLAT"

sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets,up_last_hour,down_last_hour,flat_last_hour,last_hour_closest_cosine_similarity_label
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...,0.387524,1.0,0.384733,DOWN
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...,1.0,0.387524,0.396572,UP
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat...",0.387349,0.38688,0.383842,UP
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,0.397461,0.399703,0.399968,FLAT
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...,0.380741,0.390007,0.377293,DOWN
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...,0.396572,0.384733,1.0,FLAT
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,0.402898,0.3824,0.390657,UP
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,0.385264,0.394358,0.387613,DOWN
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...,0.373806,0.36559,0.381575,FLAT
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...,0.385294,0.403803,0.39045,DOWN


In [47]:
# Convert 'last_hour_closest_cosine_similarity_label' to -1,0,1.
sale_pairs_df.loc[sale_pairs_df['last_hour_closest_cosine_similarity_label'] == "UP", \
'numeric_last_hour_closest_cosine_similarity_label'] = 1
sale_pairs_df.loc[sale_pairs_df['last_hour_closest_cosine_similarity_label'] == "DOWN", \
'numeric_last_hour_closest_cosine_similarity_label'] = -1
sale_pairs_df.loc[sale_pairs_df['last_hour_closest_cosine_similarity_label'] == "FLAT", \
'numeric_last_hour_closest_cosine_similarity_label'] = 0

In [48]:
# Convert 'five_percent_momentum' to -1,0,1.
sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "UP", \
'numeric_five_percent_momentum'] = 1
sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "DOWN", \
'numeric_five_percent_momentum'] = -1
sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "FLAT", \
'numeric_five_percent_momentum'] = 0

In [49]:
sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets,up_last_hour,down_last_hour,flat_last_hour,last_hour_closest_cosine_similarity_label,numeric_last_hour_closest_cosine_similarity_label,numeric_five_percent_momentum
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...,0.387524,1.0,0.384733,DOWN,-1.0,-1.0
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...,1.0,0.387524,0.396572,UP,1.0,1.0
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat...",0.387349,0.38688,0.383842,UP,1.0,1.0
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,0.397461,0.399703,0.399968,FLAT,0.0,-1.0
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...,0.380741,0.390007,0.377293,DOWN,-1.0,1.0
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...,0.396572,0.384733,1.0,FLAT,0.0,0.0
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,0.402898,0.3824,0.390657,UP,1.0,1.0
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,0.385264,0.394358,0.387613,DOWN,-1.0,1.0
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...,0.373806,0.36559,0.381575,FLAT,0.0,1.0
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...,0.385294,0.403803,0.39045,DOWN,-1.0,1.0


In [82]:
# Add LDA topic modeling for the last hour of tweets.
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

startTime = time.time()

tfidVectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
vectorizedText = tfidVectorizer.fit_transform(sale_pairs_df['last_hour_tweets'])

ldaModel = LatentDirichletAllocation(n_components=10,learning_method='online',random_state=42)

ldaTopics = ldaModel.fit_transform(vectorizedText)

endTime = time.time()

print("Finished computing LDA topic modeling after " + str(endTime - startTime) + " seconds.")

Finished computing LDA topic modeling after 28.737321376800537 seconds.


In [86]:
ldaTopics[0]

array([0.00746157, 0.53683045, 0.00746157, 0.00746157, 0.00746974,
       0.00746171, 0.00746692, 0.00746157, 0.40346326, 0.00746163])

In [85]:
# Create LDA topics DF for the last hour.
last_hour_lda_topics_df = pd.DataFrame(data = ldaTopics, columns = ["last_hour_topic_" + str(i) for i in range(10)])

last_hour_lda_topics_df.head(10)

Unnamed: 0,last_hour_topic_0,last_hour_topic_1,last_hour_topic_2,last_hour_topic_3,last_hour_topic_4,last_hour_topic_5,last_hour_topic_6,last_hour_topic_7,last_hour_topic_8,last_hour_topic_9
0,0.007462,0.53683,0.007462,0.007462,0.00747,0.007462,0.007467,0.007462,0.403463,0.007462
1,0.005589,0.671481,0.005589,0.005589,0.005589,0.005589,0.00559,0.005589,0.005589,0.283805
2,0.006734,0.462998,0.006734,0.006734,0.006734,0.00674,0.006735,0.006734,0.006734,0.483122
3,0.009024,0.441034,0.009024,0.009024,0.486775,0.009024,0.009024,0.009024,0.009024,0.009024
4,0.008887,0.92002,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887
5,0.007751,0.930238,0.007751,0.007751,0.007751,0.007751,0.007753,0.007751,0.007751,0.007751
6,0.008117,0.926947,0.008117,0.008117,0.008117,0.008117,0.008119,0.008117,0.008117,0.008117
7,0.008513,0.477369,0.008513,0.008513,0.454525,0.008515,0.008513,0.008513,0.008513,0.008513
8,0.007746,0.930287,0.007746,0.007746,0.007746,0.007746,0.007747,0.007746,0.007746,0.007746
9,0.010773,0.903041,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773


In [87]:
# Append LDA topic DF to main DF.
for i in range(10):
    topicLabel = "last_hour_topic_" + str(i)
    sale_pairs_df[topicLabel] = last_hour_lda_topics_df[topicLabel]

sale_pairs_df.head(10)

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets,up_last_hour,...,last_hour_topic_0,last_hour_topic_1,last_hour_topic_2,last_hour_topic_3,last_hour_topic_4,last_hour_topic_5,last_hour_topic_6,last_hour_topic_7,last_hour_topic_8,last_hour_topic_9
0,9932,2021-10-11T17:16:09.440387,1.049e+19,2021-10-21T19:16:22.725846,6.55e+18,-0.375596,DOWN,RT @Cryptomesssiah: GIVEAWAY OF RARE #CoolCats...,RT @gimmocrypto: This gif of @mutantcats purch...,0.387524,...,0.007462,0.53683,0.007462,0.007462,0.00747,0.007462,0.007467,0.007462,0.403463,0.007462
1,9932,2021-09-23T08:30:18.304058,6e+18,2021-10-11T17:16:09.440387,1.049e+19,0.748333,UP,Do you know why square cats are such good pets...,Do you know why square cats are such good pets...,1.0,...,0.005589,0.671481,0.005589,0.005589,0.005589,0.005589,0.00559,0.005589,0.005589,0.283805
2,9932,2021-08-29T01:44:44.449125,5.25e+18,2021-09-23T08:30:18.304058,6e+18,0.142857,UP,"#24px - anon dev. no roadmap. just pixels, cat...","#24px - anon dev. no roadmap. just pixels, cat...",0.387349,...,0.006734,0.462998,0.006734,0.006734,0.006734,0.00674,0.006735,0.006734,0.006734,0.483122
3,9925,2021-08-13T17:59:33.661011,1.66e+18,2021-08-17T03:46:32.538134,1.51e+18,-0.090361,DOWN,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,RT @XCOPYPUNKS: XCOPYCATS GiveawayWe're giving...,0.397461,...,0.009024,0.441034,0.009024,0.009024,0.486775,0.009024,0.009024,0.009024,0.009024,0.009024
4,9925,2021-08-04T23:52:14.987776,1.15e+18,2021-08-13T17:59:33.661011,1.66e+18,0.443478,UP,RT @BullieverIsland: Celebrating Citizens of B...,RT @BullieverIsland: Celebrating Citizens of B...,0.380741,...,0.008887,0.92002,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887,0.008887
5,9921,2021-08-22T11:05:32.621573,1.64e+18,2021-08-22T14:51:26.933741,1.649e+18,0.005488,FLAT,RT @DamianSpriggs: I have always wanted to be ...,RT @DamianSpriggs: I have always wanted to be ...,0.396572,...,0.007751,0.930238,0.007751,0.007751,0.007751,0.007751,0.007753,0.007751,0.007751,0.007751
6,9921,2021-08-17T04:39:51.161269,1.32e+18,2021-08-22T11:05:32.621573,1.64e+18,0.242424,UP,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,RT @DrugstoreNfts: DROP NEWSTODAY 10 PM GMT /...,0.402898,...,0.008117,0.926947,0.008117,0.008117,0.008117,0.008117,0.008119,0.008117,0.008117,0.008117
7,9921,2021-07-30T16:46:54.052325,6.5e+17,2021-08-17T04:39:51.161269,1.32e+18,1.030769,UP,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,@zachward69 #Coolcats BEEP BOOP Cool Cat #5...,0.385264,...,0.008513,0.477369,0.008513,0.008513,0.454525,0.008515,0.008513,0.008513,0.008513,0.008513
8,9920,2021-08-24T03:29:15.074609,3.3568e+18,2021-08-24T22:21:08.871690,4.9e+18,0.459724,UP,Omg looooool 2.2M followers 50E saleI can't g...,Omg looooool 2.2M followers 50E saleI can't g...,0.373806,...,0.007746,0.930287,0.007746,0.007746,0.007746,0.007746,0.007747,0.007746,0.007746,0.007746
9,9920,2021-08-23T17:24:06.385908,2.11e+18,2021-08-24T03:29:15.074609,3.3568e+18,0.5909,UP,RT @SpaceLabCrypto: If you are worried that yo...,RT @SpaceLabCrypto: If you are worried that yo...,0.385294,...,0.010773,0.903041,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773,0.010773


In [88]:
# Feature selection.
features = ['up_last_hour', 'down_last_hour', 'flat_last_hour', 'numeric_last_hour_closest_cosine_similarity_label'] \
           + ["last_hour_topic_" + str(i) for i in range(10)]
label = 'numeric_five_percent_momentum'

In [89]:
# Split into 90% training and 10% test.
from sklearn.model_selection import train_test_split

X = sale_pairs_df[features]
y = sale_pairs_df[[label]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [90]:
# Predicting according to the majority class.
print((1 == y_train["numeric_five_percent_momentum"]).sum() / y_train.count())

# Using cosine similarity for prediction.
print((X_train["numeric_last_hour_closest_cosine_similarity_label"] == y_train["numeric_five_percent_momentum"]).sum() / y_train.count())

print((X_train["numeric_last_hour_closest_cosine_similarity_label"] == 1).sum())

numeric_five_percent_momentum    0.893843
dtype: float64
numeric_five_percent_momentum    0.307148
dtype: float64
445


In [91]:
# Train a decision tree classifier.

from sklearn.tree import DecisionTreeClassifier
decisionTreeModel = DecisionTreeClassifier()   
decisionTreeModel.fit(X_train,y_train)

DecisionTreeClassifier()

In [92]:
# Compute predictions over the test set.

y_predict = decisionTreeModel.predict(X_test)

In [93]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
accuracy_score(y_test,y_predict)

0.8354430379746836

In [94]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, decisionTreeModel.predict_proba(X_test),multi_class='ovr')

0.5200662989430209

In [95]:
# Extract balanced samples.
cloned_sale_pairs_df = sale_pairs_df.copy()

# Extract 50 samples per class.
up_sample_df = cloned_sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "UP",].sample(n=50, random_state=42)
down_sample_df = cloned_sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "DOWN",].sample(n=50, random_state=42)
flat_sample_df = cloned_sale_pairs_df.loc[sale_pairs_df['five_percent_momentum'] == "FLAT",].sample(n=50, random_state=42)

samples_df = pd.concat([up_sample_df, down_sample_df, flat_sample_df])

# Drop samples from source DF.
cloned_sale_pairs_df = cloned_sale_pairs_df.drop(samples_df.index)

samples_df

Unnamed: 0,id,start_sale_date,start_sale_price,end_sale_date,end_sale_price,percent_change,five_percent_momentum,all_tweets,last_hour_tweets,up_last_hour,...,last_hour_topic_0,last_hour_topic_1,last_hour_topic_2,last_hour_topic_3,last_hour_topic_4,last_hour_topic_5,last_hour_topic_6,last_hour_topic_7,last_hour_topic_8,last_hour_topic_9
1198,9122,2021-07-11T07:36:03.317093,1.979000e+18,2021-08-02T15:46:46.489656,2.220000e+18,0.121779,UP,RT @markshaw: Remember: They'll come for the a...,RT @markshaw: Remember: They'll come for the a...,0.354057,...,0.005977,0.946207,0.005977,0.005977,0.005977,0.005977,0.005977,0.005977,0.005977,0.005977
1132,9170,2021-07-07T15:40:36.198903,4.000000e+17,2021-07-07T18:06:27.996699,4.880000e+17,0.220000,UP,RT @sonirious: CoolCats are going to flip apes...,RT @sonirious: CoolCats are going to flip apes...,0.389061,...,0.011104,0.900062,0.011104,0.011104,0.011104,0.011104,0.011106,0.011104,0.011104,0.011104
731,9440,2021-07-05T08:27:31.398556,8.414770e+16,2021-08-22T05:31:56.492745,1.600000e+18,18.014187,UP,RT @markshaw: What can I do for you that will ...,RT @xtremetom: We see you @dfinzer (co-founder...,0.381760,...,0.008637,0.922269,0.008637,0.008637,0.008637,0.008637,0.008637,0.008637,0.008637,0.008637
506,9575,2021-07-07T04:46:43.475180,2.500000e+17,2021-07-08T02:58:47.414361,4.850000e+17,0.940000,UP,Whats up @coolcatsnft fam?!? What a day!! Foll...,Whats up @coolcatsnft fam?!? What a day!! Foll...,0.385264,...,0.012606,0.886547,0.012606,0.012606,0.012606,0.012606,0.012606,0.012606,0.012606,0.012606
1445,8994,2021-07-04T10:33:57.321596,6.000000e+16,2021-07-04T18:10:32.626065,1.400000e+17,1.333333,UP,RT @Zvizvizvi: @PsyChrypto @alexgausman Only 1...,RT @Zvizvizvi: @PsyChrypto @alexgausman Only 1...,0.396388,...,0.010349,0.439798,0.010349,0.010349,0.010349,0.010349,0.477411,0.010349,0.010349,0.010349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,9918,2021-08-04T19:04:01.906638,1.550000e+18,2021-08-19T19:29:02.968073,1.490000e+18,-0.038710,FLAT,RT @BluuCatt: MIKE TYSON SAYING COOLCATS LIVE ...,RT @BluuCatt: MIKE TYSON SAYING COOLCATS LIVE ...,0.378709,...,0.005815,0.947660,0.005815,0.005815,0.005815,0.005816,0.005816,0.005815,0.005815,0.005815
486,9602,2021-07-02T00:52:51.979627,1.980000e+16,2021-07-02T01:44:31.310903,1.950000e+16,-0.015152,FLAT,RT @Stanley_Studios: i love them both lol@Gut...,RT @Stanley_Studios: i love them both lol@Gut...,0.397436,...,0.011369,0.565118,0.011369,0.011369,0.011369,0.343926,0.011370,0.011369,0.011369,0.011369
29,9907,2021-07-09T21:47:36.613699,9.999000e+17,2021-08-04T05:55:33.969680,9.900000e+17,-0.009901,FLAT,RT @markshaw: What can I do for you that will ...,RT @SunkenSquidNFT: SUNKEN SQUIDS x COOL CAT N...,0.393659,...,0.008540,0.923142,0.008540,0.008540,0.008540,0.008540,0.008540,0.008540,0.008540,0.008540
1484,8976,2021-09-04T11:39:54.807283,4.480000e+18,2021-09-11T12:24:58.355250,4.590000e+18,0.024554,FLAT,RT @MonasNFT: SELLOUT INCOMING 245 ETH 1.80...,RT @MonasNFT: SELLOUT INCOMING 245 ETH 1.80...,0.371589,...,0.019441,0.825026,0.019441,0.019441,0.019443,0.019441,0.019441,0.019441,0.019441,0.019441


In [96]:
# Create training and test for balanced dataset.
X_train = cloned_sale_pairs_df[features]
y_train = cloned_sale_pairs_df[[label]]
X_test = samples_df[features]
y_test = samples_df[[label]]

In [97]:
# Train Decision Tree Classifier.
decisionTreeModel = DecisionTreeClassifier()   
decisionTreeModel.fit(X_train,y_train)

DecisionTreeClassifier()

In [98]:
# Call predictiion
y_predict = decisionTreeModel.predict(X_test)

In [99]:
# Accuracy of the model.
accuracy_score(y_test,y_predict)

0.38666666666666666

In [100]:
# Accuracy of the baseline.
(y_test == 1).sum() / y_test.count()

numeric_five_percent_momentum    0.333333
dtype: float64

In [101]:
print((sale_pairs_df["five_percent_momentum"] =="UP").sum())
print((sale_pairs_df["five_percent_momentum"] =="DOWN").sum())
print((sale_pairs_df["five_percent_momentum"] =="FLAT").sum())

1406
101
64


In [102]:
(sale_pairs_df["five_percent_momentum"] == "UP").sum() / sale_pairs_df.count()

id                                                   0.894971
start_sale_date                                      0.894971
start_sale_price                                     0.894971
end_sale_date                                        0.894971
end_sale_price                                       0.894971
percent_change                                       0.894971
five_percent_momentum                                0.894971
all_tweets                                           0.894971
last_hour_tweets                                     0.894971
up_last_hour                                         0.894971
down_last_hour                                       0.894971
flat_last_hour                                       0.894971
last_hour_closest_cosine_similarity_label            0.894971
numeric_last_hour_closest_cosine_similarity_label    0.894971
numeric_five_percent_momentum                        0.894971
last_hour_topic_0                                    0.894971
last_hou

In [103]:
len(concatenatedTweetsBetweenSaleEvents[3])

89372