In [48]:
import pandas as pd
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm
import glob
import numpy as np

# Declare currency and use the file with the clean tweets
currency = "bitcoin"
symbol = "BTC"
tweets_clean_file = f'../data/twitter/{symbol}/{currency}_tweets_clean.csv'
path = f'../data/twitter/{symbol}'

In [49]:
df_clean = pd.read_csv(tweets_clean_file)
print(df_clean.shape)
df_clean.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


(1974464, 7)


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt
0,1480974073383448576,RT : ⚜💎 Premium Diamond 💎⚜🏈 NCAAF 💎✅☑✅☑✅☑✅☑✅💎💎...,JC’S Picks,843,22,0,Tue Jan 11 18:45:08 +0000 2022
1,1480974072938905603,"RT : Honorable Judges, We are requesging to le...",Aman. Panhwar,29,11,0,Tue Jan 11 18:45:08 +0000 2022
2,1480974071894401027,New Airdrop: AMMpad🔘 Details: /Wl71sALlsIAirdr...,gaktau,19,0,0,Tue Jan 11 18:45:08 +0000 2022
3,1480974070447550471,RT : BREAKING – Strike officially launches Bit...,BitCapt,139,322,0,Tue Jan 11 18:45:08 +0000 2022
4,1480974070132924426,RT : Top 10 cryptocurrencies in the world by T...,jeffrockz _ zaid,59,4,0,Tue Jan 11 18:45:08 +0000 2022


In [50]:
# If the ID is bigger, the tweet is more recent, so sort them by ID
df_clean = df_clean[df_clean['ID'].apply(lambda x: isinstance(x, (int, np.int64)))]
df_clean = df_clean.sort_values(by='ID')

In [51]:
# Declare the Vader Analyzer Object
analyzer = SentimentIntensityAnalyzer()
compound = []
for i, s in enumerate(tqdm(df_clean['Text'])):
    vader_score = analyzer.polarity_scores(s)
    compound.append(vader_score["compound"])
df_clean["compound"] = compound
df_clean.head(2)

100%|██████████| 1843392/1843392 [04:25<00:00, 6944.86it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound
1974463,1477990572501737472,RT : Have you ever seen a project like $SOUL g...,Tah,92,41,0,Mon Jan 03 13:09:46 +0000 2022,0.3612
1974462,1477990572501737472,RT : Have you ever seen a project like $SOUL g...,Tah,92,41,0,Mon Jan 03 13:09:46 +0000 2022,0.3612


In [52]:
scores = []
# Obtain score by multiplying the score with the likes and the number of followers to get more insight
for i, s in tqdm(df_clean.iterrows(), total = df_clean.shape[0]):
    scores.append(float(s["compound"]) * (float(s["UserFollowerCount"])+1) * (float(s["Likes"])+1))
df_clean["score"] = scores
df_clean.head(2)

100%|██████████| 1843392/1843392 [02:25<00:00, 12684.95it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound,score
1974463,1477990572501737472,RT : Have you ever seen a project like $SOUL g...,Tah,92,41,0,Mon Jan 03 13:09:46 +0000 2022,0.3612,33.5916
1974462,1477990572501737472,RT : Have you ever seen a project like $SOUL g...,Tah,92,41,0,Mon Jan 03 13:09:46 +0000 2022,0.3612,33.5916


In [53]:
# Divide the tweets in 20K pieces to divide in multiple files
n = 20000
chunks_df = [df_clean[i:i+n] for i in range(0,df_clean.shape[0],n)]

# Create each file by using the dates in the tweets
sep_char = '~'
for chunk_df in chunks_df:
    chunk_min = chunk_df['ID'].min()
    chunk_max = chunk_df['ID'].max()
    date_from = (datetime.strptime(chunk_df.iloc[0]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    date_to = (datetime.strptime(chunk_df.iloc[-1]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    print(date_from, date_to)

    # Create CSV with tweets
    chunk_df.to_csv(f"{path}/{date_from}{sep_char}{date_to}.csv", header=True, index=False)

2022-01-03 13-09-46 2022-01-03 14-37-05
2022-01-03 14-37-05 2022-01-03 15-52-28
2022-01-03 15-52-28 2022-01-03 17-08-21
2022-01-03 17-08-22 2022-01-03 18-52-00
2022-01-03 18-52-01 2022-01-03 20-31-37
2022-01-03 20-31-37 2022-01-03 22-30-36
2022-01-03 22-30-36 2022-01-04 00-54-44
2022-01-04 00-54-45 2022-01-04 03-41-15
2022-01-04 03-41-16 2022-01-04 06-20-29
2022-01-04 06-20-29 2022-01-04 08-56-36
2022-01-04 08-56-37 2022-01-04 11-14-22
2022-01-04 11-14-23 2022-01-04 13-20-57
2022-01-04 13-20-58 2022-01-04 14-54-29
2022-01-04 14-54-29 2022-01-04 16-20-01
2022-01-04 16-20-01 2022-01-04 17-59-42
2022-01-04 17-59-43 2022-01-04 20-03-06
2022-01-04 20-03-07 2022-01-04 22-19-31
2022-01-04 22-19-31 2022-01-05 01-17-09
2022-01-05 01-17-10 2022-01-05 03-48-04
2022-01-05 03-48-04 2022-01-05 06-20-58
2022-01-05 06-20-59 2022-01-05 09-04-23
2022-01-05 09-04-23 2022-01-05 11-35-32
2022-01-05 11-35-32 2022-01-05 13-42-50
2022-01-05 13-42-50 2022-01-05 15-33-35
2022-01-05 15-33-35 2022-01-05 17-21-57


In [61]:
cols = ['CRYPTO', 'LINE_COUNT', 'MOST_RECENT_FILE', 'MOST_RECENT_ID']

def get_var(key, crypto):
    df_var = pd.read_csv("../data/twitter/var_twitter.csv", sep = ',', dtype = {'LINE_COUNT': np.int32})
    return df_var[key].loc[df_var['CRYPTO'] == crypto].values[0]

def update_var(key, value, crypto):
    df_var = pd.read_csv("../data/twitter/var_twitter.csv", sep = ',', dtype = {'LINE_COUNT': np.int32})
    df_var[key].loc[df_var['CRYPTO'] == crypto] = str(value)
    df_var.to_csv("var.csv", index=False)
    
def add_new_crypto(crypto):
    df_var = pd.read_csv("../data/twitter/var_twitter.csv", sep = ',', dtype = {'LINE_COUNT': np.int32})
    if df_var[cols[0]].loc[df_var['CRYPTO'] == crypto].empty:
        new_line = pd.DataFrame([[crypto,-1,"",0]], columns=cols)
        df_var = df_var.append(new_line)
        df_var.to_csv("data/twitter/var.csv", index=False)

In [65]:
files = glob.glob(f"{path}/*~*.csv")
files = sorted(files)
last_file = files[-1]
print(last_file)
last_df = pd.read_csv(last_file)
last_elem = last_df.tail(1)
print(last_elem['ID'])
print(last_df.shape)

add_new_crypto(symbol)
update_var(cols[1], last_df.shape[0], symbol)
update_var(cols[2], last_file, symbol)
update_var(cols[3], last_elem, symbol)

../data/twitter/BTC/2022-01-11 18-41-38~2022-01-11 18-45-08.csv
733    1480974073383448576
Name: ID, dtype: int64
(734, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [66]:
var_df = pd.read_csv('../data/twitter/var_twitter.csv')
var_df

Unnamed: 0,CRYPTO,LINE_COUNT,MOST_RECENT_FILE,MOST_RECENT_ID
0,BTC,556,data/twitter/BTC/2018-05-29 12-20-53~2018-05-2...,1001439557504692224
