In [1]:
import requests
import warnings
import string
import joblib
import multiprocessing
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import *
from nltk.corpus import stopwords
from transformers import BertTokenizer
from transformers import BertModel
from torch.nn import functional as F


warnings.filterwarnings("ignore")

In [2]:
def load_tweets(tweets_file="../data/preprocessed_tweet_20201619.csv",
                from_date="2017-01-01",
                to_date="2020-06-01",
                count=10):
    """Load preprocessed tweets from a CSV file and filter by date and length.

    Steps, in order:
      1. read only the columns listed in ``cols`` from ``tweets_file``;
      2. sort by ``date`` and keep rows whose date label lies in the
         label slice ``from_date:to_date`` (pandas label slices include
         both endpoints);
      3. drop fully duplicated rows;
      4. keep rows whose ``clean_tweet`` contains more than ``count``
         runs of whitespace.

    Two progress lines are printed: the total row count read from the file
    and the row count after filtering.

    Args:
        tweets_file: path of the CSV file to read.
        from_date: lower bound of the date slice.
        to_date: upper bound of the date slice.
        count: a tweet is kept only if it has strictly more than this many
            whitespace runs.

    Returns:
        pandas.DataFrame with ``date`` as the first column. The index is the
        position of each row after the date slice, so it has gaps wherever
        duplicates or short tweets were removed.
    """
    cols = ["date", "time", "username", "tweet", "clean_tweet", "hashtags",
            "likes_count", "replies_count", "retweets_count", "slang_count"]
    df = pd.read_csv(tweets_file, usecols=cols)
    print("# of total tweets: {}".format(df.shape[0]))

    # No date parsing is requested from read_csv, so the slice below compares
    # the labels as they were read (presumably ISO "YYYY-MM-DD" strings, which
    # order correctly as text -- verify against the data file).
    df = df.sort_values(by="date", ascending=True).set_index("date")
    df = df.loc[from_date:to_date].reset_index(drop=False).drop_duplicates()

    # Raw string: "\s" is not a valid Python string escape, so the non-raw
    # form only worked by accident and triggers an invalid-escape warning.
    whitespace_runs = df["clean_tweet"].str.count(r"\s+")
    df = df[whitespace_runs.gt(count)]
    print("There are {} tweets we get.".format(df.shape[0]))
    return df

df = load_tweets(from_date="2017-01-01", to_date="2020-06-17")
# Take an explicit copy: a later cell adds a column to df_simplied, and
# writing into a column subset of df is the pattern pandas flags with
# SettingWithCopyWarning.
df_simplied = df[["date", "clean_tweet"]].copy()
df_simplied.head()

# of total tweets: 1297358
There are 282228 tweets we get.


Unnamed: 0,date,clean_tweet
0,2017-01-01,"Happy New Year, everyone! Like atUser just tol..."
5,2017-01-01,trading forex binaryoptions China steps up scr...
6,2017-01-01,Forex Forum - url replies to: Best Forex Sig...
7,2017-01-01,Forex Forum - goosebone replies to: Let is Tr...
8,2017-01-01,Learning MQL4: must-do or just for fun? via /r...


In [5]:
class BertVectorizer():
    """
    BERT Vectorizer

    Wraps a pretrained BERT tokenizer and model and turns one piece of text
    into a NumPy vector. The model is moved to ``cuda:0`` when CUDA is
    available and stays on the CPU otherwise.
    """
    def __init__(self, model_name='bert-base-uncased', max_length=32):
        """
        Load the tokenizer and the model.

        Args:
            model_name: checkpoint name passed to both ``from_pretrained``
                calls. The default matches the previously hard-coded value.
            max_length: token length passed to the tokenizer for every text.
                The default matches the previously hard-coded value.
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model = self.model.to(self.device)

    def vectorize(self, text):
        """
        Encode ``text`` and return the model's second output as a NumPy array.

        Args:
            text: the text to vectorize.

        Returns:
            numpy.ndarray copied back to the CPU. For ``BertModel`` the second
            output is presumably the pooled output, which would be shape
            ``(1, hidden_size)`` for one text -- TODO confirm.
        """
        # NOTE(review): newer transformers releases deprecate
        # `pad_to_max_length` in favour of `padding="max_length"` together
        # with `truncation=True`. It is kept here so the call still works with
        # the release this notebook was written against -- confirm the
        # installed version before switching.
        encoded = self.tokenizer.encode_plus(
            text=text,
            max_length=self.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors="pt")
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = self.model(
                encoded["input_ids"].to(self.device),
                encoded["attention_mask"].to(self.device))
        # Index 1 selects the second element of the model output (not logits).
        return outputs[1].to("cpu").numpy()

In [6]:
vectorizer = BertVectorizer()
# assign() returns a new frame with the extra column instead of writing into
# df_simplied in place, so this cell gives the same result when re-run and
# does not trigger pandas' chained-assignment warning. The bound method is
# passed directly; wrapping it in a lambda added nothing.
df_simplied = df_simplied.assign(
    tweet_vector=df_simplied['clean_tweet'].apply(vectorizer.vectorize))

In [8]:
# Persist the vectorized frame, then read it straight back from disk.
# NOTE(review): joblib.load unpickles the file; only load files you produced.
vectors_path = "../data/tweets_vectors_df.gzip"
joblib.dump(df_simplied, vectors_path, compress=3)
tweets_vectors_df = joblib.load(vectors_path)

['../data/tweets_vectors_df.gzip']

In [12]:
# Attach the stored vectors to the full tweet frame; rows are matched on
# index labels.
df_complete = pd.concat(
    [df, tweets_vectors_df["tweet_vector"]],
    axis="columns",
)