In [99]:
# Model card: https://huggingface.co/zhayunduo/roberta-base-stocktwits-finetuned

In [100]:
import pandas as pd
from datetime import datetime, timedelta
import os
import numpy as np

from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import pipeline

# pip install emoji
import emoji

import re

In [101]:
def process_folder(stock_ticker):
    '''
    Reads the data from a folder associated with a single stock.

    Every file in stocks/<stock_ticker> is parsed with pd.read_json and
    the results are stacked into a single dataframe with a fresh 0..n-1
    index. The columns 'id', 'text', 'time' and 'sentiment' always come
    first (and are created if no file provides them); any additional
    columns found in the files follow. The 'time' column is converted
    with datetime.fromtimestamp.

    Raises whatever os.listdir / pd.read_json raise when the folder or a
    file cannot be read.
    '''
    base_columns = ['id', 'text', 'time', 'sentiment']
    directory = os.path.join('stocks', stock_ticker)
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the result reproducible between runs.
    frames = [
        pd.read_json(os.path.join(directory, file))
        for file in sorted(os.listdir(directory))
    ]
    if frames:
        # Concatenate once instead of re-copying the accumulated frame
        # for every file.
        df = pd.concat(frames, axis=0).reset_index(drop=True)
        extra_columns = [col for col in df.columns if col not in base_columns]
        df = df.reindex(columns=base_columns + extra_columns)
    else:
        df = pd.DataFrame(columns=base_columns)
    df['time'] = df['time'].apply(datetime.fromtimestamp)
    return df

In [102]:
def compute_value_counts(df, start_date, time_delta):
    '''
    Computes the number of posts with bearish and bullish labels within
    the time interval from start_date to (start_date + time_delta).

    The interval is half-open: a post stamped exactly at start_date is
    counted, a post stamped exactly at the end of the interval is not.
    Consecutive windows therefore cover every post exactly once.

    df:          dataframe with a 'time' column and a 'sentiment' column.
    start_date:  datetime marking the start of the window.
    time_delta:  length of the window in days.

    Returns the pair (bullish_count, bearish_count), where the counts are
    the number of rows in the window whose sentiment is exactly 'Bullish'
    and 'Bearish', respectively.
    '''
    end_date = start_date + timedelta(days=time_delta)
    in_window = (df.time >= start_date) & (df.time < end_date)
    # Count once and read both labels from the same result.
    counts = df.loc[in_window, 'sentiment'].value_counts()
    pos = counts.get('Bullish', 0)
    neg = counts.get('Bearish', 0)
    return pos, neg

In [103]:
def add_sentiment_by_day(df, df_by_day, end):
    '''
    Produces a dataframe with columns date, bullish, and bearish,
    containing the day, the number of bullish posts on that day,
    and the number of bearish posts on that day, respectively.
    It terminates at the given end date.

    Days are generated from 2022-01-01 (inclusive) in one-day steps
    while the day is strictly before end. The new rows are placed after
    any rows already present in df_by_day, and the combined frame is
    returned; df_by_day itself is not modified.

    df:         dataframe of posts passed on to compute_value_counts.
    df_by_day:  dataframe to extend (typically empty, with the columns
                date, bullish, bearish).
    end:        datetime at which to stop (exclusive).
    '''
    start_date = datetime(year=2022, day=1, month=1)
    time_delta_days = 1
    # Collect plain records and build the frame once; DataFrame.append
    # no longer exists in pandas 2.x and was quadratic when used per row.
    records = []
    while start_date < end:
        pos, neg = compute_value_counts(df, start_date, time_delta_days)
        records.append({
            'date': start_date,
            'bullish': pos,
            'bearish': neg})
        start_date += timedelta(days=time_delta_days)
    if not records:
        return df_by_day
    new_rows = pd.DataFrame(records, columns=['date', 'bullish', 'bearish'])
    if len(df_by_day) == 0:
        # Nothing to concatenate with; keep the caller's column layout.
        columns = list(df_by_day.columns) + [
            col for col in new_rows.columns if col not in df_by_day.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df_by_day, new_rows], ignore_index=True)

In [104]:
def gen_sentiment_csvs(stocks, end):
    '''
    Writes one csv per stock in the given list, named <stock>.csv.

    For each stock the posts are loaded with process_folder, the daily
    bullish / bearish counts up to the end date are computed with
    add_sentiment_by_day starting from an empty frame, and the resulting
    table is saved without its index.
    '''
    day_columns = ['date', 'bullish', 'bearish']
    for ticker in stocks:
        posts = process_folder(ticker)
        daily_counts = add_sentiment_by_day(
            posts,
            pd.DataFrame(columns=day_columns),
            end,
        )
        daily_counts.to_csv(ticker + '.csv', index=False)

In [105]:
# The model was trained on text preprocessed as below.
def process_text(texts):
    '''
    Normalizes the text of a single post before sentiment analysis.

    Per the note that accompanies this function, this is the
    preprocessing the pre-trained transformer model was trained with, so
    the patterns below are intentionally kept exactly as they are.

    Steps, in order: strip URLs, decode the HTML apostrophe entity,
    rewrite #tags, $tickers and @mentions into hashtag_/cashtag_/mention_
    tokens, replace emoji with their names, and trim surrounding
    whitespace.
    '''
    # drop links: http(s) URLs first, then anything starting with www
    for link_pattern in (r'https?://\S+', r'www.\S+'):
        texts = re.sub(link_pattern, "", texts)

    # decode the HTML entity for an apostrophe; this has to happen before
    # the hashtag rewrite, which would otherwise match the '#39;' part
    texts = texts.replace('&#39;', "'")

    # replace the leading marker of hashtags, cashtags and mentions with a
    # word prefix (the tag text itself is kept)
    marker_rewrites = (
        (r'(\#)(\S+)', r'hashtag_\2'),
        (r'(\$)([A-Za-z]+)', r'cashtag_\2'),
        (r'(\@)(\S+)', r'mention_\2'),
    )
    for marker_pattern, replacement in marker_rewrites:
        texts = re.sub(marker_pattern, replacement, texts)

    # spell out emoji as text, each followed by a space
    demojized = emoji.demojize(texts, delimiters=("", " "))

    return demojized.strip()

In [106]:
def cast_to_datetime(date_string):
    '''
    Parses a date string of the form "YYYY-MM-DD HH:MM:SS" into a
    datetime.datetime object.
    '''
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.strptime(date_string, timestamp_format)
    return parsed

In [107]:
# Load the tokenizer and the classification model from the same checkpoint.
tokenizer_loaded = RobertaTokenizer.from_pretrained(
    'zhayunduo/roberta-base-stocktwits-finetuned'
)
model_loaded = RobertaForSequenceClassification.from_pretrained(
    'zhayunduo/roberta-base-stocktwits-finetuned'
)
# nlp is called with a list of texts and returns one dictionary per text,
# each holding a 'label' and a 'score' key:
#   'LABEL_0' means the post is bearish,
#   'LABEL_1' means the post is bullish.
nlp = pipeline(
    task="text-classification",
    model=model_loaded,
    tokenizer=tokenizer_loaded,
)

In [108]:
def process_sentiments(df):
    '''
    A function to add sentiment labels and confidence scores for every post.

    Two columns, 'gen_sentiment' and 'gen_scores', are added to df in
    place, and df is returned. For each post, in order of precedence:

    - If the post is pre-labeled (df.sentiment is neither empty nor
      missing), the label is kept, and a score of 1 is assigned.
    - If the post is not pre-labeled but its preprocessed text is longer
      than 250 characters, it is assigned 'no label' and a score of 1.
    - Otherwise, RoBERTa is run on the preprocessed text, and the post is
      assigned 'bearish' (LABEL_0), 'bullish' (LABEL_1) or 'unsure' (any
      other label) together with the model's score.
    - If handling the post raises an exception, it is assigned 'error'
      and a score of 1.
    '''
    label_names = {'LABEL_0': 'bearish', 'LABEL_1': 'bullish'}
    texts = df.text.apply(process_text)
    gen_sentiment = []
    gen_scores = []
    # Iterate by position so the function does not depend on df having a
    # default 0..n-1 index.
    for labeled, text in zip(df.sentiment, texts):
        try:
            # A missing value (NaN/None) is not a label, even though NaN
            # is truthy.
            if pd.notna(labeled) and labeled:
                label, score = labeled, 1.0
            elif len(text) > 250:
                label, score = 'no label', 1.0
            else:
                result = nlp([text])[0]
                label = label_names.get(result.get('label'), 'unsure')
                score = result.get('score')
        except Exception:
            # Best effort: record the failure and continue with the next
            # post. KeyboardInterrupt is deliberately not caught so a long
            # run can still be stopped.
            label, score = 'error', 1.0
        gen_sentiment.append(label)
        gen_scores.append(score)
    df['gen_sentiment'] = gen_sentiment
    df['gen_scores'] = gen_scores
    return df

In [109]:
def add_final_sentiments(df):
    '''
    Disambiguates sentiment scores by adding a finalized sentiments column.

    The original 'sentiment' column is renamed to 'scraped_sentiment' and
    a new 'sentiment' column is written; df is modified in place and
    returned. Calling the function again on its own output recomputes
    'sentiment' without renaming a second time.

    If a post came pre-labeled (gen_sentiment is 'Bearish' or 'Bullish'),
    this value is taken to be the true final sentiment. Otherwise, we
    compute the median of all scores associated with posts that were
    labeled as either 'bearish' or 'bullish' by the RoBERTa model. For
    those that scored strictly above this median, we assign their
    computed sentiment label (capitalized) to be their finalized label.
    All other posts are assigned the empty string.
    '''
    # Only rename on the first call; a second rename would produce two
    # columns named 'scraped_sentiment'.
    if 'scraped_sentiment' not in df.columns:
        df.rename(columns={"sentiment": "scraped_sentiment"}, inplace=True)

    gen_sen = df['gen_sentiment']
    scores = df['gen_scores']

    model_labeled = gen_sen.isin(['bullish', 'bearish'])
    # With no model-labeled rows there is no threshold; NaN makes every
    # comparison below False without computing a median of nothing.
    median = np.median(scores[model_labeled]) if model_labeled.any() else np.nan
    confident = model_labeled & (scores > median)
    pre_labeled = gen_sen.isin(['Bearish', 'Bullish'])

    # Masks are index-aligned, so this works for any index, not just 0..n-1.
    final_sentiments = pd.Series('', index=df.index, dtype=object)
    final_sentiments = final_sentiments.mask(confident & (gen_sen == 'bullish'), 'Bullish')
    final_sentiments = final_sentiments.mask(confident & (gen_sen == 'bearish'), 'Bearish')
    final_sentiments = final_sentiments.mask(pre_labeled, gen_sen)

    df['sentiment'] = final_sentiments
    return df

In [110]:
def gen_labeled_csvs(stocks, end=None):
    '''
    For each stock, this function generates a csv with
    RoBERTa-generated sentiment labels and associated scores
    (<stock>_RoBERTa.csv), as well as a csv with counts of each label
    by day (<stock>_byday_RoBERTa.csv).

    stocks:  iterable of stock tickers, each with a folder readable by
             process_folder.
    end:     datetime at which the per-day counts stop; defaults to
             2022-05-15 when omitted.
    '''
    if end is None:
        end = datetime(year=2022, day=15, month=5)
    for stock in stocks:
        df = process_folder(stock)
        df = process_sentiments(df)
        df = add_final_sentiments(df)
        df.to_csv(stock+'_RoBERTa.csv', index=False)
        # Start from an empty frame for every stock; a frame shared across
        # iterations would carry the previous stocks' rows into this
        # stock's by-day file.
        df_by_day = pd.DataFrame(columns = [
            'date',
            'bullish',
            'bearish'
        ])
        df_by_day = add_sentiment_by_day(df, df_by_day, end)
        df_by_day.to_csv(stock+'_byday_RoBERTa.csv', index=False)

In [111]:
# Ticker symbols; process_folder reads each one from stocks/<ticker>.
stocks = [
    'AAPL',
    'MSFT',
    'NVDA',
]

In [112]:
# Cut-off date for the per-day counts: 15 May 2022.
end = datetime(2022, 5, 15)

In [113]:
# gen_sentiment_csvs(stocks, end)

In [114]:
# df_by_day = add_sentiment_by_day(df, df_by_day, end)

In [115]:
# df_by_day.to_csv('bydayapple.csv', index=False)

In [116]:
# df_apple = process_folder('AAPL')

In [117]:
# Over 90% of posts are shorter than 250 characters (len counts characters, not words):
# sorted(df_apple.text.apply(len), reverse=True)[12697:12789]

In [118]:
# Quick check of the classifier on a few hand-written example posts.
sentences = pd.Series([
    'just buy',
    'just sell it',
    'entity rocket to the sky!',
    'go down',
    'even though it is going up, I still think it will not keep this trend in the near future',
])
# If the input text contains https links or @, # or $ symbols, it is better
# to preprocess it first to get a more accurate result:
# sentences = list(sentences.apply(process_text))
sentences = sentences.tolist()
results = nlp(sentences)
print(results)  # 2 labels, label 0 is bearish, label 1 is bullish

[{'label': 'LABEL_1', 'score': 0.9919407963752747}, {'label': 'LABEL_0', 'score': 0.9894136190414429}, {'label': 'LABEL_1', 'score': 0.9486343860626221}, {'label': 'LABEL_0', 'score': 0.9953770637512207}, {'label': 'LABEL_0', 'score': 0.6002194285392761}]


In [119]:
# gen_sentiment = []
# gen_scores = []
# for i in range(len(df_apple)):
#     try:
#         labeled = df_apple.sentiment[i]
#         if labeled:
#             gen_sentiment.append(labeled)
#             gen_scores.append(1.0)
#         else:
#             text = texts[i]
#             if len(text) > 250:
#                 gen_sentiment.append('no label')
#                 gen_scores.append(1.0)
#             else:
#                 result = nlp([text])[0]
#                 if result.get('label') == 'LABEL_0':
#                     label = 'bearish'
#                 elif result.get('label') == 'LABEL_1':
#                     label = 'bullish'
#                 else:
#                     label = 'unsure'
#                 score = result.get('score')
#                 gen_sentiment.append(label)
#                 gen_scores.append(score)
#     except:
#         gen_sentiment.append('error')
#         gen_scores.append(1.0)

In [120]:
# df_apple.to_csv('labeled_apple.csv', index=False)

In [121]:
# df_apple['gen_sentiment'] = gen_sentiment

In [122]:
# df_apple['gen_scores'] = gen_scores

In [123]:
# Load the labeled posts stored in labeled_apple.csv.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which
# suggests the file was written with its index - confirm how it was saved.
df_try = pd.read_csv('labeled_apple.csv')

In [124]:
# Display the loaded frame.
df_try

Unnamed: 0,id,text,time,sentiment,gen_sentiment,gen_scores
0,423494210,$AAPL win win win $$$$ \n\nhttps://youtube.com...,2022-01-05 02:12:15,Bullish,Bullish,1.000000
1,423494128,$AAPL winning the battle,2022-01-05 02:11:49,Bullish,Bullish,1.000000
2,423493758,$AAPL Get in on Saitama...easy to buy on LBank...,2022-01-05 02:10:16,,bullish,0.998604
3,423492805,$aapl You know some bear will repost this when...,2022-01-05 02:06:14,Bearish,Bearish,1.000000
4,423492694,$AAPL fake it till you make it. When people ac...,2022-01-05 02:05:47,,bullish,0.989834
...,...,...,...,...,...,...
134391,459188904,$SPY $QQQ $IWM $AAPL shorted the nasdaq from $...,2022-05-13 02:11:18,,bearish,0.835228
134392,459188124,$AAPL u had ur chance.,2022-05-13 02:07:07,Bullish,Bullish,1.000000
134393,459187176,$AAPL $BTC.X $BRGX.X \nMy Top Three Investments!,2022-05-13 02:02:01,Bullish,Bullish,1.000000
134394,459186483,$SPY $IWM $QQQ $AAPL get ready for monster mo...,2022-05-13 01:58:24,,bullish,0.959215


In [125]:
# Rename the scraped labels to 'scraped_sentiment' and add the finalized
# 'sentiment' column, then display the result.
df_try = add_final_sentiments(df_try)
df_try

Unnamed: 0,id,text,time,scraped_sentiment,gen_sentiment,gen_scores,sentiment
0,423494210,$AAPL win win win $$$$ \n\nhttps://youtube.com...,2022-01-05 02:12:15,Bullish,Bullish,1.000000,Bullish
1,423494128,$AAPL winning the battle,2022-01-05 02:11:49,Bullish,Bullish,1.000000,Bullish
2,423493758,$AAPL Get in on Saitama...easy to buy on LBank...,2022-01-05 02:10:16,,bullish,0.998604,Bullish
3,423492805,$aapl You know some bear will repost this when...,2022-01-05 02:06:14,Bearish,Bearish,1.000000,Bearish
4,423492694,$AAPL fake it till you make it. When people ac...,2022-01-05 02:05:47,,bullish,0.989834,Bullish
...,...,...,...,...,...,...,...
134391,459188904,$SPY $QQQ $IWM $AAPL shorted the nasdaq from $...,2022-05-13 02:11:18,,bearish,0.835228,
134392,459188124,$AAPL u had ur chance.,2022-05-13 02:07:07,Bullish,Bullish,1.000000,Bullish
134393,459187176,$AAPL $BTC.X $BRGX.X \nMy Top Three Investments!,2022-05-13 02:02:01,Bullish,Bullish,1.000000,Bullish
134394,459186483,$SPY $IWM $QQQ $AAPL get ready for monster mo...,2022-05-13 01:58:24,,bullish,0.959215,


In [126]:
# Number of posts per finalized label ('' means no final label).
df_try.sentiment.value_counts()

Bullish    63823
           41065
Bearish    29508
Name: sentiment, dtype: int64

In [127]:
# Empty frame with the per-day columns used by add_sentiment_by_day.
df_by_day = pd.DataFrame(columns=['date', 'bullish', 'bearish'])

In [128]:
# Check that the first timestamp string parses with the format used by
# cast_to_datetime.
datetime.strptime(df_try.time[0], "%Y-%m-%d %H:%M:%S")

datetime.datetime(2022, 1, 5, 2, 12, 15)

In [129]:
# Parse the timestamp strings read from the csv. pd.to_datetime converts the
# whole column at once and accepts a column that is already datetime, so
# this cell can be re-run without failing on already-converted values.
df_try['time'] = pd.to_datetime(df_try['time'], format="%Y-%m-%d %H:%M:%S")

In [130]:
# df_try_by_day.to_csv('by_day_apple_labeled.csv')

In [131]:
# Generate the labeled and by-day csv files for NVDA and MSFT.
gen_labeled_csvs(['NVDA', 'MSFT'])