# Extract Twitter Dataset with API

Twitter offers the past seven days of data on its free API tier, so we step back in 60-minute windows and extract up to 100 tweets from each window. This requires a bearer token, which is obtained by registering with Twitter for developer access.

In [1]:
from datetime import datetime, timedelta, timezone
import re

import numpy as np
import pandas as pd
import requests

In [None]:
# Read the Twitter API bearer token from a local file next to the notebook.
with open('bearer_token.txt') as fp:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the Authorization header value.
    BEARER_TOKEN = fp.read().strip()

In [None]:
# Recent-search endpoint plus the headers and query parameters sent with every request.
endpoint = 'https://api.twitter.com/2/tweets/search/recent'
headers = {'authorization': f'Bearer {BEARER_TOKEN}'}
params = {
    'query': '(amazon OR aws OR jeff bezos) (lang:en)',
    'max_results': '100',
    # Field names are comma-separated with no spaces; with a space after the
    # comma the second name is sent as ' lang', which is not a valid field.
    'tweet.fields': 'created_at,lang'}

In [None]:
# Timestamp layout used for the start_time / end_time request parameters,
# e.g. 2022-02-20T22:01:09Z.
dtformat = '%Y-%m-%dT%H:%M:%SZ'

In [None]:
def time_periods(now, mins):
    """Return the timestamp that lies `mins` minutes before `now`.

    Both the `now` argument and the returned value are strings laid out
    according to the module-level `dtformat`.
    """
    window_end = datetime.strptime(now, dtformat)
    window_start = window_end - timedelta(minutes=mins)
    return datetime.strftime(window_start, dtformat)

In [None]:
# dtformat ends in a literal 'Z' (UTC), so the starting point must be taken in
# UTC; formatting local time with that layout shifts every window by the local
# UTC offset. tzinfo is dropped so the value stays comparable with the naive
# datetimes produced by datetime.strptime(..., dtformat).
utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
last_week = utc_now - timedelta(days=7)  # datetime one week ago = the finish line
# Start slightly in the past: the search API is understood to reject an
# end_time that is too close to the request time -- TODO confirm exact margin.
now = (utc_now - timedelta(seconds=30)).strftime(dtformat)  # format for the API

In [None]:
# Start from an empty frame; the collection loop adds the fetched tweets to it.
df = pd.DataFrame()

In [None]:
def twitter_data(tweet):
    """Reduce one tweet payload to the fields kept in the dataset.

    Accepts both payload layouts: API v2 objects, which carry ``id`` and
    ``text``, and v1.1-style objects, which carry ``id_str`` and
    ``full_text``. When a v1.1 key is present it takes precedence, so
    v1.1 payloads produce the same result as before.

    Args:
        tweet: Mapping describing a single tweet.

    Returns:
        dict with the keys ``id``, ``created_at`` and ``text``.

    Raises:
        KeyError: if ``created_at`` is missing, or if neither spelling of
            the id / text key is present.
    """
    return {
        'id': tweet['id_str'] if 'id_str' in tweet else tweet['id'],
        'created_at': tweet['created_at'],
        'text': tweet['full_text'] if 'full_text' in tweet else tweet['text'],
    }

In [None]:
# Walk backwards from `now` in 60-minute windows until a window would start
# before `last_week`, requesting one page of tweets per window.
rows = []  # collect plain dicts and build the frame once, after the loop
while True:
    pre60 = time_periods(now, 60)
    if datetime.strptime(pre60, dtformat) < last_week:
        break  # this window would begin before the earliest allowed time

    params['start_time'] = pre60
    params['end_time'] = now
    response = requests.get(endpoint,
                            params=params,
                            headers=headers)
    # Fail loudly on HTTP errors instead of a later KeyError on the payload.
    response.raise_for_status()
    now = pre60

    # A window without matches has no 'data' key; treat it as zero tweets.
    rows.extend(twitter_data(tweet) for tweet in response.json().get('data', []))

# DataFrame.append no longer exists in pandas 2.x and was quadratic when
# called once per row; add all collected rows in a single step instead.
new_rows = pd.DataFrame(rows)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

# Gathering Financial News

This section uses historical analyst headlines and financial news headlines, spanning several years, for a sample of listed companies. Sentiment analysis is performed with flair, which classifies each headline as positive or negative and reports a probability.

In [2]:
# Mount Google Drive in the Colab runtime under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Drive folder from which the analyst-ratings CSV is read.
root_folder='/content/drive/My Drive/DataAnalysis'

In [4]:
# Read the processed analyst-ratings CSV from the Drive folder, then print a
# summary of its columns, non-null counts and dtypes.
data = pd.read_csv(f'{root_folder}/analyst_ratings_processed.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400469 entries, 0 to 1400468
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   Unnamed: 0  1399180 non-null  float64
 1   title       1400469 non-null  object 
 2   date        1399180 non-null  object 
 3   stock       1397891 non-null  object 
dtypes: float64(1), object(3)
memory usage: 42.7+ MB


In [5]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis='index')

In [6]:
# Keep only the part of each value before the first space (the calendar date of
# strings such as "2020-06-05 10:30:00-04:00"), then parse it as a datetime.
date_only = data['date'].str.split(' ').str[0]
data['date'] = pd.to_datetime(date_only, format='%Y-%m-%d')

In [7]:
# Remove the 'Unnamed: 0' column.
data = data.drop(columns='Unnamed: 0')

## Flair / DistilBERT

This model splits the text into character-level tokens and uses the DistilBERT model to make predictions. The advantage of working at the character-level (as opposed to word-level) is that words that the network has never seen before can still be assigned a sentiment. DistilBERT is a distilled version of the powerful BERT transformer model.

In [8]:
import flair
# Load flair's pre-trained 'en-sentiment' text classifier.
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

2022-02-20 22:01:09,439 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [10]:
# Work on a random subset of 1,000 headlines. The fixed random_state makes the
# subset, and everything computed from it, identical on every run.
data = data.sample(n=1000, replace=False, random_state=42)
# text_cleaner is defined in the "Financial News Analysis" section of this notebook.
data['text_clean'] = data['title'].apply(text_cleaner)

In [12]:
# Wrap each cleaned headline in a flair Sentence.
data['text_obj'] = data['text_clean'].apply(flair.data.Sentence)
# predict() annotates the Sentence objects in place, so hand it the whole list
# in one call rather than invoking it once per row through apply().
sentiment_model.predict(data['text_obj'].tolist())
# A headline that cleans down to nothing may end up without a label; record a
# missing score/value for it instead of failing on labels[0].
data['score'] = data['text_obj'].apply(lambda x : x.labels[0].score if x.labels else np.nan)
data['value'] = data['text_obj'].apply(lambda x : x.labels[0].value if x.labels else None)

In [13]:
# Display the sampled frame together with the new score and value columns.
data

Unnamed: 0,title,date,stock,text_clean,text_obj,score,value
336161,Diebold's Planned Acquisition Of Wincor Nixdor...,2016-05-31,DBD,diebold s planned acquisition of wincor nixdor...,"(Token: 1 diebold, Token: 2 s, Token: 3 planne...",0.556393,POSITIVE
570059,Atlantic Equities Downgrades Garmin to Neutral...,2015-02-20,GRMN,atlantic equities downgrades garmin to neutral...,"(Token: 1 atlantic, Token: 2 equities, Token: ...",0.998690,NEGATIVE
1264993,TherapeuticsMD Reports Resubmission Of NDA For...,2017-11-29,TXMD,therapeuticsmd reports resubmission of nda for...,"(Token: 1 therapeuticsmd, Token: 2 reports, To...",0.984767,POSITIVE
478804,FAA To Streamline Fire Regulations For Cargo C...,2019-07-03,FDX,faa to streamline fire regulations for cargo c...,"(Token: 1 faa, Token: 2 to, Token: 3 streamlin...",0.790327,POSITIVE
1232600,"Earnings Scheduled For February 25, 2014",2014-02-25,TOL,earnings scheduled for february,"(Token: 1 earnings, Token: 2 scheduled, Token:...",0.872605,POSITIVE
...,...,...,...,...,...,...,...
1036042,American Petro-Hunter's Sacramento Gas Project...,2009-08-10,Q,american petro hunter s sacramento gas project...,"(Token: 1 american, Token: 2 petro, Token: 3 h...",0.612436,POSITIVE
1119241,Morning Market Losers,2013-06-10,SHI,morning market losers,"(Token: 1 morning, Token: 2 market, Token: 3 l...",0.999858,NEGATIVE
555722,Mid-Morning Market Update: Markets Open Lower;...,2016-06-10,GNW,mid morning market update markets open lower ...,"(Token: 1 mid, Token: 2 morning, Token: 3 mark...",0.995033,NEGATIVE
137174,Shares of several Brazilian bank stocks tradin...,2019-03-28,BBD,shares of several brazilian bank stocks tradin...,"(Token: 1 shares, Token: 2 of, Token: 3 severa...",0.992516,POSITIVE


## FinBERT

*   Original BERT training Data English Wikipedia and BookCorpus (Zhu et al., 2015)
*   Finance Articles from Yahoo Finance
*   Financial News from Financial Web
*   Question-Answer pairs about financial issues from Reddit

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the 'yiyanghkust/finbert-tone' checkpoint as a 3-label sequence
# classifier, together with the tokenizer published under the same name.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [None]:
# Class index -> sentiment name, used to turn the FinBERT argmax into a label.
labels = {
    0: 'neutral',
    1: 'positive',
    2: 'negative',
}

In [None]:
# Tokenise every cleaned headline into the tensors passed to the model.
data['text_obj_finBERT'] = data['text_clean'].apply(lambda x : tokenizer(x, return_tensors="pt", padding=True))


def _finbert_label(encoding):
    """Run FinBERT on one tokenised headline and return its sentiment name."""
    # The first element of the model output holds the per-class scores.
    logits = finbert(**encoding)[0]
    # int() turns the numpy index into a plain key for the labels dict.
    return labels[int(np.argmax(logits.detach().numpy()))]


data['sentiment_finBERT'] = data['text_obj_finBERT'].apply(_finbert_label)

# Financial News Analysis

In [None]:
# Lookup table of English contractions -> expanded forms. text_cleaner looks up
# each space-separated token here after lower-casing the text, so only the
# lower-case keys can match on that path.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [9]:
def text_cleaner(text, contractions=None):
    """Normalise a headline or tweet into lower-case, letters-only text.

    Steps, in order: lower-case the text; drop double quotes; drop
    parenthesised spans; expand contractions token by token; remove
    http/https URLs and @mentions; replace every remaining character that is
    neither an ASCII letter nor whitespace with a space. Runs of spaces that
    result from these removals are left as they are.

    Args:
        text: The string to clean.
        contractions: Optional mapping of contraction -> expansion. When
            omitted, the module-level ``contraction_mapping`` is used.

    Returns:
        The cleaned string.
    """
    mapping = contraction_mapping if contractions is None else contractions

    cleaned = text.lower()
    cleaned = cleaned.replace('"', '')
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = ' '.join(mapping.get(token, token) for token in cleaned.split(' '))
    # 'https?' makes the 's' optional so plain http:// links are removed too,
    # and '\S+' consumes the whole link, including any '?query=string' part.
    cleaned = re.sub(r'(?i)https?://\S+', '', cleaned)
    cleaned = re.sub(r'(?i)@[a-z0-9_]+', '', cleaned)
    # Raw string: '\s' inside a normal string literal is an invalid escape.
    cleaned = re.sub(r'[^a-zA-Z\s]', ' ', cleaned)

    return cleaned