In [2]:
import pandas as pd
from pathlib import Path
from transformers import BertModel, BertTokenizer
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
data_path = Path('data/AAPL.csv')
weights_path = Path('models/model.1600000.processed.noemoticon.bin')
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
PRE_TRAINED_MODEL_PATH = Path(r'models/bert-base-cased/')

In [5]:
df = pd.read_csv(data_path)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,User,Text,Date,Favorites,Retweets,Mentions,Hashtags,Geolocation
0,0,jerry,patting myself on the back for buying aapl yes...,2007-01-09 20:06:02+00:00,0,0,,,
1,1,topa,wishes he had AAPL stock to sell,2007-01-09 18:28:48+00:00,0,0,,,
2,2,blakeburris,Who's watching AAPL live??? BUY!!!,2007-01-09 18:00:24+00:00,0,0,,,
3,3,kenwalker,macworld Obsessively refreshing macrumorslive....,2007-01-09 17:00:29+00:00,0,0,,,
4,4,macworld,kenwalker: Obsessively refreshing macrumorsliv...,2007-01-09 17:00:20+00:00,0,0,,,


In [7]:
class_names = ['negative', 'positive']

In [8]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [9]:
class SentimentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_PATH)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [10]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [11]:
model.load_state_dict(torch.load(weights_path))

<All keys matched successfully>

In [12]:
MAX_LEN=70

In [34]:
def encode_tweet(tweet_text):
    encoded_tweet = tokenizer.encode_plus(
        tweet_text,
        max_length=MAX_LEN,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoded_tweet

In [35]:
def predict_sentiment(tweet_text, model, device):    
    encoded_tweet = encode_tweet(tweet_text)
    input_ids = encoded_tweet['input_ids'].to(device)
    attention_mask = encoded_tweet['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    return prediction

In [45]:
prediction = predict_sentiment(df.Text[0], model, device)
print(f'Tweet text: {tweet_text}\nSentiment: {class_names[prediction]}')

Tweet text: patting myself on the back for buying aapl yesterday morning
Sentiment: positive


Tweet text: patting myself on the back for buying aapl yesterday morning
Sentiment: positive


In [69]:
df['Sentiment'] = df.Text.map(lambda x: predict_sentiment(x, model, device).cpu().data.numpy())

In [70]:
df.head()

Unnamed: 0.1,Unnamed: 0,User,Text,Date,Favorites,Retweets,Mentions,Hashtags,Geolocation,Sentiment
0,0,jerry,patting myself on the back for buying aapl yes...,2007-01-09 20:06:02+00:00,0,0,,,,[1]
1,1,topa,wishes he had AAPL stock to sell,2007-01-09 18:28:48+00:00,0,0,,,,[0]
2,2,blakeburris,Who's watching AAPL live??? BUY!!!,2007-01-09 18:00:24+00:00,0,0,,,,[1]
3,3,kenwalker,macworld Obsessively refreshing macrumorslive....,2007-01-09 17:00:29+00:00,0,0,,,,[1]
4,4,macworld,kenwalker: Obsessively refreshing macrumorsliv...,2007-01-09 17:00:20+00:00,0,0,,,,[1]


In [71]:
df.to_csv('data/AAPL_classified.csv')