In [8]:
!pip install torch

Collecting torch
  Downloading torch-1.8.1-cp38-cp38-win_amd64.whl (190.5 MB)
Installing collected packages: torch
Successfully installed torch-1.8.1


In [1]:
import pandas as pd
import torch
import numpy
import torch.nn as nn

In [2]:
twitter_df = pd.read_csv('Data/twitter_data_clean.csv')
twitter_df['tweet'].head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0    episode 13 crypto thunderdome part 1 prize 50 ...
1    grt chart going off the screen lambo soon btc ...
2    live bitcoin trading with deribot on deribit e...
3    deribot daily trading report 20122020 1107 utc...
4    learned about stellar amp earned xlm in return...
Name: tweet, dtype: object

In [3]:
training_df = pd.read_csv('Data/training_data_clean.csv')
training_df = training_df.sample(n=100000, random_state=100)
training_df.head()

Unnamed: 0,sentiment,tweet
975371,4,i did not realise you had a monome cool stuff
1276867,4,watching tv texting sitting with my bro and bf...
103630,0,why would i dload an app called the quotmoron ...
482837,0,i feel so bad for him
219956,0,can not sleepagaintoo much rattling around in ...


In [4]:
training_df = training_df.dropna()

In [5]:
list_of_tweets = training_df['tweet'].tolist()
from collections import Counter

all_text = ' '.join(list_of_tweets)
words = all_text.split()

count_words = Counter(words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

In [6]:
import numpy as np

# Return features of tweet ints, where each tweet is padded with 0's or truncated to the input seq_length.
def pad_features(tweet_int, seq_length):
        
    tweet_len = len(tweet_int)
        
    if tweet_len <= seq_length:
        zeroes = list(np.zeros(seq_length-tweet_len))
        new = zeroes+tweet_int
    elif tweet_len > seq_length:
        new = tweet_int[0:seq_length]
            
    return np.asarray(new)


In [7]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    for word in test_words:
        if word in vocab_to_int:
            test_ints.append(vocab_to_int[word])
    #test_ints.append([vocab_to_int[word] for word in test_words])
    
    return test_ints

In [8]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (False):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [9]:
vocab_size = 67394
output_size = 1
embedding_dim = 400
hidden_dim = 256

n_layers = 2
model = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

model.load_state_dict(torch.load('sentiment_analysis.pt'))

<All keys matched successfully>

In [10]:
def predict(net, test_review, sequence_length=20):
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(False):
        feature_tensor = feature_tensor.cuda()

    feature_tensor = feature_tensor.unsqueeze(0)

    
    # get the output from the model
    output, h = net(feature_tensor.to(torch.int64), h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        return 1
    else:
        return 0

In [11]:
print(predict(model, "im so upset"))

Prediction value, pre-rounding: 0.215993
0


In [12]:
twitter_df['sentiment'] = twitter_df.apply(lambda row: predict(model, row['tweet']), axis=1)

on value, pre-rounding: 0.579213
Prediction value, pre-rounding: 0.594472
Prediction value, pre-rounding: 0.142449
Prediction value, pre-rounding: 0.671367
Prediction value, pre-rounding: 0.257072
Prediction value, pre-rounding: 0.774881
Prediction value, pre-rounding: 0.116276
Prediction value, pre-rounding: 0.907602
Prediction value, pre-rounding: 0.671367
Prediction value, pre-rounding: 0.651562
Prediction value, pre-rounding: 0.007730
Prediction value, pre-rounding: 0.651562
Prediction value, pre-rounding: 0.923714
Prediction value, pre-rounding: 0.219093
Prediction value, pre-rounding: 0.755192
Prediction value, pre-rounding: 0.007730
Prediction value, pre-rounding: 0.907602
Prediction value, pre-rounding: 0.174183
Prediction value, pre-rounding: 0.826274
Prediction value, pre-rounding: 0.024387
Prediction value, pre-rounding: 0.377085
Prediction value, pre-rounding: 0.977723
Prediction value, pre-rounding: 0.965866
Prediction value, pre-rounding: 0.977723
Prediction value, pre-ro

In [13]:
twitter_df.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,sentiment
0,64835,1340558190652506112,1340558190652506112,2020-12-20 02:22:33,2020-12-20,02:22:33,-500,15061990,tjeezyyy,"jeezy, a love story 🤰 tokennotneeded.com",...,,,,,[],,,,,1
1,24163,1340590202562080768,1340542774454587392,2020-12-20 04:29:45,2020-12-20,04:29:45,-500,1277617350745796610,bittradebtc,Bittrade,...,,,,,[],,,,,1
2,24035,1340616069153771521,1340616069153771521,2020-12-20 06:12:32,2020-12-20,06:12:32,-500,1624918207,deribotinfo,DeriBot.info,...,,,,,[],,,,,0
3,12100,1340617072448069632,1340617072448069632,2020-12-20 06:16:32,2020-12-20,06:16:32,-500,1624918207,deribotinfo,DeriBot.info,...,,,,,[],,,,,1
4,64728,1340637885326741504,1340637885326741504,2020-12-20 07:39:14,2020-12-20,07:39:14,-500,2231159425,thepixiepost,Jenn Nieto 🐝I ♥️ CTP & HIVE🐝,...,,,,,[],,,,,0


In [14]:
twitter_df.to_csv('twitter_labeled.csv')

In [16]:
reddit_df = pd.read_csv('Data/reddit_data.csv')
reddit_df['body'].head()

0                       We can only hope my friend
1        We’re taking this rocket to the moon baby
2                                    all in bb 🚀🚀🚀
3    I love monero. Even more than I love stellar.
4              10 dollar a coin would be very nice
Name: body, dtype: object

In [18]:
reddit_df['sentiment'] = reddit_df.apply(lambda row: predict(model, row['body']), axis=1)

on value, pre-rounding: 0.850697
Prediction value, pre-rounding: 0.903862
Prediction value, pre-rounding: 0.951696
Prediction value, pre-rounding: 0.063387
Prediction value, pre-rounding: 0.651562
Prediction value, pre-rounding: 0.393082
Prediction value, pre-rounding: 0.068412
Prediction value, pre-rounding: 0.850283
Prediction value, pre-rounding: 0.923404
Prediction value, pre-rounding: 0.349279
Prediction value, pre-rounding: 0.140941
Prediction value, pre-rounding: 0.632677
Prediction value, pre-rounding: 0.035853
Prediction value, pre-rounding: 0.024387
Prediction value, pre-rounding: 0.563108
Prediction value, pre-rounding: 0.597095
Prediction value, pre-rounding: 0.301484
Prediction value, pre-rounding: 0.676059
Prediction value, pre-rounding: 0.928734
Prediction value, pre-rounding: 0.781250
Prediction value, pre-rounding: 0.891269
Prediction value, pre-rounding: 0.169492
Prediction value, pre-rounding: 0.601468
Prediction value, pre-rounding: 0.601468
Prediction value, pre-ro

In [19]:
reddit_df.head()

Unnamed: 0.1,Unnamed: 0,author,body,created_utc,created,created_at,sentiment
0,0,ExcitingRelease95,We can only hope my friend,1611205194,1611223194,1/20/2021 23:59,0
1,1,GoobyBTC,We’re taking this rocket to the moon baby,1611204949,1611222949,1/20/2021 23:55,1
2,2,beans_lel,all in bb 🚀🚀🚀,1611204458,1611222458,1/20/2021 23:47,0
3,3,9107201999,I love monero. Even more than I love stellar.,1611204333,1611222333,1/20/2021 23:45,1
4,4,ljayyskux,10 dollar a coin would be very nice,1611202623,1611220623,1/20/2021 23:17,1


In [20]:
reddit_df.to_csv('reddit_labeled.csv')