In [35]:
import os
import random as rnd
import trax
import trax.fastmath.numpy as np
from trax import layers as tl
from utils import Layer, load_tweets, process_tweet
import numpy as np

# Importing data

In [36]:
# The two values returned by load_tweets() are unpacked as (positive, negative).
all_positive_tweets, all_negative_tweets = load_tweets()
print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

The number of positive tweets: 5000
The number of negative tweets: 5000


In [37]:
# The first 4000 tweets of each class are used for training;
# everything from index 4000 onward is held out for validation.
train_pos = all_positive_tweets[:4000]
val_pos = all_positive_tweets[4000:]

train_neg = all_negative_tweets[:4000]
val_neg = all_negative_tweets[4000:]

In [38]:
# In both splits the positive tweets come first, followed by the negative ones.
train_x = train_pos + train_neg
val_x = val_pos + val_neg

In [39]:
# Labels in the same order as train_x / val_x (positives first, then negatives):
# 1 marks a positive tweet, 0 marks a negative tweet.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

In [40]:
# Sanity check: report how many tweets ended up in each split.
print(f"length of train_x: {len(train_x)}")
print(f"length of val_x: {len(val_x)}")

length of train_x: 8000
length of val_x: 2000


In [41]:
# tweet before and after preprocessing

In [42]:
# Show one raw training tweet next to the token list process_tweet returns for it.
print("original tweet at training position 0:")
print(train_pos[0])
print('\n')
print(process_tweet(train_pos[0]))

original tweet at training position 0:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [43]:
# building vocabulary mapping each word to an integer index

In [44]:
# Seed the vocabulary with the reserved tokens. Id 0 is the value data_generator
# pads with, and "__UNK__" is the fallback token tweet_to_tensor looks up for
# words that are not in the vocabulary.
# NOTE(review): "__</e>__" is presumably an end-of-sequence marker; it is not
# used by any of the cells shown here — confirm.
Vocab={"__PAD__":0,"__</e>__":1,"__UNK__":2}

In [45]:
# Walk the training tweets and give every previously unseen token the next
# free integer id (the current size of the vocabulary).
for tweet in train_x:
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        # setdefault leaves tokens that already have an id untouched.
        Vocab.setdefault(word, len(Vocab))

In [46]:
# Vocabulary size; the count includes the three reserved tokens.
print("Total words in vocab are: ",len(Vocab))

Total words in vocab are:  9088


In [47]:
# converting a tweet to a list of integer vocabulary indices (one per token; repeated words share the same index)

In [48]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    """Encode a tweet as a list of integer vocabulary ids.

    Args:
        tweet: the tweet to encode; it is passed to process_tweet as-is.
        vocab_dict: mapping from token to integer id. It must contain
            `unk_token`.
        unk_token: key in `vocab_dict` whose id is used for tokens that are
            not in the vocabulary.
        verbose: when true, print the token list produced by process_tweet.

    Returns:
        A Python list with one id per token, in token order.

    Raises:
        KeyError: if `unk_token` is not a key of `vocab_dict`.
    """
    tokens = process_tweet(tweet)

    if verbose:
        print('List of words from the processed tweet: ')
        print(tokens)

    # Resolved once up front so a missing unk_token fails before any encoding.
    fallback_id = vocab_dict[unk_token]

    return [vocab_dict.get(token, fallback_id) for token in tokens]

In [49]:
# Encode one validation tweet. Tokens missing from Vocab come back as the
# "__UNK__" id, so they may show up several times in the result.
print('Actual tweet is:\n',val_pos[0])
print('\nTensor of tweet:\n',tweet_to_tensor(val_pos[0],vocab_dict=Vocab))

Actual tweet is:
 Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan

Tensor of tweet:
 [1065, 136, 479, 2351, 745, 8148, 1123, 745, 53, 2, 2672, 791, 2, 2, 349, 601, 2, 3489, 1017, 597, 4559, 9, 1065, 157, 2, 2]


# Creating a batch generator

In [58]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    """Yield balanced, zero-padded batches of encoded tweets.

    Every batch contains `batch_size // 2` positive tweets followed by the
    same number of negative tweets, each encoded with tweet_to_tensor and
    right-padded with 0 to the length of the longest tweet in the batch.

    Args:
        data_pos: indexable collection of positive tweets.
        data_neg: indexable collection of negative tweets.
        batch_size: requested batch size; must be at least 2. An odd value is
            rounded down to the nearest even number of examples.
        loop: if true, wrap around (reshuffling when `shuffle` is set) each
            time one of the collections is exhausted, so the generator never
            ends. If false, stop as soon as either collection cannot fill its
            half of a batch; the incomplete batch is discarded.
        vocab_dict: token -> id mapping forwarded to tweet_to_tensor.
        shuffle: if true, visit each collection in random order (uses `rnd`).

    Yields:
        Tuples `(inputs, targets, example_weights)`:
            inputs: 2-D array with one padded row of token ids per tweet.
            targets: 1-D array of labels, 1 for the positive half of the
                batch and 0 for the negative half.
            example_weights: array of ones shaped like `targets`.

    Raises:
        ValueError: when the generator is first advanced, if `batch_size` is
            smaller than 2 (no example could be taken from either class).
    """
    n_to_take = batch_size // 2
    if n_to_take < 1:
        raise ValueError(
            f"batch_size must be at least 2 to hold one positive and one "
            f"negative example, got {batch_size}"
        )

    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)

    # Visiting order for each collection; shuffled in place when requested.
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))

    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)

    # Cursors into the visiting orders. They advance exactly once per tweet
    # consumed; there is no additional per-batch jump, so no example is skipped.
    pos_index = 0
    neg_index = 0

    # The label layout is the same for every batch: positives, then negatives.
    target_l = [1] * n_to_take + [0] * n_to_take

    while True:
        batch = []

        # Positive half of the batch.
        for _ in range(n_to_take):
            if pos_index >= len_data_pos:
                if not loop:
                    # Out of positive tweets: drop the partial batch and stop.
                    return
                pos_index = 0
                if shuffle:
                    rnd.shuffle(pos_index_lines)

            tweet = data_pos[pos_index_lines[pos_index]]
            batch.append(tweet_to_tensor(tweet, vocab_dict))
            pos_index += 1

        # Negative half of the batch.
        for _ in range(n_to_take):
            if neg_index >= len_data_neg:
                if not loop:
                    # Out of negative tweets: drop the partial batch and stop.
                    return
                neg_index = 0
                if shuffle:
                    rnd.shuffle(neg_index_lines)

            tweet = data_neg[neg_index_lines[neg_index]]
            batch.append(tweet_to_tensor(tweet, vocab_dict))
            neg_index += 1

        # Right-pad every tweet with 0 up to the longest tweet in this batch so
        # the batch forms a rectangular array.
        max_len = max(len(tensor) for tensor in batch)
        tensor_pad_l = [tensor + [0] * (max_len - len(tensor)) for tensor in batch]

        inputs = np.asarray(tensor_pad_l)
        targets = np.asarray(target_l)

        # Every example carries the same weight.
        example_weights = np.ones_like(targets)

        yield inputs, targets, example_weights

In [61]:
# testing batch generator function

In [60]:
# Fix the seed of the stdlib RNG that data_generator uses for shuffling, so the
# batch printed below is reproducible.
rnd.seed(30) 

def train_generator(batch_size, shuffle = False):
    """Return an endlessly looping batch generator over the training tweets."""
    return data_generator(
        data_pos=train_pos,
        data_neg=train_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle,
    )

def val_generator(batch_size, shuffle = False):
    """Return an endlessly looping batch generator over the validation tweets."""
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle,
    )

def test_generator(batch_size, shuffle = False):
    """Return a batch generator over the validation tweets that does not loop."""
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=False,
        vocab_dict=Vocab,
        shuffle=shuffle,
    )

# Pull a single batch of 4 (2 positive + 2 negative tweets) from a fresh,
# shuffled training generator and print its three components.
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

Inputs: [[2005 4451 3201    9    0    0    0    0    0    0    0]
 [4954  567 2000 1454 5174 3499  141 3499  130  459    9]
 [3761  109  136  583 2930 3969    0    0    0    0    0]
 [ 250 3761    0    0    0    0    0    0    0    0    0]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [62]:
# defining our own layers

In [63]:
# ReLU layer

In [64]:
class Relu(Layer):
    """Layer that applies the rectified linear unit element-wise."""

    def forward(self, X):
        """Return the element-wise maximum of `X` and 0."""
        return np.maximum(X, 0)

In [65]:
# testing ReLU

In [66]:
# Small float test input containing negative, zero and positive entries.
x=np.array([[0,-1,1],[2,-2,0]],dtype=float)

In [67]:
# Instantiate the custom ReLU layer defined above.
relu_layer=Relu()

In [68]:
# Calling the layer on x should leave non-negative entries unchanged and
# replace the negative ones with 0.
print(f'Input array: {x}')
print(f'ReLU activation values: {relu_layer(x)}')

Input array: [[ 0. -1.  1.]
 [ 2. -2.  0.]]
ReLU activation values: [[0. 0. 1.]
 [2. 0. 0.]]


In [70]:
# using fastmath modules within Trax

In [69]:
# From this cell onward `np` refers to Trax's fastmath numpy instead of the
# plain numpy imported at the top of the notebook, and `random` refers to
# fastmath.random (the stdlib random module remains available as `rnd`).
from trax import fastmath
np=fastmath.numpy
random=fastmath.random

In [None]:
class Dense(Layer):
    def __init__(self,n_units,init_stdev=.1):
        self._n_units=n_units
        self._init_stdev=init_stdev
    def forward(self,X):
        dense=np.dot(X,self)