### Build a Model. Ideally we will use some SOTA model

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import WhitespaceTokenizer as wst
np.random.seed(7)
import re

In [2]:
tweets = pd.read_csv('final.csv')
tweets.drop(columns='Unnamed: 0',inplace=True)

FileNotFoundError: [Errno 2] File final.csv does not exist: 'final.csv'

In [3]:
tweets.head()

Unnamed: 0,sentiment,text
0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [4]:
#### Encoding the Sentiments 

tweets.loc[tweets.sentiment == "Neutral", 'Sentiment_C'] = int(0)
tweets.loc[tweets.sentiment == "Positive", 'Sentiment_C'] = int(0)
tweets.loc[tweets.sentiment == "Negative", 'Sentiment_C'] = int(1)

In [105]:
tweets.iloc[6].text

'RT @warriorwoman91: I liked her and was happy when I heard she was going to be the moderator. Not anymore. #GOPDebate @megynkelly https://…'

In [6]:
### Text Cleaning 

# tweets.text = tweets.text.str.lower()

tweets.text = tweets.text.str.replace("&amp","")
tweets.text = tweets.text.str.replace("\n"," ")
tweets.text = tweets.text.str.replace("\\\\"," ")

### Deleting Multiple Spaces 

tweets.text =  [re.sub(' +', ' ', str(x)) for x in tweets['text']]

In [60]:
len(tweets)

13871

In [64]:
#### Reserve 1000 Tweets for Testing
test_tweets = tweets.sample(1000)

In [67]:
test_tweets.index

Int64Index([12520,  4509, 10923, 11469, 10989,   145,  9017, 13199,  6570,
             8586,
            ...
            10936,  1712,  1557,  8282,  8875,  1366, 10344,  5821,  5737,
             2958],
           dtype='int64', length=1000)

In [68]:
tweets.drop(test_tweets.index,axis=0,inplace=True)

In [69]:
len(tweets)

12871

In [7]:
### Creating a common tweet corpus list

all_tweets = tweets.text.tolist()

In [8]:
#### Tokenize and get unique tokens 
tokens = wst().tokenize(str(all_tweets))
unique_tokens = list(set(tokens))
unique_tokens.sort()

In [9]:
from collections import Counter
all_characters = list(str(all_tweets))
l_sorted = Counter(all_characters).most_common()

In [10]:
#### Creating a Char Dictionary 
char_dict = dict()
char_dict['<padding>'] = 0
char_dict['<start>'] = 1
char_dict['<unknown>'] = 2

counter = 3

#### Selected 

vocabulary_size = 200


for i in range(int(vocabulary_size)):
    char_dict[l_sorted[i][0]] = counter
    counter = counter + 1

In [11]:
#### Function to Create Character encodings
def encode_tweets_char(string): 
    encoded_char_list = list()
    encoded_char_list.append("1")
    for c in list(string):
        try:
            encoded_char_list.append(str((char_dict[c])))
        except KeyError:
            encoded_char_list.append(str(char_dict['<unknown>']))
        
    return encoded_char_list

In [12]:
# Reverse dictionary to get words from numbers
reverse_char_dict = dict(zip(char_dict.values(), char_dict.keys()))

In [13]:
#### Decoding Function - To convert the encoded tweet back to character (sequence)
def decode_tweets_char(list_enc): 
    for i in list_enc:
        print(reverse_char_dict[int(i)])

In [14]:
enc = encode_tweets_char(tweets.text[19])

In [15]:
enc

['1',
 '33',
 '31',
 '3',
 '34',
 '28',
 '6',
 '18',
 '4',
 '13',
 '6',
 '24',
 '4',
 '13',
 '13',
 '4',
 '11',
 '35',
 '3',
 '50',
 '15',
 '17',
 '36',
 '6',
 '16',
 '4',
 '4',
 '35',
 '3',
 '28',
 '6',
 '23',
 '8',
 '9',
 '22',
 '3',
 '32',
 '7',
 '11',
 '3',
 '5',
 '11',
 '6',
 '9',
 '10',
 '22',
 '4',
 '9',
 '14',
 '4',
 '11',
 '3',
 '10',
 '15',
 '11',
 '22',
 '4',
 '11',
 '23',
 '3',
 '32',
 '7',
 '11',
 '3',
 '10',
 '7',
 '13',
 '14',
 '8',
 '4',
 '11',
 '10',
 '26',
 '3',
 '10',
 '6',
 '8',
 '13',
 '7',
 '11',
 '10',
 '3',
 '6',
 '9',
 '14',
 '3',
 '6',
 '8',
 '11',
 '18',
 '4',
 '9',
 '3',
 '14',
 '7',
 '4',
 '10',
 '3',
 '9',
 '7',
 '5',
 '3',
 '18',
 '6',
 '36',
 '4',
 '3',
 '7',
 '15',
 '11',
 '3',
 '17',
 '7',
 '15',
 '9',
 '5',
 '11',
 '23',
 '3',
 '10',
 '6',
 '32',
 '4',
 '11',
 '3',
 '21',
 '50',
 '6',
 '3',
 '21',
 '24',
 '27',
 '28',
 '25',
 '4',
 '16',
 '6',
 '5',
 '4']

In [16]:
#### Encoding all the tweets 

tweets['Encoded'] = tweets.text.apply(encode_tweets_char)

In [17]:
x_train = tweets.Encoded.values
y_train = tweets.Sentiment_C.values

In [18]:
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence

# Selecting Max Character Limit
character_limit = 151 

# Padding all the sequences
x_train = sequence.pad_sequences(x_train, padding = 'pre', maxlen=character_limit, truncating = 'post')

In [19]:
tf.__version__

'2.0.0'

In [20]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train, test_size = 0.15, random_state = 7)

In [21]:
### Converting Y to catagorical 
from tensorflow.keras.utils import to_categorical

y_train_binary = to_categorical(y_train_split)
y_test_binary = to_categorical(y_test_split)

In [23]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping


In [26]:
Nadam = optimizers.Nadam(lr = 0.00001)

model1=Sequential()
model1.add(Embedding(vocabulary_size+3, 100 , input_length=character_limit,trainable = True))
model1.add(SimpleRNN(128,unroll=True,return_sequences = True))
model1.add(SimpleRNN(40,unroll=True))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss = 'sparse_categorical_crossentropy', optimizer = Nadam, metrics = ['accuracy'])

cb = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=2,
                              verbose=0, mode='auto')]
    
model1.fit(x_train_split, y_train_split, validation_data=(x_test_split,y_test_split), batch_size=150, epochs=15,callbacks = cb ) 

Train on 11790 samples, validate on 2081 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1a56768890>

In [70]:
model1.save('Tweets_Classifier.h5')

In [119]:
test_tweets.head()

Unnamed: 0,sentiment,text,Sentiment_C,Encoded
12520,Negative,Damn... I'd turn Republican for this. #GOPDeba...,1.0,"[1, 25, 6, 18, 9, 30, 30, 30, 3, 41, 19, 14, 3..."
4509,Negative,RT @bobcesca_go: As if the rest of the debate ...,1.0,"[1, 33, 31, 3, 34, 16, 7, 16, 17, 4, 10, 17, 6..."
10923,Negative,I would expect this farce from the BOTTOM-tier...,1.0,"[1, 41, 3, 29, 7, 15, 13, 14, 3, 4, 49, 20, 4,..."
11469,Neutral,Straight Outta Compton commercial on Fox News....,0.0,"[1, 40, 5, 11, 6, 8, 22, 12, 5, 3, 27, 15, 5, ..."
10989,Negative,RT @RWSurferGirl: Is it just me or does anyone...,1.0,"[1, 33, 31, 3, 34, 33, 42, 40, 15, 11, 32, 4, ..."


In [75]:
from tensorflow.keras import models

In [77]:
model = models.load_model("Tweets_Classifier.h5")

Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x1a4ecf0b90>
Traceback (most recent call last):
  File "/Users/shantanu/opt/anaconda3/envs/LEXACQ/lib/python3.7/site-packages/tensorflow_core/python/data/ops/iterator_ops.py", line 541, in __del__
    handle=self._handle, deleter=self._deleter)
  File "/Users/shantanu/opt/anaconda3/envs/LEXACQ/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_dataset_ops.py", line 1157, in delete_iterator
    "DeleteIterator", handle=handle, deleter=deleter, name=name)
  File "/Users/shantanu/opt/anaconda3/envs/LEXACQ/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 793, in _apply_op_helper
    op_def=op_def)
  File "/Users/shantanu/opt/anaconda3/envs/LEXACQ/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 544, in create_op
    inp = self.capture(inp)
  File "/Users/shantanu/opt/anaconda3/envs/LEXACQ/lib/python3.7/site-packages/tensorflow_core/python/framewo

In [78]:
type(eval_enc)

numpy.ndarray

In [79]:
eval_enc = encode_tweets_char("I liked her and was happy when I heard she was going to be the moderator. Not anymore")
eval_enc = sequence.pad_sequences([eval_enc], padding = 'pre', maxlen=character_limit, truncating = 'post')

In [80]:
len(eval_enc[0])

151

In [81]:
model.predict(eval_enc)

array([[0.47523627, 0.5247637 ]], dtype=float32)

In [87]:
#### Testing and Abalation 

In [131]:
from nltk.tokenize import word_tokenize
import string 
from nltk.corpus import stopwords 
#requires Tensorflow 2.0 and Tf-hub 0.7!!
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [132]:
module_url = "https://tfhub.dev/google/elmo/3"
embed = hub.load(module_url)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [133]:
text=tf.convert_to_tensor(["I am feeling kind of blue", "Blue is the color of the sky","I am feeling kind of blue"])
out = embed.signatures['default'](text)['elmo']

In [135]:
out.shape

TensorShape([3, 7, 1024])

In [100]:
def preprocess_text(text_eval):
    eval_enc = encode_tweets_char(text)
    eval_enc = sequence.pad_sequences([eval_enc], padding = 'pre', maxlen=character_limit, truncating = 'post')
    return eval_enc

In [171]:
test_tweets.reset_index(inplace=True)

In [173]:
test_tweets.head()

Unnamed: 0,index,sentiment,text,Sentiment_C,Encoded
0,12520,Negative,Damn... I'd turn Republican for this. #GOPDeba...,1.0,"[1, 25, 6, 18, 9, 30, 30, 30, 3, 41, 19, 14, 3..."
1,4509,Negative,RT @bobcesca_go: As if the rest of the debate ...,1.0,"[1, 33, 31, 3, 34, 16, 7, 16, 17, 4, 10, 17, 6..."
2,10923,Negative,I would expect this farce from the BOTTOM-tier...,1.0,"[1, 41, 3, 29, 7, 15, 13, 14, 3, 4, 49, 20, 4,..."
3,11469,Neutral,Straight Outta Compton commercial on Fox News....,0.0,"[1, 40, 5, 11, 6, 8, 22, 12, 5, 3, 27, 15, 5, ..."
4,10989,Negative,RT @RWSurferGirl: Is it just me or does anyone...,1.0,"[1, 33, 31, 3, 34, 33, 42, 40, 15, 11, 32, 4, ..."


In [300]:
### Abalation and Get the Vectors of the important trigrams 

pred_vals_array = np.empty((0,2), float)
NAE_vals_array = np.empty((1,3,1024), float)
# NAE_vals_array = []
PAE_vals_array = np.empty((3,1024), float)

for index, row in test_tweets.iterrows():
    
    text = row.text.translate(str.maketrans('','',string.punctuation))
    tokens = word_tokenize(text)
    tokens_without_sw = [word for word in tokens if not word in stopwords.words()]
    
    ### First Get the overall Classification
    
    pred_vlaues = model.predict(preprocess_text(str(tokens_without_sw)))
    
    classification = pred_vlaues.argmax()
    
    if classification == 0 and row.Sentiment_C == 0:
        mode = 'PAE' # Positive AutoEncoder
    elif classification == 0 and row.Sentiment_C == 1:
        mode = 'NAE'
    elif classification == 1 and row.Sentiment_C == 0:
        mode = 'PAE'
    elif classification == 1 and row.Sentiment_C == 1:
        mode = 'NAE'
    
    for i in range(len(tokens_without_sw)-2):
        
        text_eval = tokens_without_sw[i:i+3]

        a = preprocess_text(str(text_eval))
        pred_vlaues = model.predict(a)
        pred_vals_array = np.vstack((pred_vals_array,pred_vlaues))
        
        if mode == "NAE":

            # Only Negative Vals, and find the index of the highest contributing phrase

            phrase_start_index = pred_vals_array[:,1].argmax()

            ### Get the ELMO Encoding of the entire vector:

            text_elmo = row.text

            text=tf.convert_to_tensor([str(tokens_without_sw)])
            out = embed.signatures['default'](text)['elmo']

            elmo_vecs = np.array(out[0][phrase_start_index:phrase_start_index+3])

            print("B",elmo_vecs.shape)
            
            
#             NAE_vals_array = np.dstack((NAE_vals_array, elmo_vecs))
            
            print(np.append(NAE_vals_array, np.atleast_3d(elmo_vecs), axis=0).shape)
            
            print("",NAE_vals_array.shape)

#             NAE_vals_array.append(elmo_vecs)

        elif mode == "PAE":

            # Only Positive Vals, and find the index of the highest contributing phrase

            phrase_start_index = pred_vals_array[:,0].argmax()

            ### Get the ELMO Encoding of the entire vector:

            text_elmo = row.text

            text=tf.convert_to_tensor([str(tokens_without_sw)])
            out = embed.signatures['default'](text)['elmo']

            PAE_vals_array = np.vstack((PAE_vals_array,out[0][phrase_start_index:phrase_start_index+3]))

        

B (3, 1024)


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 3 and the array at index 1 has size 1024

In [285]:
NAE_vals_array.shape

(81, 1024)

In [286]:
(int(NAE_vals_array.shape[0])/int(3))

27.0

In [272]:
aa = NAE_vals_array.reshape(int(NAE_vals_array.shape[0]/3),3,1024)

In [273]:
aa.shape

(3, 3, 1024)

In [225]:
len(out[0][0])

1024

In [231]:
NAE_vals_array[0].shape

(3, 1024)

In [241]:
aa = np.array(NAE_vals_array)

In [238]:
aa[0]

array([[-0.73210824, -0.3453857 , -0.124824  , ..., -0.18160583,
         0.4167027 ,  0.24779648],
       [-0.21743327, -0.1413041 , -0.01641104, ...,  0.09404293,
         0.06934943, -0.25008377],
       [-0.15964414, -0.01390902, -0.16066319, ...,  0.564662  ,
         0.42623278,  0.57753956]], dtype=float32)

In [150]:
out[0][0:2]

<tf.Tensor: id=85024, shape=(2, 1024), dtype=float32, numpy=
array([[-0.73210824, -0.3453857 , -0.124824  , ..., -0.18160583,
         0.4167027 ,  0.24779648],
       [-0.21743327, -0.1413041 , -0.01641104, ...,  0.09404293,
         0.06934943, -0.25008377]], dtype=float32)>

In [127]:
pred_vals_array

array([[0.63093668, 0.36906329],
       [0.7150175 , 0.28498253],
       [0.7244342 , 0.2755658 ],
       [0.67767799, 0.32232198]])

In [249]:
# lstm autoencoder to recreate a timeseries
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

In [51]:
'''
A UDF to convert input data into 3-D
array as required for LSTM network.
'''

def temporalize(X, y, lookback):
    output_X = []
    output_y = []
    for i in range(len(X)-lookback-1):
        t = []
        for j in range(1,lookback+1):
            # Gather past records upto the lookback period
            t.append(X[[(i+j+1)], :])
        output_X.append(t)
        output_y.append(y[i+lookback+1])
    return output_X, output_y

In [52]:
# define input timeseries
timeseries = np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                       [0.1**3, 0.2**3, 0.3**3, 0.4**3, 0.5**3, 0.6**3, 0.7**3, 0.8**3, 0.9**3]]).transpose()

timesteps = timeseries.shape[0]
n_features = timeseries.shape[1]
timeseries

array([[0.1  , 0.001],
       [0.2  , 0.008],
       [0.3  , 0.027],
       [0.4  , 0.064],
       [0.5  , 0.125],
       [0.6  , 0.216],
       [0.7  , 0.343],
       [0.8  , 0.512],
       [0.9  , 0.729]])

In [53]:
timesteps = 3
X, y = temporalize(X = timeseries, y = np.zeros(len(timeseries)), lookback = timesteps)

n_features = 2
X = np.array(X)
X = X.reshape(X.shape[0], timesteps, n_features)

X

array([[[0.3  , 0.027],
        [0.4  , 0.064],
        [0.5  , 0.125]],

       [[0.4  , 0.064],
        [0.5  , 0.125],
        [0.6  , 0.216]],

       [[0.5  , 0.125],
        [0.6  , 0.216],
        [0.7  , 0.343]],

       [[0.6  , 0.216],
        [0.7  , 0.343],
        [0.8  , 0.512]],

       [[0.7  , 0.343],
        [0.8  , 0.512],
        [0.9  , 0.729]]])

In [54]:
# define model
model = Sequential()
model.add(LSTM(1024, activation='relu', input_shape=(timesteps,n_features), return_sequences=True))
model.add(LSTM(720, activation='relu', return_sequences=True))
model.add(LSTM(200, activation='relu', return_sequences=False))
model.add(RepeatVector(timesteps))
model.add(LSTM(200, activation='relu', return_sequences=True))
model.add(LSTM(720, activation='relu', return_sequences=True))
model.add(LSTM(1024, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(n_features)))
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_24 (LSTM)               (None, 3, 1024)           4206592   
_________________________________________________________________
lstm_25 (LSTM)               (None, 3, 720)            5025600   
_________________________________________________________________
lstm_26 (LSTM)               (None, 200)               736800    
_________________________________________________________________
repeat_vector_6 (RepeatVecto (None, 3, 200)            0         
_________________________________________________________________
lstm_27 (LSTM)               (None, 3, 200)            320800    
_________________________________________________________________
lstm_28 (LSTM)               (None, 3, 720)            2652480   
_________________________________________________________________
lstm_29 (LSTM)               (None, 3, 1024)          

In [55]:
# fit model
model.fit(X, X, epochs=300, batch_size=5, verbose=1)
# demonstrate reconstruction
yhat = model.predict(X, verbose=0)
print('---Predicted---')
print(np.round(yhat,3))
print('---Actual---')
print(np.round(X, 3))

Train on 5 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Ep

In [8]:
#requires Tensorflow 2.0 and Tf-hub 0.7!!
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [9]:
print(tf.__version__)
print(hub.__version__)

2.0.0
0.7.0


In [10]:
module_url = "https://tfhub.dev/google/elmo/3"
embed = hub.load(module_url)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [56]:
text=tf.convert_to_tensor(["I am feeling kind of blue", "Blue is the color of the sky","I am feeling kind of blue"])
out = embed.signatures['default'](text)['elmo']

In [57]:
import pandas as pd
import re

In [13]:
text = pd.read_csv('final.csv')

In [23]:
text.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [15]:
tweets = text[text.sentiment == 'Negative']

In [16]:
tweets = tweets.iloc[:100]

In [17]:
### Text Cleaning 

# tweets.text = tweets.text.str.lower()

tweets.text = tweets.text.str.replace("&amp","")
tweets.text = tweets.text.str.replace("\n"," ")
tweets.text = tweets.text.str.replace("\\\\"," ")

### Deleting Multiple Spaces 

tweets.text =  [re.sub(' +', ' ', str(x)) for x in tweets['text']]

In [18]:
text=tf.convert_to_tensor([str(list(tweets.text.values))])
out = embed.signatures['default'](text)['elmo']

In [58]:
out.shape

TensorShape([3, 7, 1024])

In [61]:
X = (out)

In [62]:
X.shape

TensorShape([3, 7, 1024])

In [247]:
aa

array([array([[-0.73210824, -0.3453857 , -0.124824  , ..., -0.18160583,
         0.4167027 ,  0.24779648],
       [-0.21743327, -0.1413041 , -0.01641104, ...,  0.09404293,
         0.06934943, -0.25008377],
       [-0.15964414, -0.01390902, -0.16066319, ...,  0.564662  ,
         0.42623278,  0.57753956]], dtype=float32),
       array([[-0.46022922, -0.4642863 ,  0.14044693, ..., -0.02865595,
        -0.14250925, -1.1547729 ],
       [-0.20208055,  0.06410888,  0.01537442, ...,  0.42384213,
         0.15774754, -0.6359169 ],
       [-0.56034267,  0.01419745,  0.37376517, ...,  0.60058975,
         0.11053625, -0.12548313]], dtype=float32),
       array([], shape=(0, 1024), dtype=float32)], dtype=object)

In [242]:
timesteps = 3

In [243]:
n_features = 1024

In [250]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# define model
model_TAE = Sequential()
model_TAE .add(LSTM(1024, activation='relu', input_shape=(timesteps,n_features), return_sequences=True))
model_TAE .add(LSTM(720, activation='relu', input_shape=(timesteps,n_features), return_sequences=True))
model_TAE .add(LSTM(128, activation='relu', input_shape=(timesteps,n_features), return_sequences=True))
model_TAE .add(LSTM(64, activation='relu', return_sequences=False))
model_TAE .add(RepeatVector(timesteps))
model_TAE .add(LSTM(64, activation='relu', return_sequences=True))
model_TAE .add(LSTM(128, activation='relu', return_sequences=True))
model_TAE .add(LSTM(720, activation='relu', return_sequences=True))
model_TAE .add(LSTM(1027, activation='relu', return_sequences=True))
model_TAE .add(TimeDistributed(Dense(n_features)))
model_TAE .compile(optimizer='adam', loss='mse')
model.summary()
# model_TAE .fit(aa, aa, epochs=3, batch_size=5)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 151, 100)          20300     
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 151, 128)          29312     
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 40)                6760      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 82        
Total params: 56,454
Trainable params: 56,454
Non-trainable params: 0
_________________________________________________________________


In [254]:
aa.reshape(aa.shape[0],3,1024)

ValueError: cannot reshape array of size 3 into shape (3,3,1024)

In [274]:
model_TAE .fit(aa, aa, epochs=3, batch_size=5)

Train on 3 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1af6a2a210>

In [67]:
# fit model
# model.fit(X, X, epochs=300, batch_size=5, verbose=0)
# # demonstrate reconstruction
yhat = model.predict(X, verbose=0)
print('---Predicted---')
print(np.round(yhat,3))
print('---Actual---')
print(np.round(X, 3))

---Predicted---
[[[-1.235 -0.566 -0.178 ... -0.25   0.243  0.375]
  [-0.464 -0.697 -0.437 ...  0.539  0.952  0.016]
  [-0.288  0.496  0.338 ...  0.291  0.708  0.739]
  ...
  [ 0.163 -0.189  0.24  ... -1.57   0.661  0.116]
  [-0.585 -0.742  0.053 ...  0.24   0.311  0.319]
  [-0.029 -0.044  0.04  ...  0.027 -0.013 -0.017]]

 [[-0.634 -0.312 -0.31  ...  0.194 -0.653  0.008]
  [-0.117  0.029  0.314 ... -0.376  0.763  0.332]
  [ 0.144 -0.207  0.477 ...  0.135  0.041 -0.109]
  ...
  [-0.183  0.184 -0.156 ... -1.519  0.263 -0.357]
  [ 0.531  0.     0.062 ...  0.019  0.306 -0.491]
  [ 0.319  0.254 -0.507 ...  0.168  0.236  0.735]]

 [[-1.235 -0.566 -0.178 ... -0.25   0.243  0.375]
  [-0.464 -0.697 -0.437 ...  0.539  0.952  0.016]
  [-0.288  0.496  0.338 ...  0.291  0.708  0.739]
  ...
  [ 0.163 -0.189  0.24  ... -1.57   0.661  0.116]
  [-0.585 -0.742  0.053 ...  0.24   0.311  0.319]
  [-0.029 -0.044  0.04  ...  0.027 -0.013 -0.017]]]
---Actual---
[[[-1.225 -0.56  -0.176 ... -0.25   0.242  0.37

In [70]:
X[1][1]

<tf.Tensor: id=44796, shape=(), dtype=float32, numpy=0.029630631>

In [72]:
from sklearn.metrics.pairwise import cosine_similarity

In [76]:
yhat[1][1]

array([-0.11707014,  0.02946293,  0.31371886, ..., -0.3759995 ,
        0.76296276,  0.33222032], dtype=float32)

In [81]:
cosine_similarity(X[1],yhat[1])

array([[0.9999984 , 0.22021209, 0.17488632, 0.19086945, 0.12718728,
        0.18547067, 0.13892356],
       [0.22004215, 0.99999267, 0.43366843, 0.27878729, 0.31209975,
        0.21988192, 0.2538378 ],
       [0.17521492, 0.43285212, 0.9999906 , 0.41001785, 0.28900406,
        0.57374215, 0.19866472],
       [0.1898911 , 0.27963036, 0.41042745, 0.99998736, 0.28019905,
        0.27675462, 0.37575626],
       [0.12598501, 0.31087074, 0.2900501 , 0.27850658, 0.9999824 ,
        0.4377004 , 0.26187444],
       [0.18399483, 0.22046381, 0.57457495, 0.2792754 , 0.44138718,
        0.9999609 , 0.3052461 ],
       [0.1402052 , 0.25613952, 0.20154655, 0.37691993, 0.27127737,
        0.31106085, 0.9999559 ]], dtype=float32)

In [1]:
import pandas as pd

In [93]:
Data = pd.read_csv('final.csv')

In [94]:
Data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [95]:
from nltk.tokenize import word_tokenize
import string 

In [96]:
a = Data.text[1]

In [97]:
a = a.translate(str.maketrans('','',string.punctuation))

In [98]:
tokens = word_tokenize(a)
from nltk.corpus import stopwords 
tokens_without_sw = [word for word in tokens if not word in stopwords.words()]
for i in range(len(tokens_without_sw)-2):
    print(tokens_without_sw[i:i+3])

['RT', 'ScottWalker', 'Didnt']
['ScottWalker', 'Didnt', 'catch']
['Didnt', 'catch', 'full']
['catch', 'full', 'GOPdebate']
['full', 'GOPdebate', 'last']
['GOPdebate', 'last', 'night']
['last', 'night', 'Here']
['night', 'Here', 'Scotts']
['Here', 'Scotts', 'best']
['Scotts', 'best', 'lines']
['best', 'lines', '90']
['lines', '90', 'seconds']
['90', 'seconds', 'Walker16']
['seconds', 'Walker16', 'httptcoZSfF…']


In [21]:
len(tokens_without_sw)

16

In [28]:
for i in range(len(tokens_without_sw)-2):
    print(tokens_without_sw[i:i+3])

['RT', 'ScottWalker', 'Didnt']
['ScottWalker', 'Didnt', 'catch']
['Didnt', 'catch', 'full']
['catch', 'full', 'GOPdebate']
['full', 'GOPdebate', 'last']
['GOPdebate', 'last', 'night']
['last', 'night', 'Here']
['night', 'Here', 'Scotts']
['Here', 'Scotts', 'best']
['Scotts', 'best', 'lines']
['best', 'lines', '90']
['lines', '90', 'seconds']
['90', 'seconds', 'Walker16']
['seconds', 'Walker16', 'httptcoZSfF…']


In [331]:
from tensorflow.keras import backend as K
import tensorflow as tf
a = K.placeholder(shape=(None, 2,2))
b = K.ones_like(a)
print(b.shape)

(None, 2, 2)


In [332]:
type(a)

tensorflow.python.framework.ops.Tensor

In [333]:
a.shape

TensorShape([None, 2, 2])

In [334]:
arr1 = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

In [335]:
type(arr1)

numpy.ndarray

In [336]:
arr1.shape

(2, 2, 2)

In [340]:
a = tf.concat([a,arr1],0)

In [349]:
print(a[0][1])

Tensor("strided_slice_3:0", shape=(2,), dtype=float32)


In [345]:
with tf.compat.v1.Session() as sess:  print(a.eval(session=sess)) 

ValueError: Cannot use the given session to evaluate tensor: the tensor's graph is different from the session's graph.