In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/2021-2022/Fall/Clubs/CAIS++/My Curriculum/Project



Mounted at /content/drive/
/content/drive/My Drive/2021-2022/Fall/Clubs/CAIS++/My Curriculum/Project


In [None]:
!pip install tensorflow
!pip install keras
!pip install numpy



In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os

EMBEDDING_DIM = 50

def loadData(tweets_dir, embeddings_dir):
	# Load tweets, valences
	print("1 -- Loading tweets and labels")
	tweets = pd.read_csv(tweets_dir)
	tweets.drop('author', axis=1, inplace=True)
	valences = tweets['valence']
	print(tweets.head())

	# Tokenize the tweets (convert sentence to sequence of words)
	print("2 -- Tokenizing the tweets: converting sentences to sequence of words")
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(tweets['tweet'])

	sequences = tokenizer.texts_to_sequences(tweets['tweet'])
	word_index = tokenizer.word_index

	# Pad sequences to ensure samples are the same size
	print("3 -- Padding sequences to ensure samples are the same size")
	training_data = pad_sequences(sequences)
	print(training_data)
	print("4 -- Loading pre-trained word embeddings. This may take a few minutes.")

	embeddings_index = {}
	f = open(embeddings_dir,'rb')
	for line in f:
		values = line.split()
		word = values[0].decode('UTF-8')
		coefs = np.asarray(values[1:], dtype='float32')
		embeddings_index[word] = coefs
	f.close()

	print("5 -- Finding word embeddings for words in our tweets.")
	# prepare word embedding matrix
	num_words = len(word_index)+1
	embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
	for word, i in word_index.items():
		if i >= num_words:
			continue
		embedding_vector = embeddings_index.get(word)
		if embedding_vector is not None:
	        # words not found in embedding index will be all-zeros.
			embedding_matrix[i] = embedding_vector

	return tweets, training_data, word_index, embedding_matrix, valences


TWEETS_DIR = './dataset/data.csv'
EMBEDDINGS_DIR = 'glove.6B.50d.txt'

tweets, tweets_preprocessed, word_index, embedding_matrix, valences = loadData(TWEETS_DIR, EMBEDDINGS_DIR)

1 -- Loading tweets and labels
   valence                                              tweet
0        0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1        0  is upset that he can't update his Facebook by ...
2        0  @Kenichan I dived many times for the ball. Man...
3        0    my whole body feels itchy and like its on fire 
4        0  @nationwideclass no, it's not behaving at all....
2 -- Tokenizing the tweets: converting sentences to sequence of words
3 -- Padding sequences to ensure samples are the same size
[[     0      0      0 ...     41      9    385]
 [     0      0      0 ...     40    273   1170]
 [     0      0      0 ...     31     12  27341]
 ...
 [     0      0      0 ...     14     11   2107]
 [     0      0      0 ...  13870 131975  98577]
 [     0      0      0 ... 230191 690959 690960]]
4 -- Loading pre-trained word embeddings. This may take a few minutes.
5 -- Finding word embeddings for words in our tweets.


In [None]:
# Dataset Information

print("Tweets Size: ", tweets.shape)
print("Training Data: ", tweets_preprocessed.shape)
print("Sample Tweet Post-Processing: ", tweets_preprocessed[100])
print("Max Tweet Length: ", tweets_preprocessed.shape[1])
print()
print(tweets_preprocessed)
print(valences[800000])

Tweets Size:  (1600000, 2)
Training Data:  (1600000, 118)
Sample Tweet Post-Processing:  [     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0    851     12    347   5938
  23813    264    326   3078     16    326      3   8008     12      4
    347   5938   1946    264     39   1533     58 233041]
Max Tweet Length:  118

[[     0      0      0 ...     41      9    385]

In [None]:
#Choosing a portion of the data to run

chosenTweets = []
for i in range(0, len(tweets), 1000):
  currTweetData = tweets_preprocessed[i]
  temp = []
  for index in currTweetData:
    if index != 0:
      temp.append(embedding_matrix[index])
    else:
      temp.append(np.array([0 for j in range(50)], dtype='float32'))
  chosenTweets.append(temp)
print(chosenTweets)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
#Choosing the proper valences

chosenValences = []
for i in range(0, len(tweets), 1000):
  if valences[i] == 0:
    chosenValences.append(np.array([1,0,0], dtype='float32'))
  elif valences[i] == 2:
    chosenValences.append(np.array([0,1,0], dtype='float32'))
  elif valences[i] == 4:
    chosenValences.append(np.array([0,0,1], dtype='float32'))
chosenValences

[array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 array([1., 0., 0.], dtype=float32),
 

In [29]:
from keras.models import Sequential
from keras.layers import Embedding, Input
from keras.layers.merge import Concatenate
from keras.layers.core import Dense, Activation, Flatten
from keras.layers import Dropout, concatenate
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import metrics
from keras.models import Model

model = Sequential()

# print(chosenTweets)

chosenTweets = np.array(chosenTweets,dtype=np.float)

model.add(LSTM(64, return_sequences = True, input_shape=(118, 50), activation='relu'))
model.add(Dropout(.2))

model.add(LSTM(64, activation='relu'))
model.add(Dropout(.2))

model.add(Dense(3, activation = 'tanh'))

print(model.summary())

LOSS = 'binary_crossentropy'
OPTIMIZER = 'RMSprop'

model.compile(loss = LOSS, optimizer = OPTIMIZER, metrics = [metrics.binary_accuracy])

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_25 (LSTM)              (None, 118, 64)           29440     
                                                                 
 dropout_23 (Dropout)        (None, 118, 64)           0         
                                                                 
 lstm_26 (LSTM)              (None, 64)                33024     
                                                                 
 dropout_24 (Dropout)        (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 3)                 195       
                                                                 
Total params: 62,659
Trainable params: 62,659
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
from sklearn.utils import shuffle

TEST_SIZE = 0.7

EPOCHS = 10
BATCH_SIZE = 128
# chosenTweets = np.asarray(chosenTweets).astype('float32')
# chosenValences = np.asarray(chosenValences).astype('float32')
chosenValences = np.array(chosenValences)
print (chosenTweets.shape)
print (chosenValences.shape)

trainTweets, trainValences = shuffle(chosenTweets, chosenValences)

model.fit(trainTweets, trainValences, 
          epochs = EPOCHS, 
          batch_size = BATCH_SIZE, 
          validation_split = TEST_SIZE)

(1600, 118, 50)
(1600, 3)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f65311a7450>