In [0]:
# Colab only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## Data Input and Preprocessing

In [0]:
## Imports for the whole notebook (standard library, NLTK, scikit-learn, TensorFlow/Keras).
import sys
import os
import re
import numpy as np

# NLTK: fetch the 'punkt' and 'stopwords' resources at import time.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.utils import shuffle

# TensorFlow / standalone Keras: text preprocessing helpers.
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keras layers and model containers.
from keras.layers import LSTM,Dense,Dropout
from keras.models import Model,Sequential
from keras.layers import Input,Bidirectional
from keras.layers import Embedding

from keras import metrics

In [0]:
## Seed NumPy and TensorFlow so that a fresh run with the same settings starts
## from the same random state.
from numpy.random import seed
import tensorflow as tf

seed(1)

# `tensorflow.set_random_seed` only exists in TF 1.x; TF 2.x provides
# `tf.random.set_seed` instead. Resolve whichever is available so this cell
# does not fail with ImportError on newer TensorFlow (the `or` short-circuits,
# so `tf.random.set_seed` is never touched on TF 1.x).
set_random_seed = getattr(tf, 'set_random_seed', None) or tf.random.set_seed
set_random_seed(2)

In [0]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative 'sample_data/...' paths opened later resolve against it.
os.chdir("/content/drive/My Drive/concetto dsc")

In [0]:
# Display the entries of the current working directory.
os.listdir()

In [0]:
## reading the data: one tweet per line of the file
# Only the line terminator is removed. rstrip('\n') is used rather than the
# slice line[:-1] because the slice would also cut a real character off the
# final tweet whenever the file does not end with a newline.
with open('sample_data/tweets.txt','r',encoding='latin1') as f:
  train_data = [line.rstrip('\n') for line in f]

In [0]:
# Peek at two raw tweets (indices 10 and 11).
train_data[10:12]

In [0]:
## reading the labels: one label per line; '1' maps to 1, anything else to 0
# strip() is used rather than the slice line[:-1]: with the slice, a final '1'
# that is not followed by a newline becomes '' and is silently labelled 0.
# strip() also tolerates stray whitespace around the digit.
with open('sample_data/labels.txt','r',encoding='latin1') as f:
  train_labels = [int(line.strip() == '1') for line in f]

In [0]:
# Peek at the labels at the same positions (indices 10 and 11).
train_labels[10:12]

In [0]:
# Shuffle tweets and labels together (one shuffle call over both lists) and
# rebind both names to the shuffled results.
# NOTE(review): no random_state is passed and the inputs are overwritten, so
# re-running this cell shuffles the already shuffled data again — consider a
# fixed random_state if the cell must be idempotent.
train_data,train_labels = shuffle(train_data,train_labels)

In [0]:
## Hyperparameters for preprocessing and the model.
# Vocabulary-size limit: handed to Tokenizer(num_words=...) and to the model
# builder as its max_num_words argument.
maxwords = 2000

# Padded sequence length, embedding vector size, number of LSTM units.
max_sentence_length, embeddingdim, hiddensize = 256, 100, 64

In [0]:
## Tokenize every training sentence and build the word dictionary from them.
## `maxwords` is passed as the tokenizer's num_words limit.
tokenizer = Tokenizer(num_words=maxwords)
tokenizer.fit_on_texts(train_data)

In [0]:
## Replace each word with its index from the fitted tokenizer.
train_tokens = tokenizer.texts_to_sequences(train_data)

## Zero padding at the end ('post') for equal-length sentences.
train_indices = pad_sequences(
    train_tokens,
    maxlen=max_sentence_length,
    padding='post',
)

In [0]:
# Show the type of the object returned by pad_sequences.
type(train_indices)

## Model and Training

In [0]:
def model(max_sent_len,max_num_words,embedding_dim,n_hidden):
  """Build the (uncompiled) LSTM binary classifier.

  Architecture: integer word indices -> trainable Embedding -> one LSTM layer
  (dropout 0.5, last output only) -> Dense(1) with sigmoid activation.

  Args:
    max_sent_len: length of each input index sequence.
    max_num_words: vocabulary size, used as the Embedding `input_dim`.
    embedding_dim: size of each embedding vector.
    n_hidden: number of units in the LSTM layer.

  Returns:
    A Keras `Model` taking a `(max_sent_len,)` index sequence and producing a
    single sigmoid output.
  """
  indices = Input(shape=(max_sent_len,))
  vectors = Embedding(input_dim=max_num_words,output_dim=embedding_dim,trainable=True,input_length=max_sent_len)(indices)
  lstm_out = LSTM(n_hidden,dropout = 0.5,return_sequences=False)(vectors)
  probs = Dense(1,activation='sigmoid')(lstm_out)

  # `outputs=` (plural) is the documented keyword, matching `inputs=`; the
  # singular `output=` is a legacy alias that newer Keras versions reject.
  lstm_model = Model(inputs=indices,outputs=probs)

  return lstm_model

In [0]:
# Instantiate the network from the notebook-level hyperparameters.
lstm_model = model(
    max_sent_len=max_sentence_length,
    max_num_words=maxwords,
    embedding_dim=embeddingdim,
    n_hidden=hiddensize,
)

### Training

In [0]:
## Training hyperparameters.
# Learning-rate setting — NOTE(review): confirm it is actually handed to the
# optimizer when the model is compiled.
lr_rate = 1e-4
# Mini-batch size and number of epochs passed to each fit() call.
batchsize, numepochs = 16, 5

In [0]:
# Build the optimizer explicitly so the configured `lr_rate` is actually used;
# the string shortcut optimizer='adam' trains with Adam's default learning
# rate and leaves `lr_rate` unused. The rate is passed positionally because
# the keyword name differs between Keras versions (`lr` vs `learning_rate`).
lstm_model.compile(optimizer=keras.optimizers.Adam(lr_rate),loss='binary_crossentropy',metrics=['accuracy'])

In [0]:
# First training pass; validation_split=0.2 sets aside 20% of the samples for
# validation.
lstm_model.fit(
    train_indices,
    train_labels,
    batch_size=batchsize,
    epochs=numepochs,
    validation_split=0.2,
)

In [0]:
# Second, identical fit() call on the same `lstm_model` object: the model is
# not rebuilt or recompiled in between, so this trains the already fitted
# model for another `numepochs` epochs.
# NOTE(review): if the extra epochs are intentional, raising `numepochs` would
# make that explicit instead of repeating the cell.
lstm_model.fit(train_indices,train_labels,validation_split=0.2,batch_size=batchsize,epochs=numepochs)