<a href="https://colab.research.google.com/github/nighthawk198/207_Applied_ML/blob/master/Sentence_Classification_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [0]:
from unicodedata import normalize
import pandas as pd
import numpy as np
import string, os, re
import psutil
import pickle
from sklearn.model_selection import train_test_split

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
import keras.utils as ku 

# for pre-trained embeddings
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Datasets

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
final_file = open(os.path.join("drive/My Drive", "data/reddit_train_test_capped.pkl"),'rb')
train_df, test_df= pickle.load(final_file),  pickle.load(final_file)

final_file.close()
train_df.shape

(50000, 10)

In [0]:
# subset to only the fields I will need
train_df = train_df[['score','body','is_popular']]
test_df = test_df[['score','body','is_popular']]

In [0]:
# remove nan in body, the input
train_df.dropna(subset=['body'], inplace=True)
test_df.dropna(subset=['body'], inplace=True)

In [0]:
# Google News embeddings based on 3M words in 300 dimensions
filename = os.path.join("drive/My Drive", "data/GoogleNews-vectors-negative300.bin")
gensim_embeddings = KeyedVectors.load_word2vec_format(filename, binary=True)

pretrained_weights = gensim_embeddings.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

In [0]:
df_data = pd.read_csv(os.path.join("drive/My Drive", "data/all_tweets_clean_v2.csv"))

_, t_test_df = train_test_split(df_data, test_size=0.3)

## Dataset Prep

### Dataset Cleaning

[My notes]: The data is relatively clean as-is, so we don't have to do that much cleaning. If reddit, might need to do more. If Twitter, think it'll be relatively easy since the data is somewhat clean as is. 

In [0]:
def clean_text(txt):
  txt = re.sub(r'https:\/\/t[.]co\/[A-Za-z0-9]*$', '', txt)
  txt = re.sub(r'\n', ' ', txt)
  txt = "".join(v for v in txt if v not in string.punctuation).lower()
  txt = txt.encode("utf8").decode("ascii",'ignore')
  txt = re.sub(' +', ' ', txt)
  return(txt)

train_corpus = train_df['body'].apply(clean_text)
test_corpus = test_df['body'].apply(clean_text)
t_test_corpus = t_test_df['body'].apply(clean_text)

### Tokenize

In [0]:
tokenizer = Tokenizer()

# tokenize our text
tokenizer.fit_on_texts(train_corpus)
# turn text into token sequence
train_sequences = tokenizer.texts_to_sequences(train_corpus)
test_sequences = tokenizer.texts_to_sequences(test_corpus)
t_test_sequences = tokenizer.texts_to_sequences(t_test_corpus)


### Padding Sequences and Obtaining Variables: Predictors and Targets


In [55]:
x_train = pad_sequences(train_sequences, maxlen = 100)
x_test = pad_sequences(test_sequences, maxlen = 100)
t_x_test = pad_sequences(t_test_sequences, maxlen = 100)
print(t_x_test.shape)

y_train = train_df['is_popular'].tolist()
y_test = test_df['is_popular'].tolist()
t_y_test = t_test_df['is_popular'].tolist()

(3339, 100)


## Model

### Architecture

In [10]:
def create_model2(embedding_vectors):
  model = Sequential()
  
  
  model.add(Embedding(input_dim=vocab_size,
                      output_dim=embedding_size,
                      weights=[pretrained_weights],
                      trainable=False,
                      name='embedding_layer'))
  
  model.add(LSTM(100))
  
  #model.add(Dropout(0.1))
  
  model.add(Dense(1, activation='sigmoid'))
  
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  
  return(model)

model = create_model2(embedding_vectors=100)

model.summary()


W0803 07:05:50.754146 139793014032256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0803 07:05:50.771279 139793014032256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0803 07:05:50.774182 139793014032256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0803 07:05:50.783721 139793014032256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0803 07:05:50.784650 1397930140

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 300)         900000000 
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 900,160,501
Trainable params: 160,501
Non-trainable params: 900,000,000
_________________________________________________________________


### Train

In [11]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

Train on 49846 samples, validate on 14949 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f237846ec18>

In [12]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy:{}".format(scores[1]*100))

Accuracy:92.2068365776975


In [45]:
model.predict(x_test).shape

(14949, 1)

In [57]:
reddit_test_df=os.path.join("drive/My Drive", "data/reddit_train_test_capped.pkl")
twitter_test_df=os.path.join("drive/My Drive", "data/twitter_train_test_smaller_v2.pkl")
  
final_file = open(reddit_test_df,'rb')
_ , reddit_test_df =  pickle.load(final_file),  pickle.load(final_file)
final_file.close()
  
final_file = open(twitter_test_df,'rb')
_ , twitter_test_df =  pickle.load(final_file),  pickle.load(final_file)
final_file.close()
  
mod = "lstm_sentence_classification"

reddit_test_df.dropna(subset=['body'], inplace=True)

reddit_test_df[mod] = model.predict(x_test)
print("twitter_test_df", twitter_test_df.shape)
print("t_x_test", t_x_test.shape)
twitter_test_df[mod] = model.predict(t_x_test)
    
reddit_test_df.to_csv('drive/My Drive/models/reddit_test_predictions_mj.csv',index=False)
twitter_test_df.to_csv('drive/My Drive/models/twitter_test_predictions_mj.csv',index=False)

twitter_test_df (3339, 7)
t_x_test (3339, 100)
