In [214]:
import pandas as pd
import csv
import re
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [215]:
DATASET = "data/train.tsv"

In [216]:
df = pd.read_csv(DATASET, sep='\t')
df = df.drop(columns = ["PhraseId", "SentenceId"])
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [217]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
STOP_WORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    words = word_tokenize(text) # tokenized words
    
    filter_text = [w for w in words if not w in STOP_WORDS]
    
    for text in filter_text:
        text = lemmatizer.lemmatize(text)
    
    return " ".join(filter_text)

In [218]:
df.Phrase = df.Phrase.apply(lambda x: preprocess(x))
df.head()

Unnamed: 0,Phrase,Sentiment
0,series escapades demonstrating adage good goos...,1
1,series escapades demonstrating adage good goose,2
2,series,2
3,,2
4,series,2


In [219]:
with open('df.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Understanding the Data

In [220]:
# negative
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df[df.Sentiment == '0'].Phrase))
plt.imshow(wc , interpolation = 'bilinear')

  res_values = method(rvalues)


ValueError: We need at least 1 word to plot a word cloud, got 0.

<Figure size 1440x1440 with 0 Axes>

In [221]:
# positive
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df[df.Sentiment == '4'].Phrase))
plt.imshow(wc , interpolation = 'bilinear')

ValueError: We need at least 1 word to plot a word cloud, got 0.

<Figure size 1440x1440 with 0 Axes>

## Splitting Data

In [222]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 124848
TEST size: 31212


## Creating Vocabulary

In [225]:
documents = [_text.split() for _text in df_train.Phrase] 

In [226]:
w2v_model = gensim.models.word2vec.Word2Vec(size=300, 
                                            window=7, 
                                            min_count=10, 
                                            workers=8)

In [227]:
w2v_model.build_vocab(documents)

In [228]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 9049


In [229]:
w2v_model.most_similar("love")

  """Entry point for launching an IPython kernel.


[('framed', 0.23185424506664276),
 ('polished', 0.20323209464550018),
 ('48', 0.19862402975559235),
 ('schaeffer', 0.19234591722488403),
 ('befuddling', 0.18858270347118378),
 ('dropping', 0.1852714717388153),
 ('jokes', 0.18349961936473846),
 ('dass', 0.18309736251831055),
 ('thought', 0.18301591277122498),
 ('bet', 0.18293318152427673)]

In [231]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.Phrase)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 15130


In [233]:
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.Phrase), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.Phrase), maxlen=300)