In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten

import wandb
from wandb.keras import WandbCallback

import numpy as np

from keras.preprocessing import text
from proj2_helpers import *

Using TensorFlow backend.


In [2]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [3]:
from nltk import word_tokenize, WordNetLemmatizer

In [4]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the element-wise mean of the embeddings of ``tokens_list``.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of a single text.
    vector : mapping-like
        Must support ``word in vector`` and ``vector[word]``.
    generate_missing : bool, optional
        When True, each token absent from ``vector`` gets a fresh
        ``np.random.rand(k)`` vector; when False it gets ``np.zeros(k)``.
    k : int, optional
        Length of the fallback vectors and of the result for empty input.

    Returns
    -------
    numpy.ndarray
        ``np.zeros(k)`` if ``tokens_list`` is empty, otherwise the sum of the
        per-token vectors divided by the number of tokens.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Pick the out-of-vocabulary fallback once instead of branching twice.
    make_fallback = np.random.rand if generate_missing else np.zeros

    rows = []
    for token in tokens_list:
        if token in vector:
            rows.append(vector[token])
        else:
            rows.append(make_fallback(k))

    # Missing tokens still count towards the denominator.
    return np.divide(np.sum(rows, axis=0), len(rows))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Each entry of ``clean_comments['tokens']`` is passed, in order, to
    ``get_average_word2vec`` together with ``vectors`` and ``generate_missing``.

    Returns
    -------
    list
        The per-row results of ``get_average_word2vec``, in row order.
    """
    return [
        get_average_word2vec(row_tokens, vectors, generate_missing=generate_missing)
        for row_tokens in clean_comments['tokens']
    ]

### DATA LOADING

In [5]:
# Input text files read by the loading cell below (one tweet per line).
# Lines from the POS file are labelled 1 and lines from the NEG file -1 when
# the training frame is built.
RESULT_POS_PATH = './Results/pp_pos_otpl_nd.txt'
RESULT_NEG_PATH = './Results/pp_neg_otpl_nd.txt'
# File whose lines become the rows of `test_df` (the submission set).
RES_PATH = './Results/pp_test_otpl.txt'

In [6]:
# Load the data files: each one becomes a list with one tweet per line.
def _read_lines(path):
    """Return the lines of the text file at `path` (newlines stripped).

    The file is opened in a `with` block so the handle is closed as soon as the
    content has been read, instead of being left open for the kernel's lifetime.
    """
    with open(path, "r") as handle:
        return handle.read().splitlines()


result_pos = _read_lines(RESULT_POS_PATH)
result_neg = _read_lines(RESULT_NEG_PATH)
test_set = _read_lines(RES_PATH)

### DATAFRAME CONSTRUCTION

In [7]:
#-----------------------------------------TRAINING SET---------------------------------------------------------------------------

# Seed for the row shuffle below. Without it every kernel restart produces a
# different row order, so the seeded train_test_split further down could not
# reproduce the same split either.
SHUFFLE_SEED = 42

# One frame per class: Sentiment is 1 for positive tweets and -1 for negative ones.
pos_df = pd.DataFrame({"Sentiment": [1] * len(result_pos), "Tweet": result_pos},
                      columns=["Sentiment", "Tweet"])
neg_df = pd.DataFrame({"Sentiment": [-1] * len(result_neg), "Tweet": result_neg},
                      columns=["Sentiment", "Tweet"])

# Regroup the frames; ignore_index gives fresh row labels (no duplicates).
train_df = pd.concat([pos_df, neg_df], ignore_index=True)

# Tokenise every tweet.
train_tokens = [word_tokenize(sen) for sen in train_df.Tweet]
train_df['tokens'] = train_tokens

# 0/1 version of the -1/1 sentiment, placed right after the Tweet column.
CNNLabel = [0 if val == -1 else 1 for val in train_df.Sentiment.values]
train_df.insert(2, "CNN_Labels", CNNLabel)

# Shuffle the rows (reproducibly).
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)

# One-hot targets: Pos mirrors CNN_Labels, Neg is its complement.
# Computed after the shuffle so the lists line up with the current row order.
pos = train_df['CNN_Labels'].tolist()
neg = [1 - flag for flag in pos]

train_df['Pos'] = pos
train_df['Neg'] = neg

train_df

Unnamed: 0,Sentiment,Tweet,CNN_Labels,tokens,Pos,Neg
68517,1,mao keep taste silly,1,"[mao, keep, taste, silly]",1,0
58313,1,happy birthday fool,1,"[happy, birthday, fool]",1,0
5585,1,morning thanking man another day another chanc...,1,"[morning, thanking, man, another, day, another...",1,0
51280,1,hour live australia right waiting,1,"[hour, live, australia, right, waiting]",1,0
135379,-1,ago rate sanity,0,"[ago, rate, sanity]",0,1
...,...,...,...,...,...,...
16006,1,yay wanna ask something lee dmdmdmdm,1,"[yay, wan, na, ask, something, lee, dmdmdmdm]",1,0
17693,1,favourite tweet,1,"[favourite, tweet]",1,0
10342,1,ask massage therapy physical therapy help,1,"[ask, massage, therapy, physical, therapy, help]",1,0
135183,-1,pathogen wild farmed fish sea louse elli horwo...,0,"[pathogen, wild, farmed, fish, sea, louse, ell...",0,1


In [8]:
#-----------------------------------------TEST SET---------------------------------------------------------------------------
# Submission ids run 1..N, with N taken from the file actually loaded.
# A hard-coded 10000 combined with zip() would silently drop tweets (or ids)
# whenever the file has a different number of lines.
test_ids = np.arange(1, len(test_set) + 1, dtype=int)
assert len(test_ids) == len(test_set)

# create a df
test_df = pd.DataFrame(list(zip(test_ids, test_set)), columns=["Tweet_submission_id", "Tweet"])

# Tokenise every tweet.
test_tokens = [word_tokenize(sen) for sen in test_df.Tweet]

test_df['tokens'] = test_tokens

test_df

Unnamed: 0,Tweet_submission_id,Tweet,tokens
0,1,sea doo pro sea scooter sport portable sea doo...,"[sea, doo, pro, sea, scooter, sport, portable,..."
1,2,shuck well work week come cheer put battery ca...,"[shuck, well, work, week, come, cheer, put, ba..."
2,3,stay away bug that baby,"[stay, away, bug, that, baby]"
3,4,madam lol perfectly fine contagious anymore mao,"[madam, lol, perfectly, fine, contagious, anym..."
4,5,whenever fall asleep watch always wake headache,"[whenever, fall, asleep, watch, always, wake, ..."
...,...,...,...
9995,9996,nice time friend lastnite,"[nice, time, friend, lastnite]"
9996,9997,please stop,"[please, stop]"
9997,9998,without daughter two time oscar winner sally f...,"[without, daughter, two, time, oscar, winner, ..."
9998,9999,fun class sweetcheeks,"[fun, class, sweetcheeks]"


### Split the labelled data into a training set and a held-out evaluation set

In [9]:
# Hold out 20% of the labelled rows of train_df. Note that `data_test` is this
# labelled hold-out, not the submission set stored in `test_df`.
data_train, data_test = train_test_split(train_df, test_size=0.20, random_state=42)

In [10]:
# Word and vocabulary statistics for the training split.
all_training_words = []
for tokens in data_train["tokens"]:
    all_training_words.extend(tokens)

training_sentence_lengths = list(map(len, data_train["tokens"]))

# Sorted list of the distinct tokens; its length is reused as the tokenizer's num_words.
TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

960179 words total, with a vocabulary size of 38011
Max sentence length is 24


In [11]:
# Word and vocabulary statistics for the held-out split (data_test).
all_test_words = []
for tokens in data_test["tokens"]:
    all_test_words.extend(tokens)

test_sentence_lengths = list(map(len, data_test["tokens"]))

# Sorted list of the distinct tokens seen in the held-out split.
TEST_VOCAB = sorted(set(all_test_words))

print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

239398 words total, with a vocabulary size of 19718
Max sentence length is 26


### WORD EMBEDDING

In [12]:
# Load word vectors from a binary word2vec-format file via gensim's KeyedVectors.
# The path is relative, so the file must sit in the notebook's working directory.
# NOTE(review): the file name suggests the 300-d GoogleNews vectors, which would
# match EMBEDDING_DIM = 300 below; confirm.
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [13]:
# One averaged word2vec vector per training tweet; tokens missing from word2vec
# get random vectors (generate_missing=True).
# NOTE(review): `training_embeddings` is not referenced again in the visible part
# of this notebook; the CNN uses the per-word matrix built further down. Confirm
# whether this cell is still needed. It also draws from the global NumPy RNG.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [14]:
# Length every token-id sequence is padded to (pad_sequences maxlen), and the
# input length of the network. The longest sentence printed above is 26 tokens.
MAX_SEQUENCE_LENGTH = 50
# Width of each row of the embedding matrix passed to the Embedding layer.
EMBEDDING_DIM = 300

### Tokenize and Pad sequences

In [15]:
# Fit the Keras tokenizer on the training tweets only, then encode them as
# sequences of integer word ids.
train_texts = data_train["Tweet"].tolist()

tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# word -> integer id mapping learned from the training tweets.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 38010 unique tokens.


In [16]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH ids so they form
# a rectangular array for the network input.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [17]:
# Build the (vocabulary + 1) x EMBEDDING_DIM matrix used to initialise the
# Embedding layer: row `index` holds the vector of the word with that id.
# Rows never assigned in the loop (e.g. row 0) stay all-zero
# (presumably the padding id; confirm).
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, index in train_word_index.items():
    if word in word2vec:
        row = word2vec[word]
    else:
        # Word unknown to word2vec: fall back to a random vector.
        row = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = row

print(train_embedding_weights.shape)

(38011, 300)


In [18]:
# Encode the held-out tweets with the tokenizer fitted on the training tweets,
# then pad them to the same fixed length as the training data.
held_out_texts = data_test["Tweet"].tolist()
test_sequences = tokenizer.texts_to_sequences(held_out_texts)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [19]:
# Aliases for the padded id matrices (same objects, no copy): X_train feeds
# model.fit, X_test is the held-out counterpart.
X_train = train_cnn_data
X_test = test_cnn_data

### Define CNN

In [20]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-kernel 1-D CNN text classifier.

    Architecture: a non-trainable Embedding layer initialised with
    ``embeddings``, five parallel Conv1D branches (200 filters each, kernel
    sizes 2 to 6, ReLU) each followed by global max pooling, concatenation,
    Dropout(0.1), Dense(128, ReLU), Dropout(0.2) and a sigmoid output layer.

    Parameters
    ----------
    embeddings : array
        Initial weights of the Embedding layer.
    max_sequence_length : int
        Length of each input id sequence.
    num_words : int
        Number of rows of the Embedding layer.
    embedding_dim : int
        Width of each embedding vector.
    labels_index : int
        Number of units in the output layer.

    Returns
    -------
    The compiled model (binary cross-entropy loss, Adam optimiser, accuracy
    metric). ``summary()`` is printed as a side effect.
    """
    # Lookup table that is kept fixed during training (trainable=False).
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    token_ids = Input(shape=(max_sequence_length,), dtype='int32')
    embedded = frozen_embedding(token_ids)

    def conv_branch(kernel_width):
        # One convolution over the embedded sequence, reduced to its maximum
        # activation per filter.
        conv_out = Conv1D(filters=200, kernel_size=kernel_width, activation='relu')(embedded)
        return GlobalMaxPooling1D()(conv_out)

    pooled_branches = [conv_branch(kernel_width) for kernel_width in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    outputs = Dense(labels_index, activation='sigmoid')(hidden)

    net = Model(token_ids, outputs)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    net.summary()
    return net

In [21]:
# Target columns, in the order of the model's output units: column 0 is 'Pos',
# column 1 is 'Neg'. The `labels = [1, 0]` lookups further down rely on this order.
label_names = ['Pos','Neg']

In [22]:
# One-hot target arrays (columns Pos, Neg) for the training and held-out splits.
y_train = data_train[label_names].values
y_test = data_test[label_names].values

In [23]:
# Instantiate the CNN. num_words is the vocabulary size + 1 so that it matches
# the number of rows of train_embedding_weights; one output unit per label column.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      11403300    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 49, 200)      120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 48, 200)      180200      embedding_1[0][0]                
____________________________________________________________________________________________

### Train CNN

In [24]:
# Training hyper-parameters passed to model.fit below.
num_epochs = 3
batch_size = 34

In [26]:
# Train the CNN. validation_split=0.2 sets aside a further 20% of X_train for
# per-epoch validation, on top of the data_test hold-out made earlier.
# `hist` keeps the object returned by fit.
hist = model.fit(X_train, y_train, epochs=num_epochs, validation_split=0.2, shuffle=True, batch_size=batch_size)

Train on 110854 samples, validate on 27714 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


### Test CNN

In [27]:
# Score the held-out split: one row per tweet, one column per entry of
# label_names (Pos, Neg).
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [28]:
# Lookup from winning output column to CNN label: column 0 ('Pos') -> 1,
# column 1 ('Neg') -> 0.
labels = [1, 0]

In [29]:
# Inspect the raw scores; columns follow label_names (Pos, Neg).
predictions

array([[7.7029389e-01, 2.2979766e-01],
       [5.1655674e-01, 4.8287201e-01],
       [9.0799481e-01, 9.1882288e-02],
       ...,
       [4.6922418e-01, 5.3029549e-01],
       [3.0102835e-03, 9.9710280e-01],
       [7.5116745e-06, 9.9999285e-01]], dtype=float32)

In [30]:
# Predicted CNN label (1 or 0) for every held-out tweet: take the column with
# the highest score and translate it through `labels`.
prediction_labels = [labels[np.argmax(scores)] for scores in predictions]

In [31]:
# Accuracy on the held-out split: share of tweets whose predicted label equals
# the true CNN label.
held_out_hits = data_test.CNN_Labels == prediction_labels
sum(held_out_hits) / len(prediction_labels)

0.766215397049909

In [32]:
# Class balance of the held-out split, to put the accuracy above in context.
data_test.CNN_Labels.value_counts()

0    17538
1    17105
Name: CNN_Labels, dtype: int64

### Predict on the unlabelled test set (submission)

In [34]:
# Word and vocabulary statistics for the submission set (test_df).
all_Test_words = []
for tokens in test_df["tokens"]:
    all_Test_words.extend(tokens)

Test_sentence_lengths = list(map(len, test_df["tokens"]))

# Sorted list of the distinct tokens seen in the submission set.
r_TEST_VOCAB = sorted(set(all_Test_words))

print("%s words total, with a vocabulary size of %s" % (len(all_Test_words), len(r_TEST_VOCAB)))
print("Max sentence length is %s" % max(Test_sentence_lengths))

68947 words total, with a vocabulary size of 9748
Max sentence length is 19


In [35]:
# Encode the submission tweets with the tokenizer fitted on the training tweets
# and pad them to the fixed input length of the network.
submission_texts = test_df["Tweet"].tolist()
Test_sequences = tokenizer.texts_to_sequences(submission_texts)
Test_lr = pad_sequences(Test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [36]:
# Score the submission set: one row per tweet of test_df, columns follow
# label_names (Pos, Neg).
r_y_pred = model.predict(Test_lr, batch_size=1024, verbose=1)



In [37]:
# Lookup from winning output column to CNN label: column 0 ('Pos') -> 1,
# column 1 ('Neg') -> 0.
labels = [1, 0]

# Collect the submission-set labels in their own list. Appending them to
# `prediction_labels` (the held-out split's labels) would mix the two sets,
# leave `r_prediction_labels` empty and grow on every re-run of this cell.
r_prediction_labels = [labels[np.argmax(p)] for p in r_y_pred]

In [38]:
# Submission labels in the -1/+1 encoding of the Sentiment column.
# Derived directly from the submission scores `r_y_pred` (column 0 is 'Pos'),
# not from `prediction_labels`, which holds the held-out split's predictions.
# That way there is exactly one label per row of test_df.
r_prediction_labels = [1 if np.argmax(p) == 0 else -1 for p in r_y_pred]
assert len(r_prediction_labels) == len(test_df), "one prediction per test tweet expected"

In [39]:
# Submission ids as a NumPy array, in the same row order as the predictions.
test_id = test_df['Tweet_submission_id'].to_numpy()

In [42]:
# Write the (id, predicted label) pairs to the submission CSV.
# NOTE(review): create_csv_submission is not defined in this notebook;
# presumably it comes from `from proj2_helpers import *` (confirm). The
# ./Submissions directory may need to exist beforehand.
create_csv_submission(test_id,r_prediction_labels, "./Submissions/W2V_CNN.csv")