In [None]:
import collections
import numpy as np
import pandas as pd
import gensim
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.utils import np_utils

from sklearn.preprocessing import LabelEncoder

# Google's Word2Vec

Shows how to use Google's pretrained Word2Vec model to build the inputs to a CNN.

### First, let's load the pretrained model and do some data exploration

In [2]:
# Seed NumPy's global random generator so the index shuffling used for the
# train/test split gives the same result on every run.
np.random.seed(seed=1000)

In [4]:
# Load Google's pretrained word2vec vectors (GoogleNews, 300 dimensions).
# Reading the binary file takes a minute.
google = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Notes on the pretrained vocabulary:
# - includes some stop words (e.g. the, also, should, but not a, and, of)
# - includes misspellings
# - includes commonly paired words (e.g. New_York)

# gensim >= 4.0 replaced `KeyedVectors.vocab` with `key_to_index`; support both
# so the cell runs on either major version.
word_index = google.key_to_index if hasattr(google, 'key_to_index') else google.vocab
vocab = word_index.keys()
total_vocab = len(vocab)
print ("Set includes", total_vocab, "words")

# `load_word2vec_format` already returns the KeyedVectors object, so the
# deprecated `.wv` indirection is unnecessary. Note that this is an alias, not a
# copy: X_vecs and google refer to the same vectors.
X_vecs = google
#del google #wait to explore model first 

Set includes 3000000 words


  


In [6]:
from gensim.models import KeyedVectors
# `google` is itself a KeyedVectors object, so query it directly rather than
# through the deprecated `.wv` attribute.
google.most_similar('climate_change')

  


[('global_warming', 0.889603853225708),
 ('Climate_Change', 0.7147639393806458),
 ('Climate', 0.6953692436218262),
 ('Global_warming', 0.661054253578186),
 ('climate', 0.6569506525993347),
 ('greenhouse_gas_emissions', 0.6449477076530457),
 ('greenhouse_gases', 0.6432511806488037),
 ('carbon_emissions', 0.6395047307014465),
 ('Global_Warming', 0.6281516551971436),
 ('reducing_carbon_emissions', 0.6227284669876099)]

In [7]:
# Remove the `google` name from the notebook namespace.
# NOTE(review): X_vecs was assigned from `google.wv` in an earlier cell and is
# still used below, so the vectors stay referenced; this presumably frees
# little or no memory -- confirm.
del google #save mem 

In [9]:
#Explore the vectors 
# Look up the embedding of a single word and display how many elements it has.
X_vecs['hello'].size #check vectors 

300

## Now see how using pretrained vectors improves the model

In [12]:
# Load the labelled tweets and one-hot encode the sentiment labels.

data = pd.read_csv("../wyns/data/tweet_global_warming.csv", encoding="latin")
print("Full dataset: {}".format(data.shape[0]))

# Clean the label column and assign it back explicitly. Calling
# fillna/replace with inplace=True on `data['existence']` mutates a column
# selection, which is deprecated and does not update `data` once pandas
# Copy-on-Write is enabled.
data['existence'] = (
    data['existence']
    .fillna('ambiguous')                # missing labels become "ambiguous"
    .replace({'Y': 'Yes', 'N': 'No'})   # merge the short spellings into Yes/No
)
data = data.dropna()  # drop rows that still have NA in any other column
print("dataset without NaN: {}".format(data.shape[0]))
X = data.iloc[:, 0]  # first column
Y = data.iloc[:, 1]  # second column (the labels to encode)

# One-hot encoding = dummy variables built from a categorical variable.
# Create a one-hot encoded binary matrix, one column per class, e.g.
# (assuming the only labels are No / Yes / ambiguous -- TODO confirm):
#N, Y, Ambig
#1, 0, 0 
#0, 1, 0
#0, 0, 1

# Encode the class labels as integers ...
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# ... then convert the integers to a one-hot matrix.
Y = np_utils.to_categorical(encoded_Y)

Full dataset: 6090
dataset without NaN: 6087


In [14]:
def read_data(data_file):
    """Lazily tokenize every item of ``data_file``.

    Iterates ``data_file`` in order and yields, for each item, the result of
    ``gensim.utils.simple_preprocess`` applied to it.
    """
    for line in data_file:
        yield gensim.utils.simple_preprocess(line)

def build_dataset(vocab, n_words):
    """Map a flat sequence of words to integer ids over the most frequent words.

    Args:
        vocab: sequence of word tokens (e.g. the flattened output of read_data).
        n_words: size of the id space; id 0 is reserved for 'UNK' and the
            ``n_words - 1`` most common words get the following ids.

    Returns:
        A tuple ``(token, count, dictionary, reversed_dictionary)``:
        token -- list with one integer id per input word (0 for words that
            are not in ``dictionary``);
        count -- ``['UNK', <number of unknown words>]`` followed by
            ``(word, frequency)`` tuples in most-common order;
        dictionary -- word -> id;
        reversed_dictionary -- id -> word.
    """
    frequencies = collections.Counter(vocab)
    # Slot 0 is the unknown-word bucket; its tally is filled in further down.
    count = [['UNK', -1]] + frequencies.most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    token = []
    unk_count = 0
    for word in vocab:
        index = dictionary.get(word)
        if index is None:
            # Word fell outside the kept vocabulary: use the 'UNK' id.
            index = 0
            unk_count += 1
        token.append(index)
    count[0][1] = unk_count

    # Inverse mapping, for looking a word up by its integer id.
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return token, count, dictionary, reversed_dictionary

In [15]:
# Upper bound on the vocabulary size; chosen larger than the expected number
# of unique words so that no word is cut off.
top_words = 20000

# One list of tokens per tweet ...
tweet_vocab = list(read_data(data['tweet']))
# ... and the same tokens flattened into a single list for counting.
flat_tweet_vocab = [word for tweet_tokens in tweet_vocab for word in tweet_tokens]
token, count, dictionary, reversed_dictionary = build_dataset(flat_tweet_vocab, top_words)

# NOTE: `count` also holds the leading 'UNK' entry added by build_dataset,
# so the printed figure includes that placeholder.
print("Number of unique words: {}".format(len(count)))

Number of unique words: 12117


In [17]:
# Create train and test sets from randomly ordered tweet indexes.

test_split = 0.8  # NOTE: despite the name, this is the fraction used for *training*
train_size = int(len(X)*test_split)
test_size = len(X) - train_size
vector_size = 300        # elements per word vector (matches X_vecs['hello'].size)
window_size = 10         # not used in this cell
max_tweet_length=512     # tokens kept per tweet; longer tweets are truncated

# Keep the indexes as the shuffled array returned by NumPy. Wrapping them in
# set() threw the random order away (a CPython set of small ints iterates in
# ascending order in practice), which made the split "first 80% of rows train,
# last 20% test" instead of a random one.
indexes = np.random.choice(len(tweet_vocab), train_size + test_size, replace=False)

# float32 halves the memory of these large arrays compared with NumPy's
# default float64 (assumes float32 precision is sufficient for the word
# vectors, as is usual for Keras inputs -- TODO confirm).
X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=np.float32)
Y_train = np.zeros((train_size, 3), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=np.float32)
Y_test = np.zeros((test_size, 3), dtype=np.int32)

In [18]:
# Fill the feature/label arrays: the first `train_size` indexes go to the
# training set, the remaining ones to the test set.
for i, index in enumerate(indexes):
    # Pick the destination arrays and row once per tweet.
    if i < train_size:
        features, labels, row = X_train, Y_train, i
    else:
        features, labels, row = X_test, Y_test, i - train_size

    # The loop variable is called `word` (not `token`) so that it does not
    # overwrite the `token` id list returned by build_dataset.
    for t, word in enumerate(tweet_vocab[index][:max_tweet_length]):
        # Words without a pretrained vector keep their all-zero row at position t.
        if word in X_vecs:
            features[row, t, :] = X_vecs[word]

    labels[row, :] = Y[index]

# Let's look at how our model performs now!

In [20]:
#Some variables 

top_words = 1000   # not used in this cell; kept in case other cells read it
max_words = 150    # not used in this cell; kept in case other cells read it
filters = 32 #filter = 1 x KERNEL 

# Create the model: two 1-D convolutions over the sequence of word vectors,
# max pooling, then a dense classifier head.
model = Sequential()

model.add(Convolution1D(filters=filters, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_tweet_length, vector_size)))

model.add(Convolution1D(filters=filters, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# The labels are one-hot rows with exactly one active class, so the output is
# a softmax over the classes trained with categorical cross-entropy.
# sigmoid + binary_crossentropy would score each class as an independent
# yes/no problem, which is the wrong objective here and tends to inflate the
# reported accuracy.
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Fit the model
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=128,
    verbose=1)

# Final evaluation of the model
# scores[0] is the loss, scores[1] the accuracy metric requested in compile().
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 512, 32)           28832     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 512, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 256, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2048250   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 753       
Total params: 2,080,939
Trainable params: 2,080,939
Non-trainable params: 0
_________________________________________________________________
