In [1]:
import collections
import numpy as np
import pandas as pd
import gensim
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten
from keras.utils import np_utils

from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
# Set random seed (for reproducibility)
np.random.seed(1000)

In [3]:
#Get word vectors using googles pretrained word2vec  
#takes a minute 
google = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

#includes some stop words (i.e. the, also, should, but not a, and, of)
#includes misspellings 
#includes commony paired words (i.e. New_York)

vocab = google.vocab.keys()
total_vocab = len(vocab)
print ("Set includes", total_vocab, "words")

# Copy word vectors and delete Word2Vec model  and original corpus to save memory
X_vecs = google.wv
#del google #wait to explore model first 

Set includes 3000000 words




In [4]:
from gensim.models import KeyedVectors
google.wv.most_similar('climate_change')

  from ipykernel import kernelapp as app


[('global_warming', 0.889603853225708),
 ('Climate_Change', 0.7147639393806458),
 ('Climate', 0.6953692436218262),
 ('Global_warming', 0.661054253578186),
 ('climate', 0.6569506525993347),
 ('greenhouse_gas_emissions', 0.6449477076530457),
 ('greenhouse_gases', 0.6432511806488037),
 ('carbon_emissions', 0.6395047307014465),
 ('Global_Warming', 0.6281516551971436),
 ('reducing_carbon_emissions', 0.6227284669876099)]

In [5]:
del google 

In [6]:
#Explore the vectors 
X_vecs['hello'] #check vectors 
#X_vecs['global warming'.split()] #check vectors 
#X_vecs['global_warming'] # Includes common phrases 

array([-0.05419922,  0.01708984, -0.00527954,  0.33203125, -0.25      ,
       -0.01397705, -0.15039062, -0.265625  ,  0.01647949,  0.3828125 ,
       -0.03295898, -0.09716797, -0.16308594, -0.04443359,  0.00946045,
        0.18457031,  0.03637695,  0.16601562,  0.36328125, -0.25585938,
        0.375     ,  0.171875  ,  0.21386719, -0.19921875,  0.13085938,
       -0.07275391, -0.02819824,  0.11621094,  0.15332031,  0.09082031,
        0.06787109, -0.0300293 , -0.16894531, -0.20800781, -0.03710938,
       -0.22753906,  0.26367188,  0.012146  ,  0.18359375,  0.31054688,
       -0.10791016, -0.19140625,  0.21582031,  0.13183594, -0.03515625,
        0.18554688, -0.30859375,  0.04785156, -0.10986328,  0.14355469,
       -0.43554688, -0.0378418 ,  0.10839844,  0.140625  , -0.10595703,
        0.26171875, -0.17089844,  0.39453125,  0.12597656, -0.27734375,
       -0.28125   ,  0.14746094, -0.20996094,  0.02355957,  0.18457031,
        0.00445557, -0.27929688, -0.03637695, -0.29296875,  0.19

In [7]:
# Load in data
# One hot encode sentiment 

data = pd.read_csv("../core/data/tweet_global_warming.csv", encoding="latin")
print("Full dataset: {}".format(data.shape[0]))
data['existence'].fillna(value='ambiguous', inplace = True) #replace NA's in existence with "ambiguous"
data['existence'].replace(('Y', 'N'), ('Yes', 'No'), inplace=True) #rename so encoder doesnt get confused
data = data.dropna() #now drop NA values
print("dataset without NaN: {}".format(data.shape[0]))
X = data.iloc[:,0]
Y = data.iloc[:,1]
#print("Number of unique words: {}".format(len(np.unique(X)))) ##why is this wrong?? ##

#one hot encoding = dummy vars from categorical var 
#Create a one-hot encoded binary matrix 
#N, Y, Ambig
#1, 0, 0 
#0, 1, 0
#0, 0, 1

#encode class as integers 
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y) 

#convert integers to one hot encoded
Y = np_utils.to_categorical(encoded_Y)

Full dataset: 6090
dataset without NaN: 6087


In [8]:
########I think we want to look at gensim over nltk for this data########
###################### See 3 cells below! #######
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

corpus = X
# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()

tokenized_corpus = []

for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('@')]
    tokenized_corpus.append(tokens)

In [9]:
def read_data(data_file):
    for i, line in enumerate (data_file): 
        # do some pre-processing and return a list of words for each review text
        yield gensim.utils.simple_preprocess (line)

def build_dataset(vocab, n_words):
    """Process the top n_words from raw inputs (vocab from read_data) into a dataset."""
    count = [['UNK', -1]] #stores when word is found --> UNK = unknown 
    count.extend(collections.Counter(vocab).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    token = list() 
    unk_count = 0
    for word in vocab: #
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK'] assigned to 0 
            unk_count += 1
        token.append(index) #outputs a list of integers that represent words
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) #allows for word lookup by integer
    return token, count, dictionary, reversed_dictionary

In [18]:
top_words = 20000 #use number higher than expected unique words

tweet_vocab = list(read_data(data['tweet']))
flat_tweet_vocab = [item for sublist in tweet_vocab for item in sublist]
token, count, dictionary, reversed_dictionary = build_dataset(flat_tweet_vocab, top_words)

print("Number of unique words: {}".format(len(count))) #correct num of unique words 

Number of unique words: 12117


In [13]:
#Compare preprocessing methods 

#using ntlk 
print ('using ntlk to preprocess:', tokenized_corpus[15])

#using gensim simple preprocesser 
print ('using gensim to preprocess:', tweet_vocab[15])

using ntlk to preprocess: ['govern', 'report', 'say', 'glob', 'warm', 'may', 'caus', 'cant', 'ment', 'il', 'cnsnews', 'com', 'link']
using gensim to preprocess: ['government', 'report', 'says', 'global', 'warming', 'may', 'cause', 'cancer', 'mental', 'illness', 'cnsnews', 'com', 'link']


In [14]:
# Create train and test sets
# Generate random indexes

test_split = 0.8
train_size = int(len(X)*test_split)
test_size = len(X) - train_size
vector_size = 300
window_size = 10
max_tweet_length=512

indexes = set(np.random.choice(len(tweet_vocab), train_size + test_size, replace=False))

X_train = np.zeros((train_size, max_tweet_length, vector_size))
Y_train = np.zeros((train_size, 3), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size))
Y_test = np.zeros((test_size, 3), dtype=np.int32)

In [15]:
for i, index in enumerate(indexes):
    for t, token in enumerate(tweet_vocab[index]):
        if t >= max_tweet_length:
            break
        
        if token not in X_vecs:
            continue
    
        if i < train_size:
            X_train[i, t, :] = X_vecs[token]
        else:
            X_test[i - train_size, t, :] = X_vecs[token]
            
            
    if i < train_size:
        Y_train[i, :] = Y[index]
    else:
        Y_test[i - train_size, :] = Y[index]

In [16]:
print(X_test[1][2])

[ 0.18652344 -0.14941406  0.05883789  0.12011719 -0.0279541  -0.1328125
  0.08837891 -0.203125    0.20410156  0.140625   -0.11328125 -0.08105469
 -0.11328125 -0.07910156 -0.07519531  0.15625     0.10693359  0.20996094
  0.18554688 -0.06982422 -0.11230469 -0.06933594 -0.05932617 -0.11621094
  0.05859375 -0.02294922 -0.03417969  0.12597656  0.09570312  0.13378906
 -0.10009766 -0.1328125  -0.12255859  0.09375    -0.11035156  0.00282288
 -0.16113281 -0.08691406 -0.13671875  0.11230469  0.21972656 -0.16503906
 -0.04711914 -0.06835938  0.06835938 -0.07128906 -0.00334167  0.05371094
  0.10644531  0.03637695  0.07177734 -0.14453125 -0.05883789 -0.02539062
 -0.01708984  0.04443359  0.03833008 -0.05957031 -0.04736328  0.0088501
 -0.01098633  0.11035156  0.0135498  -0.18359375  0.11181641  0.02648926
 -0.06933594  0.06176758 -0.13964844 -0.05273438 -0.04248047 -0.07519531
  0.0703125   0.15332031 -0.125      -0.01489258 -0.02099609 -0.15820312
 -0.0255127   0.02783203 -0.01403809 -0.22070312 -0.0

In [17]:
#Some variables 

top_words = 1000
max_words = 150
filters = 32 #filter = 1 x KERNEL 

# create the model 
model = Sequential()

model.add(Convolution1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_tweet_length, vector_size)))

model.add(Convolution1D(filters=filters, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(3, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary())

# Fit the model
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=128,
    verbose=1)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 512, 32)           28832     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 512, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 256, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2048250   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 753       
Total params: 2,080,939
Trainable params: 2,080,939
Non-trainable params: 0
_________________________________________________________________
