In [2]:
import numpy as np
import pandas as pd 
import pickle
import json
import gensim
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.optimizers import RMSprop, SGD
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers import Input, Bidirectional, LSTM, regularizers
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D, MaxPooling2D, Conv2D
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping

%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [3]:
filename = '../../wyns/data/tweet_global_warming.csv' 

In [4]:
df = pd.read_csv(filename, encoding='latin')
df.head()

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,Yes,1.0
1,Fighting poverty and global warming in Africa ...,Yes,1.0
2,Carbon offsets: How a Vatican forest failed to...,Yes,0.8786
3,Carbon offsets: How a Vatican forest failed to...,Yes,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,Yes,0.8087


In [5]:
model_path = "GoogleNews-vectors-negative300.bin"
word_vector_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [6]:
def normalize(txt, vocab=None, replace_char=' ',
                max_length=300, pad_out=False,
                to_lower=True, reverse = False,
                truncate_left=False, encoding=None,
                letters_only=False):
  
    txt = txt.split()
    # Remove HTML
    # This will keep characters and other symbols
    txt = [re.sub(r'http:.*', '', r) for r in txt]
    txt = [re.sub(r'https:.*', '', r) for r in txt]
    
    txt = ( " ".join(txt))
    # Remove non-emoticon punctuation and numbers
    txt = re.sub("[.,!0-9]", " ", txt)
    if letters_only: 
        txt = re.sub("[^a-zA-Z]", " ", txt)
    txt = " ".join(txt.split())
    # store length for multiple comparisons
    txt_len = len(txt)

    if truncate_left:
        txt = txt[-max_length:]
    else:
        txt = txt[:max_length]
    # change case
    if to_lower:
        txt = txt.lower()
    # Reverse order
    if reverse:
        txt = txt[::-1]
    # replace chars
    if vocab is not None:
        txt = ''.join([c if c in vocab else replace_char for c in txt])
    # re-encode text
    if encoding is not None:
        txt = txt.encode(encoding, errors="ignore")
    # pad out if needed
    if pad_out and max_length>txt_len:
        txt = txt + replace_char * (max_length - txt_len)
    if txt.find('@') > -1:
        for i in range(len(txt.split('@'))-1):
            try:
                if str(txt.split('@')[1]).find(' ') > -1:
                    to_remove = '@' + str(txt.split('@')[1].split(' ')[0]) + " "
                else:
                    to_remove = '@' + str(txt.split('@')[1])
                txt = txt.replace(to_remove,'')
            except:
                pass
    return txt

In [7]:
def balance(df):
    print("Balancing the classes")
    type_counts = df['Sentiment'].value_counts()
    min_count = min(type_counts.values)

    balanced_df = None
    for key in type_counts.keys():

        df_sub = df[df['Sentiment']==key].sample(n=min_count, replace=False)
        if balanced_df is not None:
            balanced_df = balanced_df.append(df_sub)
        else:
            balanced_df = df_sub
    return balanced_df

In [8]:
def tweet_to_sentiment(tweet):
    norm_text = normalize(tweet[0])
    if tweet[1] in ('Yes', 'Y'):
        return ['positive', norm_text]
    elif tweet[1] in ('No', 'N'):
        return ['negative', norm_text]
    else:
        return ['other', norm_text]
    
df = pd.read_csv(filename, encoding='latin')
data = []
for index, row in df.iterrows():
    data.append(tweet_to_sentiment(row))
        
twitter = pd.DataFrame(data, columns=['Sentiment', 'clean_text'], dtype=str)

In [9]:
# For this demo lets just keep one and five stars the others are marked 'other
twitter = twitter[twitter['Sentiment'].isin(['positive', 'negative'])]
print(len(twitter))
twitter.head()

4225


Unnamed: 0,Sentiment,clean_text
0,positive,global warming report urges governments to act...
1,positive,fighting poverty and global warming in africa ...
2,positive,carbon offsets: how a vatican forest failed to...
3,positive,carbon offsets: how a vatican forest failed to...
4,positive,uruguay: tools needed for those most vulnerabl...


In [10]:
pd.options.display.max_colwidth = 300
print(twitter.loc[0])

Sentiment                                                                                                               positive
clean_text    global warming report urges governments to act|brussels belgium (ap) - the world faces increased hunger and [link]
Name: 0, dtype: object


In [11]:
#Run this cell to balance training data
# twitter = balance(twitter)
# len(twitter)

In [12]:
# Now go from the pandas into lists of text and labels
text = twitter['clean_text'].values
labels_0 = pd.get_dummies(twitter['Sentiment'])  # mapping of the labels with dummies (has headers)
labels = labels_0.values # removes the headers
# Perform the Train/test split
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(text,labels, test_size = 0.2, random_state = 42)

In [13]:
### Now for a simple bidirectional LSTM algorithm we set our feature sizes and train a tokenizer
# First we Tokenize and get the data into a form that the model can read - this is BoW
# In this cell we are also going to define some of our hyperparameters
max_fatures = 2000
max_len=300
batch_size = 32
embed_dim = 300
lstm_out = 140

dense_out=len(labels[0]) #length of features
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X_train_)
X_train = tokenizer.texts_to_sequences(X_train_)
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
X_test = tokenizer.texts_to_sequences(X_test_)
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')
word_index = tokenizer.word_index

In [14]:
# prepare embedding matrix
num_words = min(max_fatures, len(word_index))
embedding_matrix = np.zeros((num_words, embed_dim))
for word, i in word_index.items():
    if i >= max_len:
        continue
    # words not found in embedding index will be all-zeros.
    if word in word_vector_model.vocab:
        embedding_matrix[i] = word_vector_model.word_vec(word)

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = True to fine tune the embeddings
embedding_layer = Embedding(num_words,
                            embed_dim,
                            weights=[embedding_matrix],
                            input_length=max_fatures,
                            trainable=False)

In [15]:
# Define the model using the pre-trained embedding
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Bidirectional(LSTM(lstm_out, recurrent_dropout=0.5, activation='tanh'))(embedded_sequences)
preds = Dense(dense_out, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 300)         600000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 280)               493920    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 562       
Total params: 1,094,482
Trainable params: 494,482
Non-trainable params: 600,000
_________________________________________________________________
None


In [None]:
model_hist_embedding = model.fit(X_train, Y_train_, epochs = 20, batch_size=batch_size, verbose = 2,
                        validation_data=(X_test,Y_test_))

Train on 3380 samples, validate on 845 samples
Epoch 1/20
 - 130s - loss: 0.5342 - acc: 0.7485 - val_loss: 0.4716 - val_acc: 0.7893
Epoch 2/20
 - 130s - loss: 0.4675 - acc: 0.7914 - val_loss: 0.4584 - val_acc: 0.7905
Epoch 3/20


In [None]:
confusion_matrix(Y_test_[:,1], np.round(model.predict(X_test))[:,1])

In [None]:
# Training Accuracy
x = np.arange(20)+1
fig=plt.figure(dpi=300)
ax = fig.add_subplot(111)
ax.plot(x, model_hist_embedding.history['acc'])
ax.plot(x, model_hist_embedding.history['val_acc'])
ax.legend(['Training', 'Testing'], loc='lower right')
plt.ylabel("Accuracy")
axes = plt.gca()
axes.set_ylim([0.45,1.01])
plt.xlabel("Epoch")
plt.title("LSTM Accuracy")
plt.show()
fig.savefig(fname='03.png', bbox_inches='tight', format='png') 

In [None]:
# model_hist_embedding.model.save("../../wyns/data/climate_sentiment_m2.h5")

In [None]:
model = load_model("../../wyns/data/climate_sentiment_m3.h5")

In [None]:
# model = model_hist_embedding.model

In [None]:
def tweet_to_sentiment(tweet):
    # Review is coming in as Y/N/NaN
    # this then cleans the summary and review and gives it a positive or negative value
    norm_text = normalize(tweet[0])
    if tweet[1] in ('Yes', 'Y'):
        return ['positive', norm_text]
    elif tweet[1] in ('No', 'N'):
        return ['negative', norm_text]
    else:
        return ['other', norm_text]

def clean_tweet(tweet):
    norm_text = normalize(tweet[0])
    return [tweet[1], tweet[2], norm_text, tweet[3], tweet[4], tweet[5]]

In [None]:
df = pd.read_csv("tweets.txt", delimiter="~~n~~", engine="python")

In [None]:
data = []
for index, row in df.iterrows():
    data.append(clean_tweet(row))
twitter = pd.DataFrame(data, columns=['long', 'lat', 'clean_text', 'time', 'retweets', 'location'], dtype=str)
to_predict_ = twitter['clean_text'].values

In [None]:
### Now for a simple bidirectional LSTM algorithm we set our feature sizes and train a tokenizer
# First we Tokenize and get the data into a form that the model can read - this is BoW
# In this cell we are also going to define some of our hyperparameters

max_fatures = 2000
max_len=300
batch_size = 32
embed_dim = 300
lstm_out = 140

dense_out=len(labels[0]) #length of features
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(to_predict_)
to_predict = tokenizer.texts_to_sequences(to_predict_)
to_predict = pad_sequences(to_predict, maxlen=max_len, padding='post')
word_index = tokenizer.word_index

In [None]:
predictions = model.predict(to_predict)

In [None]:
print("negative predictions: {}".format(sum(np.round(predictions)[:,0])))
print("positive predictions: {}".format(sum(np.round(predictions)[:,1])))

In [None]:
df_out = pd.DataFrame([twitter['long'], twitter['lat'], twitter['clean_text'],
                      twitter['time'], twitter['retweets'], twitter['location'], predictions[:,0], predictions[:,1]]).T
df_out = df_out.rename(index=str, columns={"Unnamed 0": "negative", "Unnamed 1": "positive"})
print(df_out.shape)
df_out.head()

In [None]:
df_out.to_csv("sample_prediction.csv", index=False)