In [1]:
import numpy as np
import pandas as pd 
import pickle
import json
import gensim
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.optimizers import RMSprop, SGD
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers import Input, Bidirectional, LSTM, regularizers
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D, MaxPooling2D, Conv2D
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping

%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
filename = '../wyns/data/tweet_global_warming.csv' 

In [3]:
df = pd.read_csv(filename, encoding='latin')
df.head()

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,Yes,1.0
1,Fighting poverty and global warming in Africa ...,Yes,1.0
2,Carbon offsets: How a Vatican forest failed to...,Yes,0.8786
3,Carbon offsets: How a Vatican forest failed to...,Yes,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,Yes,0.8087


In [6]:
model_path = "GoogleNews-vectors-negative300.bin"
word_vector_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [7]:
def normalize(txt, vocab=None, replace_char=' ',
                max_length=300, pad_out=False,
                to_lower=True, reverse = False,
                truncate_left=False, encoding=None,
                letters_only=False):
  
    txt = txt.split()
    # Remove HTML
    # This will keep characters and other symbols
    txt = [re.sub(r'http:.*', '', r) for r in txt]
    txt = [re.sub(r'https:.*', '', r) for r in txt]
    
    txt = ( " ".join(txt))
    # Remove non-emoticon punctuation and numbers
    txt = re.sub("[.,!0-9]", " ", txt)
    if letters_only: 
        txt = re.sub("[^a-zA-Z]", " ", txt)
    txt = " ".join(txt.split())
    # store length for multiple comparisons
    txt_len = len(txt)

    if truncate_left:
        txt = txt[-max_length:]
    else:
        txt = txt[:max_length]
    # change case
    if to_lower:
        txt = txt.lower()
    # Reverse order
    if reverse:
        txt = txt[::-1]
    # replace chars
    if vocab is not None:
        txt = ''.join([c if c in vocab else replace_char for c in txt])
    # re-encode text
    if encoding is not None:
        txt = txt.encode(encoding, errors="ignore")
    # pad out if needed
    if pad_out and max_length>txt_len:
        txt = txt + replace_char * (max_length - txt_len)
    if txt.find('@') > -1:
        for i in range(len(txt.split('@'))-1):
            try:
                if str(txt.split('@')[1]).find(' ') > -1:
                    to_remove = '@' + str(txt.split('@')[1].split(' ')[0]) + " "
                else:
                    to_remove = '@' + str(txt.split('@')[1])
                txt = txt.replace(to_remove,'')
            except:
                pass
    return txt

In [8]:
def balance(df):
    print("Balancing the classes")
    type_counts = df['Sentiment'].value_counts()
    min_count = min(type_counts.values)

    balanced_df = None
    for key in type_counts.keys():

        df_sub = df[df['Sentiment']==key].sample(n=min_count, replace=False)
        if balanced_df is not None:
            balanced_df = balanced_df.append(df_sub)
        else:
            balanced_df = df_sub
    return balanced_df

In [9]:
def tweet_to_sentiment(tweet):
    norm_text = normalize(tweet[0])
    if tweet[1] in ('Yes', 'Y'):
        return ['positive', norm_text]
    elif tweet[1] in ('No', 'N'):
        return ['negative', norm_text]
    else:
        return ['other', norm_text]
    
df = pd.read_csv(filename, encoding='latin')
data = []
for index, row in df.iterrows():
    data.append(tweet_to_sentiment(row))
        
twitter = pd.DataFrame(data, columns=['Sentiment', 'clean_text'], dtype=str)

In [10]:
# For this demo lets just keep one and five stars the others are marked 'other
# twitter = twitter[twitter['Sentiment'].isin(['positive', 'negative'])]
twitter.head()
print(len(twitter))
# twitter['Sentiment'].unique()


6090


In [152]:
pd.options.display.max_colwidth = 300
print(twitter.loc[0])

Sentiment                                                                                                               positive
clean_text    global warming report urges governments to act|brussels belgium (ap) - the world faces increased hunger and [link]
Name: 0, dtype: object


In [111]:
#Run this cell to balance training data
# twitter = balance(twitter)
# len(twitter)

In [13]:
# Now go from the pandas into lists of text and labels
text = twitter['clean_text'].values
labels_0 = pd.get_dummies(twitter['Sentiment'])  # mapping of the labels with dummies (has headers)
# print(labels_0[:10], twitter['Sentiment'].iloc[:10])
# labels = labels_0.values
labels = labels_0.values[:,[0,2]] # removes the headers
print(labels)
# Perform the Train/test split
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(text,labels, test_size = 0.2, random_state = 42)

[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [223]:
print(labels_0)

      negative  other  positive
0            0      0         1
1            0      0         1
2            0      0         1
3            0      0         1
4            0      0         1
5            0      0         1
6            0      0         1
7            0      0         1
8            0      0         1
9            0      0         1
10           0      0         1
11           0      0         1
12           0      0         1
13           0      0         1
14           0      1         0
15           0      0         1
16           0      0         1
17           1      0         0
18           0      0         1
19           0      0         1
20           0      0         1
21           0      0         1
22           0      0         1
23           0      0         1
24           0      0         1
25           0      0         1
26           0      0         1
27           0      0         1
28           0      0         1
29           0      0         1
...     

In [192]:
Y_test = []
xt = []
for i in range(Y_test_.shape[0]):
    if Y_test_[i].mean() > 0:
        Y_test.append(Y_test_[i])
        xt.append(X_test[i])
Y_test = np.array(Y_test)
xt = np.array(xt)
print(Y_test_.shape, Y_test.shape)

(1218, 2) (844, 2)


In [144]:
# print(Y_train_[:,0].mean(), Y_train_[:,1].mean())
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(Y_train_),
                                                 Y_train_)
print(class_weights)

TypeError: unhashable type: 'numpy.ndarray'

In [14]:
### Now for a simple bidirectional LSTM algorithm we set our feature sizes and train a tokenizer
# First we Tokenize and get the data into a form that the model can read - this is BoW
# In this cell we are also going to define some of our hyperparameters
max_fatures = 2000
max_len = 2000
words_len = 30
batch_size = 32
embed_dim = 300
lstm_out = 140

dense_out=len(labels[0]) #length of features
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X_train_)
X_train = tokenizer.texts_to_sequences(X_train_)
X_train = pad_sequences(X_train, maxlen=words_len, padding='post')
print(X_train[:,-1].mean())
X_test = tokenizer.texts_to_sequences(X_test_)
X_test = pad_sequences(X_test, maxlen=words_len, padding='post')
word_index = tokenizer.word_index
# print(len(word_index))

0.0


In [15]:
# prepare embedding matrix
num_words = min(max_fatures, len(word_index))
embedding_matrix = np.zeros((num_words, embed_dim))
for word, i in word_index.items():
    if i >= max_len:
#         print(i)
        continue
    # words not found in embedding index will be all-zeros.
    if word in word_vector_model.vocab:
        embedding_matrix[i] = word_vector_model.word_vec(word)

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = True to fine tune the embeddings
embedding_layer = Embedding(num_words,
                            embed_dim,
                            weights=[embedding_matrix],
                            input_length=max_fatures,
                            trainable=False)

In [200]:
# Define the model using the pre-trained embedding
# import keras
sequence_input = Input(shape=(words_len,), dtype='int32')
# gbm_input = Input(shape=(Y_train_.shape[1],), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# embedded_sequences = keras.layers.concatenate([gbm_input,embedding_layer(sequence_input)],axis=1)
x = Bidirectional(LSTM(lstm_out, recurrent_dropout=0.5, activation='tanh'))(embedded_sequences)
# x = Bidirectional(LSTM(lstm_out, recurrent_dropout=0.5, activation='tanh'))(x)
x = Dense(250, activation='elu')(x)
preds = Dense(dense_out, activation='softmax')(x)

model = Model([sequence_input], preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_22 (Embedding)     (None, 2000, 300)         600000    
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 280)               493920    
_________________________________________________________________
dense_23 (Dense)             (None, 250)               70250     
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 502       
Total params: 1,164,672
Trainable params: 564,672
Non-trainable params: 600,000
_________________________________________________________________
None


In [201]:
model_hist_embedding = model.fit(X_train, Y_train_, epochs = 20, batch_size=batch_size, verbose = 2,
                        class_weight = [1/Y_train_[:,0].mean(), 1/Y_train_[:,1].mean()],validation_data=(xt,Y_test))

Train on 4872 samples, validate on 844 samples
Epoch 1/20
 - 22s - loss: 0.3415 - acc: 0.5603 - val_loss: 0.4122 - val_acc: 0.7998
Epoch 2/20
 - 16s - loss: 0.2878 - acc: 0.5926 - val_loss: 0.3954 - val_acc: 0.8009
Epoch 3/20
 - 16s - loss: 0.2736 - acc: 0.6096 - val_loss: 0.3754 - val_acc: 0.8140
Epoch 4/20
 - 15s - loss: 0.2486 - acc: 0.6238 - val_loss: 0.3768 - val_acc: 0.8258
Epoch 5/20
 - 15s - loss: 0.2331 - acc: 0.6367 - val_loss: 0.3935 - val_acc: 0.8164
Epoch 6/20
 - 15s - loss: 0.2126 - acc: 0.6486 - val_loss: 0.3995 - val_acc: 0.8235
Epoch 7/20
 - 16s - loss: 0.2047 - acc: 0.6470 - val_loss: 0.3949 - val_acc: 0.8329
Epoch 8/20
 - 16s - loss: 0.1873 - acc: 0.6667 - val_loss: 0.4104 - val_acc: 0.8270
Epoch 9/20
 - 16s - loss: 0.1688 - acc: 0.6780 - val_loss: 0.4277 - val_acc: 0.8246
Epoch 10/20
 - 17s - loss: 0.1554 - acc: 0.6870 - val_loss: 0.4720 - val_acc: 0.8389
Epoch 11/20
 - 16s - loss: 0.1428 - acc: 0.6907 - val_loss: 0.4771 - val_acc: 0.8318
Epoch 12/20
 - 16s - loss: 

In [207]:
def acc(y,y_):
    score = 0
    for i in range(y.shape[0]):
        if np.argmax(y[i]) == np.argmax(y_[i]):
            score+=1
    score = score / y.shape[0]
    return score

pp = model.predict(xt)
print(acc(Y_test,pp))

0.8329383886255924


In [195]:
# train a gradient boosted machine
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
gbm = GradientBoostingClassifier(n_estimators = 5000)
mo = MultiOutputClassifier(gbm, n_jobs = -1)
mo.fit(X_train,Y_train_)

MultiOutputClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
           n_jobs=-1)

In [208]:
# print(mo.score(xt,Y_test))
# print(mo.score(X_train,Y_train_))
# predd = mo.predict(X_train)
# pred_test = mo.predict(Y_test)

In [172]:
# print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.17871094  0.32617188 -0.11865234 ...  0.41796875  0.31445312
   0.06542969]
 [-0.06005859 -0.09326172 -0.07226562 ...  0.02392578  0.11083984
   0.10107422]
 ...
 [ 0.14648438 -0.01379395 -0.01916504 ...  0.04199219  0.16308594
   0.02026367]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.01782227  0.32617188 -0.14746094 ... -0.07226562  0.19628906
   0.02307129]]


In [None]:
model_hist_embedding = model.fit(X_train, Y_train_, epochs = 20, batch_size=batch_size, verbose = 2,
                        validation_data=(X_test,Y_test_))

In [210]:
confusion_matrix(Y_test[:,1], np.round(model.predict(xt))[:,1])

array([[126,  89],
       [ 52, 577]], dtype=int64)

In [None]:
# Training Accuracy
x = np.arange(20)+1
fig=plt.figure(dpi=300)
ax = fig.add_subplot(111)
ax.plot(x, model_hist_embedding.history['acc'])
ax.plot(x, model_hist_embedding.history['val_acc'])
ax.legend(['Training', 'Testing'], loc='lower right')
plt.ylabel("Accuracy")
axes = plt.gca()
axes.set_ylim([0.45,1.01])
plt.xlabel("Epoch")
plt.title("LSTM Accuracy")
plt.show()
fig.savefig(fname='03.png', bbox_inches='tight', format='png') 

In [161]:
# model_hist_embedding.model.save("../wyns/data/climate_sentiment_m6.h5")
# m6 is nans included in the training set but removed from test set, where a nan is [0,0] for [for, against]


In [16]:
model = load_model("../wyns/data/climate_sentiment_m6.h5")

In [17]:
# model = model_hist_embedding.model

In [27]:
def tweet_to_sentiment(tweet):
    # Review is coming in as Y/N/NaN
    # this then cleans the summary and review and gives it a positive or negative value
    norm_text = normalize(tweet[0])
    if tweet[1] in ('Yes', 'Y'):
        return ['positive', norm_text]
    elif tweet[1] in ('No', 'N'):
        return ['negative', norm_text]
    else:
        return ['other', norm_text]

def clean_tweet(tweet):
    norm_text = normalize(tweet[0])
    return [tweet[1], tweet[2], norm_text, tweet[3], tweet[4], tweet[5], tweet[0]]

In [28]:
df = pd.read_csv("../wyns/data/tweets.txt", delimiter="~~n~~", engine="python")
df.head()

Unnamed: 0,"Fixed: Take the politics out of climate change and work with other parties to create an independent climate change commission. Climate change is a major environmental issue. We want all policies, economic or other, to enhance and not harm the environment. https://t.co/B9Tl1YoK19",174.161834,-37.292621,2018-06-14 23:51:23,0,"Auckland, New Zealand"
0,"@rolandscahill @tracymoore1013 I'd say China, ...",-112.323914,33.29026,2018-06-14 23:40:04,0,"Phoenix, AZ"
1,We live in #interestingtimes. Antarctica's mel...,174.613267,-41.362455,2018-06-14 23:34:49,0,"Wellington City, New Zealand"
2,Big announcement at #PECdinner2018: Sec. Cind...,-75.280284,39.871811,2018-06-14 23:14:33,0,"Philadelphia, PA"
3,"When you've got 70+ MM customers, energy compa...",-87.940033,41.644102,2018-06-14 23:13:56,0,"Chicago, IL"
4,“Seriously addressing climate change in the im...,-123.260264,49.638739,2018-06-14 22:42:30,0,"Squamish, British Columbia"


In [29]:
data = []
for index, row in df.iterrows():
    data.append(clean_tweet(row))
twitter = pd.DataFrame(data, columns=['long', 'lat', 'clean_text', 'time', 'retweets', 'location','raw_text'], dtype=str)
to_predict_ = twitter['clean_text'].values

In [21]:
### Now for a simple bidirectional LSTM algorithm we set our feature sizes and train a tokenizer
# First we Tokenize and get the data into a form that the model can read - this is BoW
# In this cell we are also going to define some of our hyperparameters

max_fatures = 2000
max_len=30
batch_size = 32
embed_dim = 300
lstm_out = 140

dense_out=len(labels[0]) #length of features
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(to_predict_)
to_predict = tokenizer.texts_to_sequences(to_predict_)
to_predict = pad_sequences(to_predict, maxlen=max_len, padding='post')
word_index = tokenizer.word_index

In [22]:
predictions = model.predict(to_predict)

In [23]:
print("negative predictions: {}".format(sum(np.round(predictions)[:,0])))
print("positive predictions: {}".format(sum(np.round(predictions)[:,1])))

negative predictions: 4903.0
positive predictions: 21894.0


In [24]:
df_out = pd.DataFrame([twitter['long'], twitter['lat'], twitter['clean_text'],
                      twitter['time'], twitter['retweets'], twitter['location'], predictions[:,0], predictions[:,1]]).T
df_out = df_out.rename(index=str, columns={"Unnamed 0": "negative", "Unnamed 1": "positive"})
print(df_out.shape)
df_out.head()

(26797, 8)


Unnamed: 0,long,lat,clean_text,time,retweets,location,negative,positive
0,-112.324,33.2903,i'd say china but since they're the ones who p...,2018-06-14 23:40:04,0,"Phoenix, AZ",0.582745,0.417255
1,174.613,-41.3625,we live in #interestingtimes antarctica's melt...,2018-06-14 23:34:49,0,"Wellington City, New Zealand",0.17913,0.82087
2,-75.2803,39.8718,big announcement at #pecdinner : sec cindy ada...,2018-06-14 23:14:33,0,"Philadelphia, PA",0.0825765,0.917424
3,-87.94,41.6441,when you've got + mm customers energy companie...,2018-06-14 23:13:56,0,"Chicago, IL",0.00692675,0.993073
4,-123.26,49.6387,“seriously addressing climate change in the im...,2018-06-14 22:42:30,0,"Squamish, British Columbia",0.0479214,0.952079


In [25]:
df_out.to_csv("sample_prediction.csv", index=False)

In [None]:
import re
new_lines = []
for i in df_out['clean_text']:
    raw = i.split(" ")
    out = ""
    count=0
    for j in raw:
        if count < 6:
            out += j + " "
            count += 1
        else:
            out += j + "\n"
            count = 0
    new_lines.append(out)
df_out['new_lines'] = pd.Series(new_lines)
# print(new_lines)
#     print(re.sub('( [^ ][^ ][^ ]*),', r'\1 ', i))

In [231]:
df_out.head()

Unnamed: 0,long,lat,clean_text,time,retweets,location,negative,positive,new_lines
0,-112.324,33.2903,i'd say china but since they're the ones who perpetrated that climate change hoax my money is on no korea they have better beaches for those hotels trumplethinskin wants to build,2018-06-14 23:40:04,0,"Phoenix, AZ",0.00498438,0.995016,
1,174.613,-41.3625,we live in #interestingtimes antarctica's melt continues &amp; volcanoes add hot lava to sea &amp; earthquakes raise the sea floor creating an interesting future for coastal new zealand will #wellington be ready?,2018-06-14 23:34:49,0,"Wellington City, New Zealand",0.00104104,0.998959,
2,-75.2803,39.8718,big announcement at #pecdinner : sec cindy adams dunn unveils climate change resiliency plan for pa public lands,2018-06-14 23:14:33,0,"Philadelphia, PA",0.000292582,0.999707,
3,-87.94,41.6441,when you've got + mm customers energy companies will respond this is how america is moving on #climateaction,2018-06-14 23:13:56,0,"Chicago, IL",0.895099,0.104901,
4,-123.26,49.6387,“seriously addressing climate change in the immediate future demands not a theoretically effective strategy but an actually effective one”: why carbon pricing isn’t working via,2018-06-14 22:42:30,0,"Squamish, British Columbia",0.19368,0.80632,


In [232]:
len(df_out)

16268