In [None]:
import pandas as pd
import numpy as np

# The file can be downloaded from my website and saved locally
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of the file.
df = pd.read_csv(r"C:\Users\mawal\OneDrive\Desktop\training.1600000.processed.noemoticon.csv", 
                 encoding = "ISO-8859-1",
                 names = ['y', 'id', 'date', 'notsure','handle','text'])

# 2 is a neutral sentiment, so those rows are removed.
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['y'] != 2].copy()

# The target vector is 0 and 4 so we are dividing by 4 to get 0 and 1
# (true division, so the labels are stored as floats 0.0 / 1.0)
df['y'] = df['y']/4


In [None]:
# Class balance check: rows labelled 1, rows labelled 0, and the whole frame
one = df.loc[df['y'] == 1]
zeroes = df.loc[df['y'] == 0]

# Shapes are printed in the order: label 1, label 0, full data
for frame in (one, zeroes, df):
    print(frame.shape)

In [None]:
# A smaller sample that can be run instead of the full data:
# frac = 1 returns every row in random order, and the first 100,000 rows of
# that permutation are kept.
shuffled = df.sample(frac = 1)
df_test = shuffled.iloc[:100000]

In [None]:
# Target (label column) and feature (tweet text column) vectors, taken from the full data.
# For a quicker run on the reduced sample, select the same two columns from df_test instead:
# target, features = df_test['y'], df_test['text']
target, features = df['y'], df['text']

In [None]:
# Splitting the data into training and testing sets (5% of the rows are held out)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore the hold-out set and
# every metric computed on it, identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = .05, shuffle = True,
                                                    random_state = 42)

In [None]:
# Viewing the shape of the test file and making sure the targets are shuffled
# Printing the hold-out labels shows each value next to its original row index,
# which makes the shuffling visible.
print(y_test)
# Last expression of the cell, so the notebook displays the shape of y_test.
y_test.shape

In [None]:
# Standard-library helpers
import os
cwd = os.getcwd()  # current working directory at the time this cell runs
import time

# Text-processing dependencies
import nltk
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
# Fetches NLTK's 'punkt' data on first run.
# NOTE(review): the stopwords corpus imported above is a separate NLTK
# download ('stopwords'); confirm it is installed if it is needed.
nltk.download('punkt')
import emoji

# Keras tokenizer and padding utilities
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Recording the start time of the cell
start_run = time.time()
local_time = time.ctime(start_run)
print('start time: {}'.format(local_time))

# Number of token ids kept per tweet; shorter tweets are zero-padded at the end.
max_tokens = 40

# Any run of , ! ? ; - characters is collapsed into a single period.
punctuation_run = re.compile(r'[,!?;-]+')

# Converting the tweets from the text column into plain Python lists
text_list_train = X_train.tolist()
text_list_test = X_test.tolist()

# Turning any punctuation into a period.
text_list2 = [punctuation_run.sub('.', tweet) for tweet in text_list_train]
text_list_test2 = [punctuation_run.sub('.', tweet) for tweet in text_list_test]

# Lower-casing every tweet, keeping exactly one entry per label.
# The earlier version wrapped this step in a filter (isalpha / '.' / emoji /
# stopword test) that was evaluated on whole tweets, not on tokens. It
# therefore never removed stopwords from inside a tweet; the only thing it
# could do was drop an entire tweet whose full text was a non-alphabetic
# stopword (e.g. "don't"), which leaves the features one row short of the
# labels. It also re-evaluated stopwords.words('english') per tweet and called
# emoji.get_emoji_regexp(), which newer releases of emoji no longer provide.
# Token-level stopword removal, if wanted, has to happen after splitting each
# tweet into tokens.
text_list3 = [tweet.lower() for tweet in text_list2]
text_list_test3 = [tweet.lower() for tweet in text_list_test2]

# Tokenizing using the tensorflow function so we can use <OOV>.
# The tokenizer is fitted on the training tweets only, so words that appear
# only in the test tweets are mapped to <OOV>.
tokenizer = Tokenizer(oov_token = "<OOV>")
tokenizer.fit_on_texts(text_list3)

# Word -> integer id mapping learned from the training tweets.
# Ids start at 1 ("<OOV>" takes id 1); 0 is never assigned to a word and is
# the value used for padding below.
word_index = tokenizer.word_index

# Using the tokenizer to generate sequences that are the token indices in place of the tokens
sequences = tokenizer.texts_to_sequences(text_list3)
sequences_test = tokenizer.texts_to_sequences(text_list_test3)

# Adding padding (post) so every tweet becomes exactly max_tokens ids long.
# Tweets longer than max_tokens are truncated using the pad_sequences default.
padded_sequences = pad_sequences(sequences, maxlen = max_tokens, padding = 'post')
padded_sequences_test = pad_sequences(sequences_test, maxlen = max_tokens, padding = 'post')

# Features and labels must stay row-aligned for model fitting and evaluation.
assert len(padded_sequences) == len(y_train)
assert len(padded_sequences_test) == len(y_test)

# Calculating the end time and total run time (in minutes)
end_run = time.time()
local_time = time.ctime(end_run)
print('end time: {}'.format(local_time))
duration_run = round((end_run - start_run)/60, 2)
print('Total run time: {}'.format(duration_run))

print('size of the vocabulary: {}'.format(len(word_index)))
print('First 10 words and their indices: {}'.format(list(word_index.items())[:10]))
print('First five seqeunces: {}'.format(padded_sequences[:5]))


In [None]:
# Quick sanity checks, printed in this order:
# vocabulary size, number of training tweets, number of test tweets
for size in (len(word_index), len(text_list_train), len(text_list_test)):
    print(size)

In [None]:
# A closer look at the data: the first 50 (word, id) pairs of the vocabulary,
# followed by the first five padded test sequences
first_entries = list(word_index.items())[:50]
print(first_entries)
print(padded_sequences_test[:5])

In [None]:
import numpy as np
import h5py

# Persisting the padded training/validation sequences and their labels so they
# can be loaded again in the future without re-running the preprocessing.
# Each entry is (file path, dataset name inside the file, values to store).
outputs = (
    ('padded_sequences.h5', 'padded_sequences', padded_sequences),
    ('y_train.h5', 'y_train', y_train),
    ('padded_sequences_test.h5', 'padded_sequences_test', padded_sequences_test),
    ('y_test.h5', 'y_test', y_test),
)

for file_path, dataset_name, values in outputs:
    # 'w' creates the file, replacing any existing one with the same name
    with h5py.File(file_path, 'w') as hf:
        hf.create_dataset(dataset_name, data = values)

In [None]:
# Loading the saved training and validation data back from disk (if needed).
# Each dataset is read fully into memory with [:] before its file is closed.
loaded = {}
for file_path, dataset_name in (('padded_sequences.h5', 'padded_sequences'),
                                ('y_train.h5', 'y_train'),
                                ('padded_sequences_test.h5', 'padded_sequences_test'),
                                ('y_test.h5', 'y_test')):
    with h5py.File(file_path, 'r') as hf:
        loaded[dataset_name] = hf[dataset_name][:]

X_train_bow = loaded['padded_sequences']
y_train_bow = loaded['y_train']
X_val_bow = loaded['padded_sequences_test']
y_val_bow = loaded['y_test']

# Shapes of the reloaded feature arrays: training first, then validation
print(X_train_bow.shape)
print(X_val_bow.shape)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Generating the BiLSTM model.
# The embedding has len(word_index) + 1 rows: one per tokenizer id plus row 0,
# which is the value used for padding.
model2 = keras.Sequential([layers.Embedding(len(word_index) + 1, 32),
                         layers.Bidirectional(layers.LSTM(32, dropout = .5)),
                         layers.Dropout(.5),
                         layers.Dense(16, activation = 'relu'),
                         layers.Dropout(.5),
                         # single sigmoid unit: output is the probability of label 1
                         layers.Dense(1, activation = 'sigmoid')])

model2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

model2.summary()

# Validation runs on the complete hold-out set after every epoch.
# validation_steps is intentionally not passed: it is documented for tf.data
# validation input, and with in-memory arrays it can restrict each validation
# pass to that many batches (10 batches of 16 = 160 tweets) instead of the
# full hold-out set.
history = model2.fit(padded_sequences, y_train, batch_size = 16, epochs = 3,
                     validation_data = (padded_sequences_test, y_test))

In [None]:
# Plotting training and validation accuracy per epoch
import matplotlib.pyplot as plt

# history.history maps each metric name to a list with one value per epoch
history_dict = history.history
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']

# Epochs are numbered from 1 on the x axis
epochs = range(1, len(acc) +1)

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label = 'Training acc')
ax.plot(epochs, val_acc, 'b', label = 'Validation acc')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc= 'lower right')
ax.set_ylim((0.5, 1))
plt.show()

In [None]:
import io

# Exporting the learned word embeddings so they can be explored in the
# tensorflow embedding projector
# https://projector.tensorflow.org/
# vecs.tsv gets one tab-separated vector per line; meta.tsv gets the matching
# word on the same line number.

vocab = list(word_index.keys())

weights = model2.layers[0].get_weights()[0] #0 layer is the embedding layer

# The with-block closes both files even if writing fails part-way through.
with open('vecs.tsv', 'w', encoding = 'utf-8') as out_vectors, \
     open('meta.tsv', 'w', encoding = 'utf-8') as out_metadata:
    # Each word's embedding row is looked up by the id the tokenizer gave it
    # (ids start at 1; row 0 belongs to the padding value and is not exported).
    for word, token_id in word_index.items():
        vec = weights[token_id]
        out_metadata.write(word + '\n')
        out_vectors.write('\t'.join([str(x) for x in vec]) + '\n')

In [None]:
# Loading the unlabelled BLM tweets.
# Note that this rebinds df, which until now held the training data.
df = pd.read_csv(r"C:\Users\mawal\OneDrive\Desktop\Twitter\Final_df\df_BLM.csv", encoding = "ISO-8859-1")

# The tweet texts as a plain Python list
df_text = df['text'].tolist()

In [None]:
# Preprocessing the BLM twitter data to make predictions
# Turning any punctuation into a period.
pred_text = [re.sub(r'[,!?;-]+', '.', tweet) for tweet in df_text]

# Lower-casing every tweet, keeping exactly one entry per dataframe row.
# The earlier version wrapped this step in a filter (isalpha / '.' / emoji /
# stopword test) that was evaluated on whole tweets, not on tokens. It never
# removed stopwords from inside a tweet, but it could drop an entire tweet
# whose full text was a non-alphabetic stopword (e.g. "don't"). A dropped
# tweet makes the predictions shorter than the dataframe they are assigned
# back to. It also called emoji.get_emoji_regexp(), which newer releases of
# emoji no longer provide.
pred_text = [tweet.lower() for tweet in pred_text]

# Using the tokenizer to generate sequences that are the token indices in place of the tokens
pred_sequences = tokenizer.texts_to_sequences(pred_text)

# Adding padding (post) to make all of the tweet lengths the same (cut the length at 40 tokens)
pred_data = pad_sequences(pred_sequences, maxlen = 40, padding = 'post')

# One padded sequence per input tweet, so predictions line up with the rows of df.
assert len(pred_data) == len(df_text)



In [None]:
# Generating sentiment predictions.
# The model ends in a single sigmoid unit, so predict() already returns the
# probability of label 1 for each tweet. Sequential.predict_proba() returned
# the same values but is no longer available in newer TensorFlow releases.
sentiment = model2.predict(pred_data, verbose = 1)

In [None]:
# Storing the predicted probability for each tweet in the dataframe
df['sentiment_probs'] = sentiment

# Hard 0/1 label: 1 when the probability is strictly above .5, otherwise 0
is_positive = df['sentiment_probs'] > .5
df['sentiment'] = np.where(is_positive, 1, 0)

In [None]:
# Writing the labelled dataframe (original columns plus the two sentiment columns) to the local drive
predictions_path = r'C:\Users\mawal\OneDrive\Desktop\Twitter\predictions\BLM_labelled.csv'
df.to_csv(predictions_path)