In [1]:
import pandas as pd
import sys
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense
from keras.layers import Embedding
from keras.models import Model
from preprocess_tweets import preprocess_tweet, generate_embedding_matrix

Using TensorFlow backend.


In [3]:
# Unpacks four values returned by generate_embedding_matrix(), which is imported
# from the project module `preprocess_tweets`.
# NOTE(review): the cells below assign X_train, X_test and embedding_matrix again
# step by step; y_train is the only name not reassigned in the visible cells.
# Confirm whether this call is still needed for anything besides y_train.
X_train, y_train, X_test, embedding_matrix = generate_embedding_matrix()

In [2]:
# Number of columns of the embedding matrix (one GloVe vector per row).
GLOVE_DIMENSION = 25
# Maximum number of words kept per tweet; also the padded sequence length.
MAX_WORDS = 30 

In [3]:
# Load the train and test tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [4]:
# Empty accumulators for the preprocessed train tweets, test tweets and train labels.
train_tweets, test_tweets, labels = [], [], []

In [5]:
# Preprocess the train tweets and collect their labels.
# The lists are rebuilt from scratch here (instead of appending to lists created
# in an earlier cell) so that re-running this cell never duplicates entries.
train_tweets = [preprocess_tweet(text) for text in df_train['text']]
labels = df_train['target'].tolist()

# Preprocess the test tweets; no labels are collected for the test set.
test_tweets = [preprocess_tweet(text) for text in df_test['text']]

print('Test tweets proccessed.')
print('Total of %s train tweets.' % len(train_tweets))
print('Total of %s test tweets.' % len(test_tweets))

Test tweets proccessed.
Total of 7613 train tweets.
Total of 3263 test tweets.


In [6]:
# Build the vocabulary: every unique word is mapped to its own integer index.
print('Bulding the vocabulary...')
m = 0                # next free index to hand out
words_freq = {}      # word -> number of occurrences
word_to_index = {}   # word -> integer index

Bulding the vocabulary...


In [7]:
# Walk the first MAX_WORDS words of every train tweet: unseen words get the next
# free index, and every occurrence bumps the word's frequency counter.
for tweet in train_tweets:
    for token in tweet.split()[:MAX_WORDS]:
        if token not in word_to_index:
            word_to_index[token] = m
            m += 1
        words_freq[token] = words_freq.get(token, 0) + 1

In [8]:
# Reserve an index for the out-of-vocabulary token "unk".
# `setdefault` keeps the existing index when "unk" already occurs in the training
# tweets. Unconditionally reassigning it to `m` would orphan its old index and
# give "unk" the same value as `vocabulary_size`, which is used as the padding
# index when the sequences are padded.
word_to_index.setdefault("unk", m)
vocabulary_size = len(word_to_index)
print('Bulding the vocabulary done, vocabulary size: %s.' % vocabulary_size)

Bulding the vocabulary done, vocabulary size: 15704.


In [9]:
print('Converting training tweets to integer sequences...')

# One list of vocabulary indices per tweet, limited to the first MAX_WORDS words;
# words missing from the vocabulary fall back to the "unk" index.
train_sequences = [
    [
        word_to_index.get(token, word_to_index["unk"])
        for token in tweet.split()[:MAX_WORDS]
    ]
    for tweet in train_tweets
]

Converting training tweets to integer sequences...


In [10]:
# Right-pad every train sequence to MAX_WORDS entries, using `vocabulary_size`
# as the filler index.
X_train = pad_sequences(
    train_sequences,
    maxlen=MAX_WORDS,
    padding="post",
    value=vocabulary_size,
)
print('Conversion done.')
print(X_train.shape)

Conversion done.
(7613, 30)


In [11]:
print('Converting testing tweets to integer sequences...')

# One list of vocabulary indices per tweet, limited to the first MAX_WORDS words;
# words missing from the vocabulary fall back to the "unk" index.
test_sequences = [
    [
        word_to_index.get(token, word_to_index["unk"])
        for token in tweet.split()[:MAX_WORDS]
    ]
    for tweet in test_tweets
]

Converting testing tweets to integer sequences...


In [12]:
# Right-pad every test sequence to MAX_WORDS entries, using `vocabulary_size`
# as the filler index.
X_test = pad_sequences(
    test_sequences,
    maxlen=MAX_WORDS,
    padding="post",
    value=vocabulary_size,
)
print('Conversion done.')
print(X_test.shape)

Conversion done.
(3263, 30)


In [13]:
print('Reading glove embeddings...')

# Parse the GloVe text file into {word: vector}. Each non-empty line is split on
# whitespace: the first field is the word, the remaining fields are the floats.
glove_embeddings = dict()
# The context manager guarantees the file handle is closed, even if a line
# fails to parse.
with open('../data/glove.twitter.27B.25d.txt', 'r', encoding='UTF-8') as glove_embeddings_file:
    for line in glove_embeddings_file:
        parts = line.split()
        if not parts:
            # Skip blank lines instead of failing on parts[0].
            continue
        key = parts[0]
        embedding = [float(t) for t in parts[1:]]
        glove_embeddings[key] = np.array(embedding)
print ("Done reading embeddings")

Reading glove embeddings...
Done reading embeddings


In [14]:
# Build the embedding matrix for the vocabulary (consumed by the Keras Embedding
# layer). Row `idx` holds the GloVe vector of the word mapped to `idx`; the
# matrix has one extra row at index `vocabulary_size`.
print('Generating the embedding matrix...')
hits = 0
unknown = []
embedding_matrix = np.zeros((vocabulary_size + 1, GLOVE_DIMENSION))
for word, idx in word_to_index.items():
    found = word in glove_embeddings
    if found:
        hits += 1
    else:
        # Remember the miss and fall back to the GloVe vector of "unk".
        unknown.append(word)
    embedding_matrix[idx] = glove_embeddings[word if found else "unk"]

Generating the embedding matrix...


In [15]:
# The extra row at index `vocabulary_size` is set to an all-zero vector.
embedding_matrix[vocabulary_size] = np.zeros(GLOVE_DIMENSION)
print('Generating done.')
print('%s words of %s found' % (hits, vocabulary_size))
print(embedding_matrix.shape)

Generating done.
12347 words of 15704 found
(15705, 25)
