In [1]:
# importing packages 
import pandas as pd 
import numpy as np
import nltk
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import spacy
from nltk.corpus import stopwords
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Download the two NLTK data packages, then load the large English spaCy model.
# `nlp` is the module-level pipeline that get_word_embeddings relies on.
for package in ('stopwords', 'averaged_perceptron_tagger'):
    nltk.download(package)
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package stopwords to /home/paul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def get_word_embeddings(stems):
    """Embed every string of `stems` with the module-level spaCy pipeline `nlp`.

    Each string is passed to `nlp` on its own and the `.vector` attribute of
    the result is collected.

    Parameters
    ----------
    stems : iterable of str
        Strings to embed, one pipeline call per element.

    Returns
    -------
    list
        One vector per input string, in input order (empty list for empty input).
    """
    return [nlp(stem).vector for stem in stems]

def padding_step(vectors, length = 36):
    """Force a sequence of word vectors to exactly `length` rows.

    Parameters
    ----------
    vectors : array-like, 2-D (num_words, dim_embed)
        Word vectors, one row per word.
    length : int, default 36
        Number of rows of the returned array.

    Returns
    -------
    numpy.ndarray of shape (length, dim_embed)
        * fewer than `length` rows: zero rows are added before and after the
          data; when the number of missing rows is odd, the extra zero row
          goes at the end;
        * more than `length` rows: a centred window of `length` rows is kept;
        * exactly `length` rows: the array is returned as is.
    """
    matrix = np.array(vectors)
    n_cols = matrix.shape[1]
    n_rows = matrix.shape[0]
    surplus = n_rows - length

    # Already the right size: nothing to do.
    if surplus == 0:
        return matrix

    # Too long: keep the central `length` rows.
    if surplus > 0:
        start = int(surplus / 2)
        return matrix[start:start + length, :]

    # Too short: split the missing rows between top and bottom.
    missing = np.abs(surplus)
    top = int(missing / 2)
    bottom = top if missing % 2 == 0 else top + 1
    return np.concatenate([np.zeros((top, n_cols)),
                           matrix,
                           np.zeros((bottom, n_cols))])


In [None]:
# Build the feature matrix `X` and the label vector `y` from the raw training file.
#
# Pipeline: load -> lower-case -> expand "n't" -> tokenize -> drop stopwords ->
# stem -> embed each stem -> average the embeddings per text -> append the one-hot
# encoded aspect category.
#
# NOTE(review): `trainfile` is not defined in this cell; it must hold the path of the
# tab-separated training file before the cell runs -- confirm where it is set.

# Loading the training data
print("Loading data ...")
train_data = pd.read_csv(trainfile, sep = "\t",
                         names = ["sentiment", "subject", "word", "timestamp", "original_text"])
print("Data loaded")


def _tokenize(sentence):
    """Split an already lower-cased sentence into alphabetic words.

    Every non-alphabetic character is treated as a separator.
    """
    return "".join(char if char.isalpha() else " " for char in sentence).split()


# first lower the text
print("Text tokenization ...")
train_data['text'] = train_data['original_text'].apply(str.lower)
# The negation "not" and its contractions are expanded so that the negation
# survives tokenization as a separate word ("can't" first, because the generic
# "n't" rule would otherwise turn it into "ca not").
train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("can\'t", "can not"))
train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("n\'t", " not"))
train_data['words'] = train_data["text"].apply(_tokenize)
print("Tokenization done")

# getting rid of stopwords; "not" is kept on purpose (see above).
# A set gives constant-time lookups and, unlike list.remove, does not raise
# if "not" is missing from the stopword list.
print("Removing stopwords ...")
stop_words = set(stopwords.words("english")) - {"not"}
train_data['words'] = train_data["words"].apply(lambda words : [word for word in words if word not in stop_words])
print("Stopwords removed")

# stemming the words with a Porter Stemmer
print("Starting stemming ...")
stemmer = nltk.stem.PorterStemmer()
train_data['stems'] = train_data["words"].apply(lambda words : [stemmer.stem(word) for word in words])
print("Stemming done")

# performing word embedding
# NOTE(review): the embeddings are looked up on Porter *stems*; truncated stems may
# not be in the spaCy vocabulary -- consider embedding `words` instead (to confirm).
print("Starting word embedding ...")
train_data['words_embedded'] = train_data['stems'].apply(get_word_embeddings)
print("Word embedding done")

# Width of the word vectors, read from the loaded model instead of hard-coding it.
embedding_dim = nlp.vocab.vectors_length


def _average_embedding(vectors):
    """Return the mean of `vectors`, or a zero vector when the text has no word left.

    Without the guard, np.mean([]) yields a NaN scalar and the later column
    expansion fails on it.
    """
    if len(vectors) == 0:
        return np.zeros(embedding_dim, dtype = "float32")
    return np.mean(vectors, axis = 0)


# averaging the word embedding for a given text
train_data['avg_embedding'] = train_data['words_embedded'].apply(_average_embedding)

# saving the polarity apart: positive -> 1, negative -> -1, anything else -> 0
print("Starting final formatting of the data ...")
y = train_data['sentiment'].map({"positive": 1, "negative": -1}).fillna(0).astype(int)

# transforming the aspect data into dummies; reindexing on the fixed column list
# keeps the feature layout stable even if a category is absent from the file
subject_columns = ['subject_AMBIENCE#GENERAL', 'subject_DRINKS#PRICES',
                   'subject_DRINKS#QUALITY', 'subject_DRINKS#STYLE_OPTIONS',
                   'subject_FOOD#PRICES', 'subject_FOOD#QUALITY',
                   'subject_FOOD#STYLE_OPTIONS', 'subject_LOCATION#GENERAL',
                   'subject_RESTAURANT#GENERAL', 'subject_RESTAURANT#MISCELLANEOUS',
                   'subject_RESTAURANT#PRICES', 'subject_SERVICE#GENERAL']
subject_dummies = (pd.get_dummies(train_data['subject'], prefix = 'subject')
                   .reindex(columns = subject_columns, fill_value = 0))

# one column per embedding dimension, built in a single step
embedding_matrix = np.asarray(train_data['avg_embedding'].tolist(),
                              dtype = "float32").reshape(-1, embedding_dim)
embedding_columns = ["avg_embedding" + '_' + str(i) for i in range(embedding_dim)]
embedding_frame = pd.DataFrame(embedding_matrix, columns = embedding_columns,
                               index = train_data.index)

# getting rid of unnecessary data: only the aspect dummies and the embedding
# columns are kept, in that order
train_data = pd.concat([subject_dummies, embedding_frame], axis = 1)

# Explicit float cast: recent pandas returns boolean dummies, which would
# otherwise turn the mixed frame into an object array.
X = train_data.values.astype("float32")