In [1]:
# importing packages 
import pandas as pd 
import numpy as np
import nltk
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import spacy
from nltk.corpus import stopwords
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# loading the stopwords library
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package stopwords to /home/paul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
def get_word_embeddings(stems):
    vectors = []
    for stem in stems: 
        token = nlp(stem)
        vectors.append(token.vector)
    return vectors

def padding_step(vectors, length = 36):
    vectors = np.array(vectors)
    dim_embed = vectors.shape[1]
    num_words = vectors.shape[0]
    diff = num_words - length 
    
    if diff == 0:
        return vectors
    else:
        if diff<0 :
            diff = np.abs(diff)
            if diff % 2 ==0:
                return np.concatenate([np.zeros(shape = (int(diff/2), dim_embed)),
                                       vectors, 
                                       np.zeros((int(diff/2), dim_embed))])
            else :
                return np.concatenate([np.zeros((int(diff/2), dim_embed)),
                                       vectors,
                                       np.zeros((int(diff/2)+1, dim_embed))])
        else : 
            return vectors[int(diff/2):int(diff/2)+length, :]


In [140]:
path = '../data/traindata.csv'
print("Loading data ...")
data = pd.read_csv(path, sep = "\t",
                   names = ["sentiment", "subject", "word", "timestamp", "original_text"])
print("Data loaded")

# first lower the text 
print("Text tokenization ...")
data['text'] = data['original_text'].apply(str.lower)
# parse the words
# we want to emphasize that there are special care to take about the word not and its contractions: 
# it might be useful to keep them
data['text'] = data["text"].apply(lambda sentence: sentence.replace("can\'t", "can not"))
data['text'] = data["text"].apply(lambda sentence: sentence.replace("n\'t", " not"))
data['words'] = data["text"].apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
print("Tokenization done")

# getting rid off stopwords
print("Removing stopwords ...")
stopwords = nltk.corpus.stopwords.words("english")
stopwords.remove("not")
data['words'] = data["words"].apply(lambda words : [word for word in words if word not in stopwords])
print("Stopwords removed")

# stemming the words with a Porter Stemmer
print("Starting stemming ...")
stemmer = nltk.porter.PorterStemmer()
data['stems'] = data["words"].apply(lambda words : [stemmer.stem(word) for word in words])
print("Stemming done")

# performing word embedding
print("Starting word embedding ...")
data['words_embedded'] = data['stems'].apply(get_word_embeddings)
print("Word embedding done")

# averaging the word embedding for a given text
data['avg_embedding'] = data['words_embedded'].apply(lambda x: np.mean(x, axis =0))

# saving polarisation appart
print("Starting final formatting of the data ...")
y = pd.get_dummies(data['sentiment'])

# transforming the aspect data into dummies
data = pd.get_dummies(data, columns = ['subject'])

# getting rid of unnecessary data
data = data[['avg_embedding',
             'subject_AMBIENCE#GENERAL', 'subject_DRINKS#PRICES',
              'subject_DRINKS#QUALITY', 'subject_DRINKS#STYLE_OPTIONS',
              'subject_FOOD#PRICES', 'subject_FOOD#QUALITY',
              'subject_FOOD#STYLE_OPTIONS', 'subject_LOCATION#GENERAL',
              'subject_RESTAURANT#GENERAL', 'subject_RESTAURANT#MISCELLANEOUS',
              'subject_RESTAURANT#PRICES', 'subject_SERVICE#GENERAL']]

for i in range(300):
    data["avg_embedding" + '_' + str(i)] = data["avg_embedding"].apply(lambda x: x[i])
data.drop(["avg_embedding"], axis = 1, inplace = True)

X = data.values
# y = y['positive']*1 + y['negative']*-1
print('Data formated')



In [81]:
X_train, y_train = formatting_data("../data/traindata.csv")

Loading data ...
Data loaded
Text tokenization ...
Tokenization done
Removing stopwords ...
Stopwords removed
Starting stemming ...
Stemming done
Starting word embedding ...
Word embedding done
Starting final formatting of the data ...
Data formated


In [141]:
X_test, y_test = formatting_data("../data/devdata.csv")

Loading data ...
Data loaded
Text tokenization ...
Tokenization done
Removing stopwords ...
Stopwords removed
Starting stemming ...
Stemming done
Starting word embedding ...
Word embedding done
Starting final formatting of the data ...
Data formated


In [115]:
from keras.optimizers import SGD, Adam

In [167]:
model = Sequential()
reduce = ReduceLROnPlateau(monitor="val_loss", patience = 10, factor=.5, verbose=1)
early  = EarlyStopping(monitor = "val_loss", patience = 100, verbose = 1)
model.add(Dense(200, input_shape = (X_train.shape[1], ), activation ='relu'))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(20, activation = 'relu'))
model.add(Dropout(.25))
model.add(Dense(3, activation = 'softmax'))
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ["accuracy"])
model.fit(X_train, y_train, 
          validation_split = .3, 
          epochs = 500, batch_size = 128, 
          verbose = 0, 
          callbacks=[reduce, early])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_149 (Dense)            (None, 200)               62600     
_________________________________________________________________
dense_150 (Dense)            (None, 100)               20100     
_________________________________________________________________
dense_151 (Dense)            (None, 50)                5050      
_________________________________________________________________
dense_152 (Dense)            (None, 20)                1020      
_________________________________________________________________
dropout_19 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_153 (Dense)            (None, 3)                 63        
Total params: 88,833
Trainable params: 88,833
Non-trainable params: 0
_________________________________________________________________


KeyboardInterrupt: 

In [150]:
model.evaluate(X_test, y_test)



[0.5969842862575612, 0.7526595731999012]

In [190]:
model = LinearSVC(C = .9)

In [191]:
flat_y_train = y_train['negative']*-1 + y_train['positive']*1
flat_y_test = y_test['negative']*-1 + y_test['positive']*1

In [192]:
model.fit(X_train, flat_y_train)

LinearSVC(C=0.9, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [193]:
model.score(X_test, flat_y_test)

0.7659574468085106

In [162]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [163]:
model = OneVsRestClassifier(LogisticRegression())

In [164]:
model.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [165]:
model.score(X_test, y_test)

0.7420212765957447

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [168]:
pred = model.predict_classes(X_test)

In [169]:
labels = pred[]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,