In [1]:
# importing packages 
import pandas as pd 
import numpy as np
import nltk
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import spacy
from nltk.corpus import stopwords
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# loading the stopwords library
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package stopwords to /home/paul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def get_word_embeddings(stems):
    vectors = []
    for stem in stems: 
        token = nlp(stem)
        vectors.append(token.vector)
    return vectors

In [25]:
def padding_step(vectors, length = 36):
    vectors = np.array(vectors)
    dim_embed = vectors.shape[1]
    num_words = vectors.shape[0]
    diff = num_words - length 
    
    if diff == 0:
        return vectors
    else:
        if diff<0 :
            diff = np.abs(diff)
            if diff % 2 ==0:
                return np.concatenate([np.zeros(shape = (int(diff/2), dim_embed)),
                                       vectors, 
                                       np.zeros((int(diff/2), dim_embed))])
            else :
                return np.concatenate([np.zeros((int(diff/2), dim_embed)),
                                       vectors,
                                       np.zeros((int(diff/2)+1, dim_embed))])
        else : 
            return vectors[int(diff/2):int(diff/2)+length, :]

In [5]:
class Classifier:
    """Le Classifier"""
    def __init__(self):
        pass
    
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile"""
        # Loading the training data
        print("Loading data ...")
        train_data = pd.read_csv(trainfile, sep = "\t",
                                 names = ["sentiment", "subject", "word", "timestamp", "original_text"])
        print("Data loaded")
        
        # first lower the text 
        print("Text tokenization ...")
        train_data['text'] = train_data['original_text'].apply(str.lower)
        # parse the words
        # we want to emphasize that there are special care to take about the word not and its contractions: 
        # it might be useful to keep them
        train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("can\'t", "can not"))
        train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("n\'t", " not"))
        train_data['words'] = train_data["text"].apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
        print("Tokenization done")
        
        # getting rid off stopwords
        print("Removing stopwords ...")
        self.stopwords = stopwords.words("english")
        self.stopwords.remove("not")
        train_data['words'] = train_data["words"].apply(lambda words : [word for word in words if word not in self.stopwords])
        print("Stopwords removed")
        
        # stemming the words with a Porter Stemmer
        print("Starting stemming ...")
        stemmer = nltk.porter.PorterStemmer()
        train_data['stems'] = train_data["words"].apply(lambda words : [stemmer.stem(word) for word in words])
        print("Stemming done")
        
        # performing word embedding
        print("Starting word embedding ...")
        train_data['words_embedded'] = train_data['stems'].apply(get_word_embeddings)
        print("Word embedding done")
        # averaging the word embedding for a given text
        train_data['avg_embedding'] = train_data['words_embedded'].apply(lambda x: np.mean(x, axis =0))
        
        # saving polarisation appart
        print("Starting final formatting of the data ...")
        y = pd.get_dummies(train_data['sentiment'])
        
        # transforming the aspect data into dummies
        train_data = pd.get_dummies(train_data, columns = ['subject'])
        
        # getting rid of unnecessary data
        train_data = train_data[['avg_embedding',
                                 'subject_AMBIENCE#GENERAL', 'subject_DRINKS#PRICES',
                                 'subject_DRINKS#QUALITY', 'subject_DRINKS#STYLE_OPTIONS',
                                 'subject_FOOD#PRICES', 'subject_FOOD#QUALITY',
                                 'subject_FOOD#STYLE_OPTIONS', 'subject_LOCATION#GENERAL',
                                 'subject_RESTAURANT#GENERAL', 'subject_RESTAURANT#MISCELLANEOUS',
                                 'subject_RESTAURANT#PRICES', 'subject_SERVICE#GENERAL']]
        
        for i in range(300):
            train_data["avg_embedding" + '_' + str(i)] = train_data["avg_embedding"].apply(lambda x: x[i])
        train_data.drop(["avg_embedding"], axis = 1, inplace = True)
        
        self.X = train_data.values
        self.y = y['positive']*1 + y['negative']*-1
        
            
        self.model = SVC()
        print("Data formatted")
        
        print("Starting model fitting ...")
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size = .3)
        
        self.model.fit(X_train, y_train)  
        print("Model fitted")
        
        
    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        raise(NotImplemented('Implement it !'))

In [6]:
####### DEV MODE #####
classifier = Classifier()

In [7]:
classifier.train("../data/traindata.csv")

Loading data ...
Data loaded
Text tokenization ...
Tokenization done
Removing stopwords ...
Stopwords removed
Starting stemming ...
Stemming done
Starting word embedding ...
Word embedding done
Starting final formatting of the data ...


AttributeError: 'Classifier' object has no attribute 'y'

In [8]:
actual_values = pd.read_csv('../data/traindata.csv', sep = '\t', names = ["polarisation", "1", "2", "3", "4"])
y = actual_values["polarisation"]

In [9]:
y = pd.get_dummies(y)


In [10]:
y = y['negative']*-1 + y['positive']*1

In [11]:
X = classifier.X

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .3)

In [18]:
from sklearn.svm import LinearSVC
model = LinearSVC(C = .35)

In [19]:
model.fit(X_train, y_train)

LinearSVC(C=0.35, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [20]:
predictions = model.predict(X_test)

In [36]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold
kfold = KFold(5)

In [39]:
optimization = {}

for c in [.25, .5, .75, 1, 1.25]:

    scores = []
    model = LinearSVC(C=c)
    for train_index, test_index in kfold.split(X):
        X_train = X[train_index, :]
        X_test  = X[test_index, :]
        y_train = y[train_index]
        y_test  = y[test_index]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        scores += [sum(predictions == y_test)/len(y_test)]
        
    optimization[str(c)] = np.mean(scores)

In [40]:
optimization

{'0.25': 0.777078626799557,
 '0.5': 0.7797430786267996,
 '0.75': 0.7790741971207088,
 '1': 0.7737519379844962,
 '1.25': 0.7737563676633444}

In [44]:
K.clear_session()
model = Sequential()
model.add(Dense(200, input_shape = (X.shape[1], ), activation = 'relu'))
model.add(Dropout(.1))
model.add(Dense(200, activation = 'relu'))
model.add(Dense(3, activation = 'softmax'))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.fit(X_train, pd.get_dummies(y_train), epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f178fe82b00>

In [45]:
model.predict_classes(X_test)

array([2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2,
       2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 2, 2, 0, 2, 0,
       0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2,
       0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2])

In [47]:
model.evaluate(X_test, pd.get_dummies(y_test))



[0.7156643708546956, 0.7833333325386047]

In [27]:
import matplotlib.pyplot as plt
%matplotlib inline


In [28]:
plt.matshow(padding_step(train_data['vectors'][303], length = 36))
plt.savefig('/home/paul/Desktop/vector.png')

NameError: name 'train_data' is not defined

In [180]:
train_data.vectors.apply(len).argmax()

  """Entry point for launching an IPython kernel.


303