In [1]:
# importing packages 
import pandas as pd 
import numpy as np
import nltk
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import spacy
from nltk.corpus import stopwords
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# loading the stopwords library
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package stopwords to /home/paul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [37]:
class Classifier:
    """Le Classifier"""
    def __init__(self):
        pass
    
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile"""
        # Loading the training data
        train_data = pd.read_csv(trainfile, sep = "\t",
                                 names = ["sentiment", "subject", "word", "timestamp", "original_text"])
        
        # first lower the text 
        train_data['text'] = train_data['original_text'].apply(str.lower)
        # parse the words
        # we want to emphasize that there are special care to take about the word not and its contractions: 
        # it might be useful to keep them
        train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("can\'t", "can not"))
        train_data['text'] = train_data["text"].apply(lambda sentence: sentence.replace("n\'t", " not"))
        train_data['words'] = train_data["text"].apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
        
        # getting rid off stopwords
        self.stopwords = stopwords.words("english")
        self.stopwords.remove("not")
        
        train_data['words'] = train_data["words"].apply(lambda words : [word for word in words if word not in self.stopwords])
        
        # stemming the words with a Porter Stemmer
        stemmer = nltk.porter.PorterStemmer()
        train_data['stems'] = train_data["words"].apply(lambda words : [stemmer.stem(word) for word in words])
        

        self.sentiments = pd.get_dummies(train_data['sentiment'])
        

        print("Starting Word Embedding")
        train_data['vectors'] = train_data['stems'].apply(get_word_embeddings)
        
        
        def padding(vectors, length = 36):
            vectors = np.array(vectors)
            size = vectors.shape[1]
        #train_data['mean_vector'] = train_data['vectors'].apply(lambda x: np.mean(x, axis = 0))
        # Storing the training data into an attribute of the Classifier
        self.data_train = train_data
        
        # keeping the categories that we will be trying to predict
        self.label_categories = pd.get_dummies(train_data['subject'])
        self.categories = self.label_categories.columns
        self.label_sentiment = train_data['sentiment']
        """
        # perform wordcount
        word_count = {}
        for row in train_data['stems']:
            for word in row:
                if word in word_count.keys():
                    word_count[word]+=1
                else:
                    word_count[word]=1
                
        self.vocabulary = np.unique(word_count.keys())
        self.word_count = word_count
        """
        
    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        raise(NotImplemented('Implement it !'))

In [3]:
train_data = pd.read_csv("../data/traindata.csv", sep = "\t",
                                 names = ["sentiment", "subject", "word", "timestamp", "original_text"])

In [4]:
train_data.head()

Unnamed: 0,sentiment,subject,word,timestamp,original_text
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


In [21]:
stopwords = nltk.corpus.stopwords.words('english')

train_data['words'] = train_data["original_text"].apply(lambda sentence:  "".join((char if char.isalpha() else " ") for char in sentence).lower().split() )
train_data['words'] = train_data['words'].apply(lambda words : [word for word in words if word not in stopwords ])

In [22]:
train_data['pos_tagging'] = train_data['words'].apply(nltk.pos_tag)

In [32]:
train_data['pos_tagging_string'] = train_data.pos_tagging.apply(lambda pos_labels: [tuple2string(x) for x in pos_labels])

In [33]:
vocabulary = {}
for index, words in enumerate(train_data.pos_tagging_string):
    for pos in words :
        if pos in vocabulary.keys():
            vocabulary[pos]+=[index]
        else :
            vocabulary[pos]=[index]

In [None]:
for tag in vocabulary.keys():
    train_data[tag]=0
    for index in vocabulary[tag]:
        train_data[tag][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [None]:
def tuple2string(tup):
    return str(tup[0] + '_' + tup[1])

In [28]:
train_data.pos_tagging_string

0       [short_JJ, sweet_NN, seating_VBG, great_JJ, ro...
1       [quaint_NN, romantic_JJ, trattoria_NN, top_JJ,...
2       [different_JJ, beers_NNS, offer_VBP, thier_JJR...
3                                   [staff_NN, fired_VBD]
4       [menu_NN, looked_VBD, great_JJ, waiter_NN, nic...
5        [tuna_NN, wasabe_NN, potatoes_NNS, excellent_JJ]
6       [whole_JJ, set_VBN, truly_RB, unprofessional_J...
7       [sometimes_RB, get_VB, bad_JJ, food_NN, bad_JJ...
8       [place_NN, best_JJS, chinese_JJ, style_NN, bbq...
9       [great_JJ, place_NN, relax_NN, enjoy_NN, dinne...
10      [bread_NN, received_VBD, horrible_JJ, rock_NN,...
11      [thought_JJ, place_NN, using_VBG, much_JJ, msg...
12      [always_RB, good_JJ, drinks_NNS, service_NN, p...
13      [particular_JJ, sushi_NN, please_NN, every_DT,...
14      [prix_NN, fixe_NN, menu_NN, worth_NN, every_DT...
15      [scallops_NNS, appetizer_VBP, delicious_JJ, sa...
16      [ambience_NN, cute_NN, quaint_NN, good_JJ, bus...
17      [best_