# EJERCICIO N 1

En esta primera prueba, intentamos extraer un feature vector para cada review. Este es un vector de 300 características, consistiendo de

80 -> 1 gramas

70 -> 2 gramas

60 -> 3 gramas

50 -> 4 gramas

40 -> 5 gramas


300 Features de los ngramas mas frecuentes en toda la base de datos


In [1]:
# -*- coding: utf-8 -*-
"""
Titulo: Multilayer Neural Network
Autor: M. Portocarrero

RED NEURAL PARA CLASIFICACION BINARIA

Recordemos que el objetivo de esta red es la de resolver un problema de clasificacion de dos clases
Por ello, sólo tendremos una neurona en la Output Layer

La arquitectura de esta red neural es como sigue
n: numero de datos
Input Layer:
    j características
Hidden Layer:
    un numero x de neuronas
Output Layer:
    una sola neurona, con función Sigmoide
"""
""" <--- IMPORTS ---> """
import numpy as np
import pandas as pd

""" <--- FUNCTIONS ---> """
def sigmoid(x,derivative = False):
    if derivative:
        return sigmoid(x)*(1 - sigmoid(x))
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_prime(x):
    """ Funcion para calcular la derivada del sigmoide cuando x es sig(x)"""
    return x * (1 - x)

def evalPrediction(x,threshold,target):
    """Funcion que evalua el acierto de una salida, Devuelve un booleano"""
    if x > threshold:
        return target == 1
    else:
        return target == 0
        
sig_PrimeVectorized = np.vectorize(sigmoid_prime)
sigVectorized = np.vectorize(sigmoid)
evalPredictionVectorized = np.vectorize(evalPrediction)


In [2]:
""" <--- OBJECTS ---> """
class Weights(object):
    def __init__(self, numFeatures,numNeurons):
        self.numFeatures =  numFeatures
        self.numNeurons = numNeurons
        self.weights = np.random.random((numFeatures,numNeurons))
    
    def printWeights(self):
        print(self.weights)

class DeepNeuralNetwork(object):
    """
    Red Neural que reune las capas y las matrices de pesos
    data : Matriz con todos los datos de entrenamiento
            Filas -> cada dato de entrenamiento, Cols: los coeficientes
    target : Matriz donde se almacenan los resultados
    """
    def __init__(self,neurons,data,target,_test_data,_test_target):
        # We include the bias at the start of data matrix
        self.training_data = data
        self.n = data.shape[0] #Numero de ejemplos en el training
        self. numFeatures = self.training_data.shape[1]
        self.training_target = target
        # Input Layer
        self.inLayer = None
        # First Weight Matrix(nro col = nro de neuronas)
        self.neuronsInHiddenLayer = neurons
        # acordemonos que un feature adicional seria el bias
        self.weights_1 = np.random.random((self.numFeatures+1,self.neuronsInHiddenLayer))
        # Hidden Layers
        self.hiddenLayer = None        
        # Second Weight Matrix, Agregar una columna para el bias
        self.weights_2 = np.random.random((self.neuronsInHiddenLayer + 1,1)) # Solo una neurona de salida
        # Output Layer
        self.outLayer = None
        self.Layers = None
        
        # Loading Test Data, (Agregamos una fila de 1 para el bias)
        _test_data = np.append( np.full((len(_test_data),1),1),_test_data,axis = 1)
        self.test_data = _test_data
        self.test_target = _test_target
        
        ## Labels para imprimir los arreglos
        self.enum = {
                0:"Input Data:\n",
                1:"Weights 1:\n",
                2:"Hidden Layer:\n",
                3:"Weights 2:\n",
                4:"Output Layer:\n",}
        
        
    def forwardPropagation(self,i):
        # Seleccionamos la cantidad de datos
        self.inLayer = np.array([self.training_data[i%self.n]])
        #self.inLayer = np.array(self.training_data)
        self.inLayer = np.append( np.full((len(self.inLayer),1),1),self.inLayer,axis = 1)
        
        self.hiddenLayer = np.dot(self.inLayer,self.weights_1)
        self.hiddenLayer = sigmoid(self.hiddenLayer)
        self.hiddenLayer = np.append( np.full((len(self.hiddenLayer),1),1),self.hiddenLayer,axis = 1)
        
        self.outLayer = np.dot(self.hiddenLayer,self.weights_2)
        self.outLayer = sigmoid(self.outLayer)
        
        self.updateLayers()
        
    def updateLayers(self):
        self.Layers = [self.inLayer,self.weights_1,self.hiddenLayer,self.weights_2,self.outLayer]
        
    def backPropagation(self,i,learning_rate):
        """Back Propagation for Stochastic"""
        error = (self.outLayer[0][0] - self.training_target[i%self.n])
        weights_1 = self.weights_1
        weights_2 = self.weights_2
        
        # First Layer Propagation
        weights_2 = weights_2 - learning_rate * error * self.hiddenLayer.T
        
        # Second Layer Propagation
        #Expresiones temporales
        #Derivada del sigmoide = sig x * (1-sig x)
        hidden = self.hiddenLayer.T 
        hidden = np.delete(hidden,(0),axis = 0)
        hidden = sig_PrimeVectorized(hidden).T
    
        derivative = error * np.dot(np.array([self.inLayer[0]]).T,hidden)

        w2 = self.weights_2
        w2 = np.delete(w2,(0),axis=0)
        w2 = np.tile(w2,self.neuronsInHiddenLayer)
        derivative = np.dot(derivative,w2)
        #print(derivative)

        weights_1 = weights_1 - learning_rate * derivative
        
        
        self.weights_1 = weights_1
        self.weights_2 = weights_2
        self.updateLayers()
        
        
        
    def Train(self,iterations,learn_rate):
        """
        Function that executes the training that is one entire cycle
        Adjust the weights matrices
        """
        alpha = learn_rate
        b= True
        for i in range(iterations):
            self.forwardPropagation(i)
            self.backPropagation(i,alpha)
            #self.printNN()
            if(i % 10000 == 0):
                print("Progress:",i / 10000,"%")
                if(self.getAccuracy() > 70 and b):
                    alpha /= 10
                    b = False
        self.printNN()
        
    def printNN(self):
        print("=================")
        i = 0
        for layer in self.Layers:
            print(self.enum[i],layer)
            i +=1
        print("=================")
    def getAccuracy(self):
        """
        Funcion que imprime el accuracy con todos los datos de test
        """
        prediction = np.dot(self.test_data,self.weights_1)
        prediction = sigmoid(prediction)
        prediction = np.append( np.full((len(prediction),1),1),prediction,axis = 1)
        prediction = np.dot(prediction,self.weights_2)
        prediction = sigmoid(prediction)
        
        threshold = 0.5
    
        #print(prediction)
        
        xs = evalPredictionVectorized(prediction,threshold,self.test_target)
        #print(xs)
        
        numSuccesses = np.count_nonzero(xs == True)

        
        print("Acc:", numSuccesses / len(xs) * 100)
        return numSuccesses / len(xs) * 100

In [3]:
df = pd.read_csv('shuffled_movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,I have seen this movie and in all honestly was...,0
1,I only saw this recently but had been aware of...,1
2,Albert Pyun delivers a very good action/drama ...,1
3,This is a truly great and beautiful movie. The...,1
4,"You know, I'm sure the boys were sitting aroun...",0


In [4]:
import numpy as np
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
porter = PorterStemmer()

# Devuelve una lista de tokens filtrados por stopwords conservando la secuencia
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    #print(text.split())
    text = [w for w in text.split() if w not in stop]
    tokenized = [porter.stem(w) for w in text]
    return text

#tokenizer(df.iloc[1]['review'])

In [5]:
from nltk.util import ngrams
from nltk import FreqDist

n = 50000
print('Tokenizing Opinions')
corpus_words = []
for i in range(n):
    corpus_words += [tokenizer(df.iloc[i]['review'])]
#print(corpus_words)

Tokenizing Opinions


In [6]:
#Generamos un diccionario con las palabras mas frecuentes
#para ngramas n = 1,2,3,4,5
word_dict = {}
for size in 1,2,3,4,5:
    word_dict[size] =  FreqDist(ngrams(corpus_words[0],size))

print("Adding up all values")
for i in range(1,n):
    for size in 1,2,3,4,5:
        word_dict[size].update(FreqDist(ngrams(corpus_words[i],size)))
    if i %10000 == 0:
        print("Completed ",20*i/10000,"%")  


Tokenizing Opinions
Adding up all values
Completed  20.0 %
Completed  40.0 %
Completed  60.0 %
Completed  80.0 %


In [7]:
#word_dict[3].most_common(2)[0][0]

## Aca debemos contar la ocurrencia de los n gramas obtenidos y guardarlos
## para formar un vector para cada dato de ejemplo

number_features = 0
keys = []
for i in 1,2,3,4,5:
    number_features+= 90-10*i
    tmp = word_dict[i].most_common(90-10*i)
    for j in range(90-10*i):
        keys += (tmp[j][0],)
#keys
#number_features = 300!

In [8]:
#print(keys)
##Dictionario que almacena los indices para cada palabra en la lista keys
text_tmp_dict = {}
for i in range(number_features):
    text_tmp_dict[keys[i]] = i
#print(text_tmp_dict)

## La funcion cuenta el numero de apariciones de los tokens para cada valor de entrenamiento
## y los almacena en una array
def get_vectors(start,stop,number_features):
    trainingSet_length = stop - start
    trainingSet = []
    for example in range(start,stop):
        text_tmp = corpus_words[example]
        #print(text_tmp)
        #print("Occurency in Example")

        word_vector = [0 for i in range(number_features)]
        #print(word_vector)

        for i in 1,2,3,4,5:
            for token in ngrams(text_tmp,i):
                if token in keys:
                    #print(token)
                    word_vector[text_tmp_dict[token]] += 1
                    #text_tmp_dict[token][1] += 1
        #print(word_vector)
        trainingSet += [word_vector]
        if(example % 1000 == 0):
            print("Progreso: ", example /1000,"%")
        
    #El training set que se retorna contiene una columna mas debido al BIAS
    trainingSet = np.array(trainingSet)
    #trainingSet = np.c_[np.ones(trainingSet_length),trainingSet]
    print("Finished")
    return trainingSet

data = get_vectors(0,50000,number_features)
#print(data, data.shape)

Progreso:  0.0 %
Progreso:  1.0 %
Progreso:  2.0 %
Progreso:  3.0 %
Progreso:  4.0 %
Progreso:  5.0 %
Progreso:  6.0 %
Progreso:  7.0 %
Progreso:  8.0 %
Progreso:  9.0 %
Progreso:  10.0 %
Progreso:  11.0 %
Progreso:  12.0 %
Progreso:  13.0 %
Progreso:  14.0 %




Progreso:  15.0 %
Progreso:  16.0 %
Progreso:  17.0 %
Progreso:  18.0 %
Progreso:  19.0 %
Progreso:  20.0 %
Progreso:  21.0 %
Progreso:  22.0 %
Progreso:  23.0 %
Progreso:  24.0 %
Progreso:  25.0 %
Progreso:  26.0 %
Progreso:  27.0 %
Progreso:  28.0 %
Progreso:  29.0 %
Progreso:  30.0 %
Progreso:  31.0 %
Progreso:  32.0 %
Progreso:  33.0 %
Progreso:  34.0 %
Progreso:  35.0 %
Progreso:  36.0 %
Progreso:  37.0 %
Progreso:  38.0 %
Progreso:  39.0 %
Progreso:  40.0 %
Progreso:  41.0 %
Progreso:  42.0 %
Progreso:  43.0 %
Progreso:  44.0 %
Progreso:  45.0 %
Progreso:  46.0 %
Progreso:  47.0 %
Progreso:  48.0 %
Progreso:  49.0 %
Finished


In [9]:
## Obtenemos los valores target
## Obteniendo los valores de los test
target = np.array([df['sentiment']])

target = target.T ## REPRESENTA A LOS VALORES TARGET
print(target[:5],target.shape) # Verificamos correspondencia

[[0]
 [1]
 [1]
 [1]
 [0]] (50000, 1)


In [10]:
# Separamos la data 45k para el training y 5k para el target
training_data = data[:45000]
training_target = target[:45000]

test_data = data[45000:]
test_target = target[45000:]

El accuracy de clasificacion alcanzado llega a un maximo de 75% sobre 5k muestras de test.
Se trato de subir el accuracy sobre entrenando en las muestras de training pero no se tuvo mejora.
Tal vez aumentando un poco mas el numero de entrenamientos

NOTA: EL learning rate es dinámico, es decir, se reduce a medida que se va alcanzando mayor accuracy.

In [17]:
NN = DeepNeuralNetwork(32,training_data,training_target,test_data,test_target)
NN.Train(1000000,0.01)
NN.getAccuracy()

Progress: 0.0 %
Acc: 50.44
Progress: 1.0 %
Acc: 50.44
Progress: 2.0 %
Acc: 49.559999999999995
Progress: 3.0 %
Acc: 50.339999999999996
Progress: 4.0 %
Acc: 49.559999999999995
Progress: 5.0 %
Acc: 49.559999999999995
Progress: 6.0 %
Acc: 50.44
Progress: 7.0 %
Acc: 49.66
Progress: 8.0 %
Acc: 50.44
Progress: 9.0 %
Acc: 50.42
Progress: 10.0 %
Acc: 50.44
Progress: 11.0 %
Acc: 49.8
Progress: 12.0 %
Acc: 50.54
Progress: 13.0 %
Acc: 49.68
Progress: 14.0 %
Acc: 49.559999999999995
Progress: 15.0 %
Acc: 60.0
Progress: 16.0 %
Acc: 65.12
Progress: 17.0 %
Acc: 70.44
Progress: 18.0 %
Acc: 71.88
Progress: 19.0 %
Acc: 72.46000000000001
Progress: 20.0 %
Acc: 72.68
Progress: 21.0 %
Acc: 72.56
Progress: 22.0 %
Acc: 72.42
Progress: 23.0 %
Acc: 72.88
Progress: 24.0 %
Acc: 72.92
Progress: 25.0 %
Acc: 73.2
Progress: 26.0 %
Acc: 73.04
Progress: 27.0 %
Acc: 73.06
Progress: 28.0 %
Acc: 73.2
Progress: 29.0 %
Acc: 73.22
Progress: 30.0 %
Acc: 73.54
Progress: 31.0 %
Acc: 73.3
Progress: 32.0 %
Acc: 73.32
Progress: 33.0

74.98

# EJERCICIO N° 2

En esta parte realizaremos el mismo ejercicio pero usando word embedding 
(word2vec). El metodo es sencillo, se calculara el embedding para cada review, realizando el promedio de la suma de los embeddings de cada palabra encontrada en el preview.
NOTA: Acordemonos que usamos palabras prefiltradas por stopwords, en el tokenizer.

In [6]:
from gensim.models import Word2Vec

model = Word2Vec(corpus_words,size=100)
#print(model)
w2v = dict(zip(model.wv.index2word,model.wv.syn0))
#print(w2v)


In [52]:
def MeanEmbeddingVectorizer(tokenized_review):
    mean_vector = []
    for word in tokenized_review:
        if word in w2v:
            mean_vector.append(w2v[word].tolist())
        else:
            mean_vector.append(np.zeros(100))
    mean_vector = np.mean(mean_vector,axis = 0)
    #print(mean_vector)
    return mean_vector

data = []
for i in range(50000):
    data.append(MeanEmbeddingVectorizer(corpus_words[i]))
data = np.array(data)
print(data.shape)

## Obtenemos los valores target
## Obteniendo los valores de los test
target = np.array([df['sentiment']])

target = target.T ## REPRESENTA A LOS VALORES TARGET
print(target[:5],target.shape) # Verificamos correspondencia

(50000, 100)
[[0]
 [1]
 [1]
 [1]
 [0]] (50000, 1)


In [60]:
training_data = data[:45000]
training_target = target[:45000]

test_data = data[45000:]
test_target = target[45000:]

In [63]:
NN = DeepNeuralNetwork(32,training_data,training_target,test_data,test_target)
NN.Train(1000000,0.001)
NN.getAccuracy()

Progress: 0.0 %
Acc: 50.44
Progress: 1.0 %
Acc: 57.720000000000006
Progress: 2.0 %
Acc: 61.040000000000006
Progress: 3.0 %
Acc: 65.36
Progress: 4.0 %
Acc: 69.94
Progress: 5.0 %
Acc: 73.7
Progress: 6.0 %
Acc: 74.5
Progress: 7.0 %
Acc: 74.96000000000001
Progress: 8.0 %
Acc: 75.18
Progress: 9.0 %
Acc: 75.56
Progress: 10.0 %
Acc: 75.9
Progress: 11.0 %
Acc: 76.16000000000001
Progress: 12.0 %
Acc: 76.7
Progress: 13.0 %
Acc: 76.92
Progress: 14.0 %
Acc: 76.98
Progress: 15.0 %
Acc: 77.58
Progress: 16.0 %
Acc: 78.03999999999999
Progress: 17.0 %
Acc: 78.14
Progress: 18.0 %
Acc: 78.46
Progress: 19.0 %
Acc: 78.74
Progress: 20.0 %
Acc: 78.58000000000001
Progress: 21.0 %
Acc: 79.0
Progress: 22.0 %
Acc: 78.86
Progress: 23.0 %
Acc: 78.97999999999999
Progress: 24.0 %
Acc: 79.47999999999999
Progress: 25.0 %
Acc: 79.78
Progress: 26.0 %
Acc: 79.82000000000001
Progress: 27.0 %
Acc: 79.9
Progress: 28.0 %
Acc: 79.96
Progress: 29.0 %
Acc: 79.96
Progress: 30.0 %
Acc: 80.16
Progress: 31.0 %
Acc: 80.02
Progress: 

84.48

Para el segundo caso se nota una gran mejora. El accuracy llego hasta 85%. Notemos que los embeddings se generaron para la base de datos.

Conclusiones:

Es importante usar las mejores características.

Los embeddings entrenados son una gran herramienta a tener en cuenta en Sentiment Analisis