In [1]:
import numpy as np
import pandas as pd

In [2]:
#lecture des donnees à partir fichier txt 
data = []
file = open("imdb_labelled.txt", "r")
for review in file:
    x = review.strip().split('\t')  
    data.append([x[0], int(x[1])]) 

In [3]:
#reprisentation sous forme un tableau
df = pd.DataFrame(data, columns=['review', 'sentiment']) 
df.head(10)

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [4]:
# description du tableau: 1000 lignes, 2 colonnes
df.shape

(1000, 2)

In [5]:
#tableau contient 500 review negatifs et 500 positifs
df.sentiment.value_counts()

0    500
1    500
Name: sentiment, dtype: int64

In [6]:
import nltk #Natural Language Toolkit
import re #regex
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

ps=nltk.stem.porter.PorterStemmer()

revs = df.review.copy() #liste des phrases
senti=df.sentiment.copy() #liste des sentiments

i=0
positiveTokens= [] # tokens du review  positif
negativeTokens= [] # tokens du review  negatif

#Exploration et tokeniser un review
def tokenize(phrase): 
    phrase = re.sub('[^a-zA-Z]', ' ', phrase).lower().split() #supprimer les caractères speciaux
    phrase = [nltk.WordNetLemmatizer().lemmatize(word) for word in phrase if not word in stopwords.words('english')] #lemmatizing
    phrase = ' '.join(phrase)
    phrase=word_tokenize(phrase) #tokenizing (phrase to array of words)
    return phrase

In [7]:
#separerles  positifs et negatifs tokens
for rev in revs:    
    if senti[i]==0:
        negativeTokens.append(tokenize(rev)) #tableau de negatif tokens
    else:
        positiveTokens.append(tokenize(rev)) #tableau de positif tokens
    i+=1

positiveTokens=(np.concatenate((positiveTokens), axis=0)) #  list de tout les positifs tokens
negativeTokens=(np.concatenate((negativeTokens), axis=0)) # list de tout les negatifs tokens

In [8]:
WordFreq=[]

#calculater frequence d'un mot (Number of occurences of a word in Tokens)
def wordFrequency(word,array): 
    wordFreq=np.count_nonzero(array==word)
    return wordFreq

#--------------------- VISUALISATION-------------------------
#create dataframe of [word,posFreq,negFreq]
for word in list(set(np.concatenate((positiveTokens,negativeTokens), axis=0))):
    WordFreq.append([word,wordFrequency(word,positiveTokens),wordFrequency(word,negativeTokens)]) #[word,posFreq,negFreq]
wordFreqDF = pd.DataFrame(WordFreq, columns=['word','positive Freq', 'negative Freq'])
wordFreqDF.head()
#---------------------------------------------------------------

Unnamed: 0,word,positive Freq,negative Freq
0,circumstance,1,0
1,though,7,4
2,thoroughly,3,0
3,escalating,1,0
4,cutting,1,0


In [9]:
DataSet=[]
  
#calculate row of dataset [review,PosF,NegF,sentiment]
def phraseFreq(phrase,sentiment):  
    Posfreq=0
    Negfreq=0
    for word in tokenize(phrase):
        Posfreq+=wordFrequency(word,positiveTokens) #la somme des frequences positifs
        Negfreq+=wordFrequency(word,negativeTokens) #la somme des frequences negatifs

    return [phrase,Posfreq,Negfreq,sentiment]


#convert review(input) to vector(PosF,NegF)
def review2vec(review):
    Posfreq=0
    Negfreq=0
    for word in tokenize(phrase):
        Posfreq+=wordFrequency(word,positiveTokens)
        Negfreq+=wordFrequency(word,negativeTokens)
    return [Posfreq,Negfreq]


def createDataSet():
    i=0
    for rev in revs:
        DataSet.append(phraseFreq(rev,senti[i]))
        i+=1

createDataSet()

DataSet=pd.DataFrame(DataSet, columns=['review','PosF', 'NegF','sentiment'])
DataSet.head(100) 


Unnamed: 0,review,PosF,NegF,sentiment
0,"A very, very, very slow-moving, aimless movie ...",117,124,0
1,Not sure who was more lost - the flat characte...,42,40,0
2,Attempting artiness with black & white and cle...,166,248,0
3,Very little music or anything to speak of.,19,23,0
4,The best scene in the movie was when Gerardo i...,160,145,1
...,...,...,...,...
95,MANNA FROM HEAVEN is a terrific film that is b...,150,105,1
96,The scenes are often funny and occasionally to...,89,59,1
97,The cast of veteran actors are more than just ...,36,14,1
98,Ursula Burton's portrayal of the nun is both t...,62,34,1


In [10]:
# Import sklearn 
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
from sklearn import datasets
import seaborn as sns  # Bibliothèque pour la visualisation des données

In [11]:
# Données + classes cibles
data   = np.array(DataSet.values[:,1:3], dtype=np.float32)
target = DataSet.values[:,-1]
print(data[0],target[0])

[117. 124.] 0


In [12]:
#Partition aléatoire de l’échantillon
# 10%=100 exemples pour le test
(trainX, testX, trainY, testY) = train_test_split(data, target, test_size=0.1)

len(testY)



100

In [13]:
# Transformer l'étiquette(sentiments) en un vecteur binaire : 3 --> (0,0,0,1,0,0,0,0,0,0)
trainYC = np.array(list(map(lambda x: [1,0] if x == 1 else [0,1], trainY)))
testYC = np.array(list(map(lambda x: [1,0] if x == 1 else [0,1], testY)))


trainYC

# review => network => [0.82,0.18]

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]])

In [14]:
class MultiLayerPerceptron:
    
    def __init__(self, arch , alpha = 0.1):
        # poids + biais
        self.W = {}
        self.B = {}
        
        # Taux d'adaptation
        self.alpha = alpha
        
        # Architecture :nbre de couches et nombre de neurones par couche
        self.arch = arch
        
        # Initialisation des poids: valeurs issues d'une distribution normale
        for i in np.arange(1,len(self.arch)):  
            # Poids
            w = np.random.randn(self.arch[i], self.arch[i-1])
            self.W[i] = w/np.sqrt(self.arch[i])
            # Bias
            b = np.random.randn(self.arch[i],1)
            self.B[i] = b/np.sqrt(self.arch[i])            
            
            
    def sigmoid(self, x):
        return 1.0/(1 + np.exp(-x))
    
    
    def dsigmoid(self, x): # x correspond ici à sigmoid(uj(t)), voir le cours
        return x * (1 - x)
    
    
    #Calcul et mémorisation de l'état de tous les neurones du réseau 
    def forward_pass(self, x):
        a = np.atleast_2d(x).T
        
        stats = {}
        stats[0] = a
        for layer in np.arange(1, len(self.arch)):
            a = self.sigmoid(np.dot(self.W[layer], a) + self.B[layer])
            stats[layer] = a
        return stats    
    
    
    #Sortie du réseau associée à une entrée X (les états des autres neurones ne sont pas mémorisés)
    def predict(self, X):
        a = np.atleast_2d(X).T
        for layer in np.arange(1, len(self.arch)):
            a = self.sigmoid(np.dot(self.W[layer], a) + self.B[layer])
        return a
    
    
    #Calcul de l'erreur quadratique moyenne
    def quadratic_loss(self, X, Y):
        Y = np.atleast_2d(Y).T
        predictions = self.predict(X)
        n = X.shape[0]
        loss = (1/n) * 0.5 * np.sum((predictions - Y) ** 2) 
        return loss 
    
    
    #Calcul des gradients locaux 
    def compute_gradient(self, x, y):
     
        L = len(self.arch) - 1 # indice de la couche de sortie 
        # Gradients
        Gw = {}
        Gb = {}
        A = self.forward_pass(x)
        # Les vecteurs delta  
        D = {}
        y = np.atleast_2d(y).T
        deltaL = (A[L] - y) * self.dsigmoid(A[L])
        D[L] = deltaL # Pour la sortie 
        
        # Calculer les vecteurs delta des autres couches en utilisants les vecteurs delta de la couche suivante
        for l in np.arange(L-1, 0, -1):
            D[l] = (self.W[l+1].T.dot(D[l+1])) * self.dsigmoid(A[l])
        
        for l in np.arange(L, 0, -1):
            Gb[l] = D[l]
            Gw[l] = D[l].dot(A[l-1].T)        
       
        return (Gw, Gb)
    
    
    # Mise à jour par rapport à l'erreur moyenne (relative à un bloc d'exemples)
    def update_with_bloc(self, bloc):
      
        m = len(bloc)
        # Gradients locaux
        GCw = {}
        GCb = {}
        # Initialiser à zeros 
        for i in np.arange(1,len(self.arch)):
            GCw[i] = np.zeros(self.W[i].shape)
            GCb[i] = np.zeros(self.B[i].shape)
            
        # Calcul des gradients
        for x, y in bloc:
            Gw, Gb = self.compute_gradient(x, y)
            for i in np.arange(1,len(self.arch)): 
                GCw[i] += Gw[i]
                GCb[i] += Gb[i]
                
        # Mettre à jour les poids 
        for l in np.arange(1,len(self.arch)):
            self.W[l] = self.W[l] - (self.alpha/m)*(GCw[l])
            self.B[l] = self.B[l] - (self.alpha/m)*(GCb[l])
    
    
    # Iteration: entrainement en utilisant tous les exemples, un bloc de taille bloc_size chaque fois
    def train(self, D, bloc_size):
        train_size = len(D)
        np.random.shuffle(D) # tirage au sort
        
        # Bloc d'exemples
        blocs = [D[k : k + bloc_size] for k in range(0, train_size, bloc_size)]
        
        for bloc in blocs: # Mise à jour suite au passage de chaque bloc
            self.update_with_bloc(bloc)
  

    # Apprentissage
    def fit(self, X, Y, bloc_size = 20, iterations = 10000, error_min = 0.001, displayPeriod = 5000):
     
        # Exemples avec X et Y Assemblés
        D = list(zip(X,Y))
        
        # Erreurs
        errors = [self.quadratic_loss(X,Y)]   # Erreur initiale    
        
        iter = 0
        print("Itération: {}-{}, Erreur: {:.6f}".format(iter, iterations,errors[iter]))
        while iter < iterations and errors[iter] > error_min: # Tour de boucle 
            
            self.train(D, bloc_size)  # Mettre à jour 
            errors.append(self.quadratic_loss(X,Y))         # Nouvelle erreur
          
            if (iter+1) % displayPeriod == 0:
                print("Itération: {}-{}, Error: {:.6f}".format(iter + 1, iterations,errors[iter]))
            iter += 1
        
        if errors[iter] < error_min: # Erreur inférieur à la valeur minimale
            print("Fin: erreur minimale atteinte : {:.6f}.", errors[iter])
        elif iter == iterations:
            print("Fin: nombre maximum d'itérations atteint.")
       
        return (errors, iter)

In [15]:
# Initialisation et apprentissage
# trainX.shape[1]
# testY
pmc = MultiLayerPerceptron(arch=[trainX.shape[1],15,15,2], alpha=0.1)
(errs, iter_fin) = pmc.fit(trainX, trainYC, iterations=500, bloc_size=5, error_min=0.00001, displayPeriod=20)

Itération: 0-500, Erreur: 0.312079
Itération: 20-500, Error: 0.126998
Itération: 40-500, Error: 0.121924
Itération: 60-500, Error: 0.122278


  return 1.0/(1 + np.exp(-x))


Itération: 80-500, Error: 0.128319
Itération: 100-500, Error: 0.127972
Itération: 120-500, Error: 0.123550
Itération: 140-500, Error: 0.124651
Itération: 160-500, Error: 0.130530
Itération: 180-500, Error: 0.117466
Itération: 200-500, Error: 0.120643
Itération: 220-500, Error: 0.135514
Itération: 240-500, Error: 0.122992
Itération: 260-500, Error: 0.126822
Itération: 280-500, Error: 0.119017
Itération: 300-500, Error: 0.127216
Itération: 320-500, Error: 0.122322
Itération: 340-500, Error: 0.125588
Itération: 360-500, Error: 0.129141
Itération: 380-500, Error: 0.120178
Itération: 400-500, Error: 0.133173
Itération: 420-500, Error: 0.121170
Itération: 440-500, Error: 0.119994
Itération: 460-500, Error: 0.120578
Itération: 480-500, Error: 0.122700
Itération: 500-500, Error: 0.121332
Fin: nombre maximum d'itérations atteint.


In [16]:
# Test pour un exemple 
# data.shape[0]
randIndex = np.random.randint(0,data.shape[0]-1,1)[0]
# print('Exemple : '+str(randIndex)+', classe réelle : '+str(target[randIndex]))
print(testX[7])
print(testY[7])
# # print(data[randIndex])
print('Sortie prédite : \n'+str(pmc.predict(testX[7]))+')' )
# testY

[19. 82.]
0
Sortie prédite : 
[[0.0078542 ]
 [0.99217033]])


In [17]:
targetTestR = ['']*(np.array(testY).shape[0])

# targetTestR
for index in range(testX.shape[0]):     
    o = np.round(pmc.predict(testX[index]),0)[:,0].astype(int)
    if((o==np.array([1,0])).all()):
        targetTestR[index] = 1
    elif((o==np.array([0,1])).all()):
        targetTestR[index] = 0

        
# Sortie calculée et sortie réelle pour la base de test      
targetTestRF=list(map(lambda x: '1' if x == 1 else '0', targetTestR))
# print(targetTestR)
testYF=list(map(lambda x: '1' if x == 1 else '0', testY))
print(testYF)

['1', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '0', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '0', '1', '1', '0', '1', '1', '1', '0', '1', '1', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '0', '0', '1', '0', '0', '1', '0', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0']


In [54]:
from sklearn import metrics
# Taux de la classification correcte 
metrics.accuracy_score(testYF, targetTestRF) 

0.81