In [1]:
# Importamos las librerías

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [4]:
#import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Luli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Luli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
# Constitución del Corpus

#Set Random seed
np.random.seed(500)

# Add the Data using pandas
Corpus = pd.read_csv(r"corpus.csv",encoding='latin-1')

print(Corpus)


                                                   text        label
0      Stuning even for the non-gamer: This sound tr...  __label__2 
1      The best soundtrack ever to anything.: I'm re...  __label__2 
2      Amazing!: This soundtrack is my favorite musi...  __label__2 
3      Excellent Soundtrack: I truly like this sound...  __label__2 
4      Remember, Pull Your Jaw Off The Floor After H...  __label__2 
...                                                 ...          ...
9995   A revelation of life in small town America in...  __label__2 
9996   Great biography of a very interesting journal...  __label__2 
9997   Interesting Subject; Poor Presentation: You'd...  __label__1 
9998   Don't buy: The box looked used and it is obvi...  __label__1 
9999   Beautiful Pen and Fast Delivery.: The pen was...  __label__2 

[10000 rows x 2 columns]


In [5]:
# Preprocesamiento

# Paso - 1a : Elimimamos lineas vacías.
Corpus['text'].dropna(inplace=True)

# Paso - 1b : Conversión a minúsculas
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

# Paso - 1c : Tokenization
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

# Paso - 1d : Eliminamos Stop words, Stemming y Lematización.

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index,entry in enumerate(Corpus['text']):
    # Inicialización de Final_words vacía
    Final_words = []
    # Inicializacion de WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag
    for word, tag in pos_tag(entry):
        # Eliminamos Stop words y caracteres no alfabéticos
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # El texto preprocesado se almacena en 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

print(Corpus['text_final'].head())

0    ['stun', 'even', 'sound', 'track', 'beautiful'...
1    ['best', 'soundtrack', 'ever', 'anything', 're...
2    ['amaze', 'soundtrack', 'favorite', 'music', '...
3    ['excellent', 'soundtrack', 'truly', 'like', '...
4    ['remember', 'pull', 'jaw', 'floor', 'hear', '...
Name: text_final, dtype: object


In [6]:
# División en conjunto de entrenamiento y test

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)


In [7]:
# Codificación de etiquetas (categorías)
# Las categorías tienen una etiqueta tipo string, pero los algoritmos de clasifiación necesitan un valor numérico

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [8]:
# Vectorización: consitución de matrices tf-idf a partir de los subconjuntos de entrenamiento y test

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


In [9]:
# Generación de los modelos

# Algoritmo - Naive Bayes
# Se añade al clasificador el subconjunto de entrenamiento
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# Se generan las predicciones
predictions_NB = Naive.predict(Test_X_Tfidf)

# Y se obtiene la precisión del modelo 
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Confusion Matrix:")
print(confusion_matrix(Test_Y,predictions_NB))

# Algoritmo - SVM
# Se añade al clasificador el subconjunto de entrenamiento
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# Se generan las predicciones
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Y se obtiene la precisión del modelo 
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM Matrix:")
print(confusion_matrix(Test_Y,predictions_SVM))

Naive Bayes Accuracy Score ->  83.23333333333333
Naive Bayes Confusion Matrix:
[[1312  226]
 [ 277 1185]]
SVM Accuracy Score ->  84.66666666666667
SVM Matrix:
[[1302  236]
 [ 224 1238]]
