In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files referenced through root_path can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Directory that holds the dataset files. The trailing slash is required
# because file names are appended to it by plain string concatenation.
root_path = "/content/drive/MyDrive/Pizza request/"

In [3]:
# Imports
import csv
import random
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
from gensim.models import Word2Vec, Phrases
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

#import utils 
#import os

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
# Read the training dataset: a tab-separated file whose first column is the
# integer label and whose second column is the request text.
# The file is opened in a `with` block so the handle is closed once the rows
# have been read; newline='' is what the csv module expects of file objects.
with open(root_path + "train.csv", newline='') as train_handle:
    trainfile = csv.reader(train_handle, delimiter='\t')
    trainrows = np.array(list(trainfile))

row_count_train, column_count = trainrows.shape
T_train = [int(c) for c in trainrows[:, 0]]  # labels as a plain list of ints
P_train = trainrows[:, 1]                    # raw texts

In [5]:
# Text preprocessing
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
# Set copy of the stop-word list: membership is tested once per token, and a
# set lookup avoids scanning the whole list each time.
stopword_set = set(stopwords)

# Stop-word removal and lemmatization of the terms, done in a single pass:
#   1. lowercase the text and replace every non-letter character with a space,
#   2. split on whitespace,
#   3. drop stop words and lemmatize the remaining tokens.
P_train = [
    [
        lemmatizer.lemmatize(token)
        for token in re.sub("[^a-zA-Z]", " ", text.lower()).split()
        if token not in stopword_set
    ]
    for text in P_train
]

In [None]:
# Build the word2vec model. Within this cell only its vocabulary (words that
# occur at least `min_count` times) is used.
embedding_size = 50
model = Word2Vec(sentences=P_train, size=embedding_size, min_count=3, window=5)

# Convert each document into a list of vocabulary indices.
vocab = model.wv.vocab
keys = list(vocab.keys())
# Word -> position in `keys`. Gives the same index as keys.index(word) but
# without a linear scan of the vocabulary for every token.
word_to_index = {word: position for position, word in enumerate(keys)}


def filter_unknown(word):
    """Return True when `word` is present in the word2vec vocabulary."""
    return vocab.get(word, None) is not None


def encode(review):
    """Map a list of tokens to vocabulary indices, dropping unknown tokens."""
    return [word_to_index[word] for word in review if word in word_to_index]


word_vector = [encode(review) for review in P_train]

# Sequence length used for padding/truncation (other values tried: 20, 150).
# NOTE(review): this reuses embedding_size although sequence length and
# embedding dimension are unrelated quantities - confirm this is intended.
input_length = embedding_size
# Turn the ragged index lists into a matrix in which every row has the same length.
# NOTE(review): padding presumably uses pad_sequences' default fill value 0,
# which is also the index of the first vocabulary word - confirm whether
# indices should be shifted by one.
X = pad_sequences(sequences=word_vector, maxlen=input_length, padding='post')
Y = np.array(T_train)

In [None]:
# Check for class imbalance across all training labels
total = len(T_train)
zeros, ones = (T_train.count(label) for label in (0, 1))
print(zeros,ones, total)

2098 660 2758


In [None]:
# Split into training and validation sets: 33% held out, stratified on the
# labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
    stratify=Y,
)

In [None]:
# Display the training-split feature matrix as the cell output.
x_train

array([[2199,   61,   54, ...,  185,  845,  317],
       [1114,  506, 1217, ...,    0,    0,    0],
       [ 634, 2156, 3069, ...,    0,    0,    0],
       ...,
       [ 255,  256,   26, ...,    0,    0,    0],
       [ 348,  349,  234, ...,    0,    0,    0],
       [ 217,  535,  592, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Per-class sample counts in the training split
unique, counts = np.unique(y_train, return_counts=True)
values = dict(zip(unique, counts))

# Check for class imbalance
zeros, ones = values[0], values[1]
total = zeros + ones
print(zeros,ones, total)

# Weight each class inversely to its frequency.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / zeros) * total / 2.0
weight_for_1 = (1 / ones) * total / 2.0
class_weight = {0: weight_for_0, 1: weight_for_1}

for message, weight in (
    ('Weight for class 0: {:.2f}', weight_for_0),
    ('Weight for class 1: {:.2f}', weight_for_1),
):
    print(message.format(weight))

1405 442 1847
Weight for class 0: 0.66
Weight for class 1: 2.09


## Train y validación

In [None]:
# Fit a logistic regression with class_weight='balanced' on the training
# split and report its mean accuracy on the held-out split.
# (LogisticRegression is imported in the notebook's import cell.)
classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
classifier.fit(x_train, y_train)
score = classifier.score(x_test, y_test)

print("Accuracy:", score)

Accuracy: 0.6081229418221734


In [None]:
# Predicted labels for the held-out validation split
y_pred = classifier.predict(X=x_test)

In [None]:
# Macro-averaged F1 on the validation split
rdo_logistica = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(f"f1score: {rdo_logistica}")

f1score: 0.5145293250995266


## Entrenamiento con todos los datos

In [None]:
# Refit the class-balanced logistic regression on the complete training set
# (training and validation splits together).
classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
classifier.fit(X, Y)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Test

In [None]:
# Read the test dataset: a tab-separated file whose first column is the
# integer label and whose second column is the request text.
# The file is opened in a `with` block so the handle is closed once the rows
# have been read; newline='' is what the csv module expects of file objects.
with open(root_path + "test.csv", newline='') as test_handle:
    testfile = csv.reader(test_handle, delimiter='\t')
    testrows = np.array(list(testfile))

row_count_test, column_count = testrows.shape
T_test = [int(c) for c in testrows[:, 0]]  # labels as a plain list of ints
P_test = testrows[:, 1]                    # raw texts

In [None]:
# Stop-word removal and lemmatization of the test texts, done in a single pass:
# lowercase, replace non-letter characters with spaces, split on whitespace,
# drop stop words and lemmatize what remains.
# A set copy of the stop-word list keeps the per-token membership test cheap.
stopword_set = set(stopwords)
P_test = [
    [
        lemmatizer.lemmatize(token)
        for token in re.sub("[^a-zA-Z]", " ", text.lower()).split()
        if token not in stopword_set
    ]
    for text in P_test
]

In [None]:
# Test preprocessing: encode the test documents with the same `encode`
# mapping that was used for the training documents.
word_vector = [encode(tokens) for tokens in P_test]

# Labels as an array, and the index lists padded/truncated into a matrix in
# which every row has the same length.
y_test_real = np.array(T_test)
x_test_real = pad_sequences(word_vector, maxlen=input_length, padding='post')

In [None]:
# Predicted labels for the test set
y_pred_test = classifier.predict(X=x_test_real)

In [None]:
# F1 on the test set under the three averaging schemes
rdo_logistica_test_macro, rdo_logistica_test_micro, rdo_logistica_test_weighted = (
    f1_score(y_test_real, y_pred_test, average=scheme)
    for scheme in ("macro", "micro", "weighted")
)
print(f"f1score_test_macro: {rdo_logistica_test_macro}")
print(f"f1score_test_micro: {rdo_logistica_test_micro}")
print(f"f1score_test_weighted: {rdo_logistica_test_weighted}")

f1score_test_macro: 0.5465321262416698
f1score_test_micro: 0.6193243734108246
f1score_test_weighted: 0.6337112364052722
