# El modelo

![img](images/ml.jpg)

***El objetivo es crear un clasificador binario de noticias que pueda diferenciar las que tienen relación con la música de las que no.***

In [None]:
# Importo las librerias que voy a utilizar en este notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import keras

In [None]:
# Load the modelling dataset, using the first CSV column as the index.
dataset_path = '../elmundo/input/df_model.csv'
df = pd.read_csv(dataset_path, index_col=[0])

# Show five random rows as a quick sanity check of what was loaded.
df.sample(n=5)

In [None]:
# Pull the model input and the target out of the frame as arrays:
# inputs come from the `stems` column, targets from the `Label` column.
y = df['Label'].values
frases = df['stems'].values

In [None]:
# Hold out 15% of the samples as the test set; the seed is fixed so the
# partition is the same on every run.
(frases_train, frases_test,
 y_train, y_test) = train_test_split(
    frases,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Turn the stems into count vectors.
# The vocabulary is fitted on the training texts only; both partitions are
# then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer().fit(frases_train)

X_train, X_test = (
    vectorizer.transform(texts) for texts in (frases_train, frases_test)
)

In [None]:
# Carve 5% of the training partition off as a validation set.
# NOTE: this rebinds X_train / y_train, so running the cell twice shrinks the
# training set a second time; re-run from the train/test split cell first.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.05, random_state=42
)

In [None]:
# Neural network for binary classification: one hidden layer of 30 sigmoid
# units followed by a single sigmoid output unit.

input_dim = X_train.shape[1]  # one input feature per vectorizer column
model = keras.models.Sequential([
    keras.layers.Dense(input_dim=input_dim,
                       units=30,
                       activation='sigmoid'),

    keras.layers.Dense(units=1, activation='sigmoid'),
])

opt = keras.optimizers.Adam(learning_rate=0.001)
# `metrics` has to be a list (or tuple/dict): a bare string is rejected by
# recent Keras releases.
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Stop training after 3 epochs without improvement and roll the model back to
# the best weights seen so far.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,
                                                  restore_best_weights=True)

# `callbacks` is documented as a list of callback instances.
history = model.fit(X_train,
                    y_train,
                    epochs=60,
                    batch_size=128,
                    callbacks=[early_stopping_cb],
                    validation_data=(X_valid, y_valid))

# Learning curves: every quantity recorded in the training history, per epoch.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid()
plt.show()

# Final accuracy on the data the model was trained on and on the held-out test set.
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy:  {:.4f}".format(accuracy))


In [None]:
# Convert the network outputs into hard 0/1 labels so they can be compared
# with y_test: anything strictly above 0.5 counts as class 1.
predict = model.predict(X_test)
y_pred = [1 if row[0] > 0.5 else 0 for row in predict]

In [None]:
# Per-class metrics of the neural network on the test set.
nn_report = classification_report(y_true=y_test, y_pred=y_pred)
print(nn_report)

In [None]:
# Persist the trained Keras model to an .h5 file next to the input data.
model.save('../elmundo/input/clasificacion_musica.h5')

##### Acertamos 3 de cada 4.

## Logistic Regression

In [None]:
# Try a logistic regression as an alternative to the network, in case it shows
# something different. The grid search covers the penalty type, the
# regularisation strength C and the iteration cap, all with the liblinear solver.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    solver=['liblinear'],
    max_iter=[50, 100, 300],
)
classifier = LogisticRegression()
grid = GridSearchCV(estimator=classifier, param_grid=params)
grid.fit(X_train, y_train)


In [None]:
# Show the best logistic-regression estimator selected by the grid search.
grid.best_estimator_

In [None]:
# Show the best score reached by the logistic-regression grid search.
grid.best_score_

In [None]:
# Evaluate the tuned logistic regression on the held-out test set.
# classification_report expects (y_true, y_pred) in that order; passing them
# the other way round swaps precision with recall and takes the support counts
# from the predictions instead of from the real labels.
log_y_pred = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, log_y_pred))

La regresión logística se acerca mucho a los resultados de la red neuronal.

## Random Forest

In [None]:
%%time
rf = RandomForestClassifier()
params = {
    'n_estimators': [50,100,300,500],
    'max_depth': [2,3,4],
    'max_leaf_nodes':[5,10,20,50,100],
}

grid2 = GridSearchCV(estimator=rf,param_grid=params, cv=4)
grid2.fit(X_train, y_train)

In [None]:
# Best score and winning hyper-parameters of the random-forest search,
# one per line.
print(grid2.best_score_, grid2.best_params_, sep='\n')

In [None]:
# Random Forest performs worse than both the logistic regression and the neural network.

## SVC.

In [None]:
svc = SVC()

# `degree` is only used by the polynomial kernel, so it is searched for 'poly'
# alone. Crossing it with 'linear' and 'rbf' would refit the same model once
# per degree value without changing its score.
c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
params_grid = [
    {'C': c_values, 'kernel': ['linear', 'rbf']},
    {'C': c_values, 'kernel': ['poly'], 'degree': [2, 3, 4, 5]},
]
grid3 = GridSearchCV(estimator=svc,
                     param_grid=params_grid,
                     cv=4)
grid3.fit(X_train, y_train)

In [None]:
# Best score and winning hyper-parameters of the SVC search, one per line.
print(grid3.best_score_, grid3.best_params_, sep='\n')

In [None]:
# Evaluate the tuned SVC on the held-out test set.
# classification_report expects (y_true, y_pred) in that order; passing them
# the other way round swaps precision with recall and takes the support counts
# from the predictions instead of from the real labels.
y_pred_svc = grid3.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred_svc))

## Resultados

Hemos conseguido un score de aproximadamente un 75 %, es decir, acertamos 3 de cada 4 noticias con un clasificador binario sobre texto. No son unos resultados muy buenos, pero en ellos pueden influir los términos afectados por el cambio de idioma, igual que también podrían influir
las categorías asignadas por el modelo de HuggingFace.