# Evaluación

![image info](https://raw.githubusercontent.com/albahnsen/MIAD_ML_and_NLP/main/images/moviegenre.png)

# Cargue de Librerías

In [2]:
!pip install unidecode
!pip install nltk

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [3]:
# Library imports
import pandas as pd
import numpy as np
import re
from unidecode import unidecode
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer shared by the preprocessing step.
snow_stemmer = SnowballStemmer('english')
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
# Global plotting defaults: 10x10 figures drawn with the "ggplot" style.
plt.rcParams['figure.figsize'] = [10, 10]
plt.style.use("ggplot")
# Download the small English spaCy model (done on every run) and load it as
# the shared `nlp` pipeline.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Carga de la base de datos

In [4]:
# Load the training set straight from the course repository. The first CSV
# column is only a row label: it is read as the index and then discarded in
# favour of a clean 0..n-1 index. `drop=True` does that in one step and does
# not depend on the temporary column happening to be called "index".
dt = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    encoding='UTF-8',
    index_col=0,
).reset_index(drop=True)

# Visualización de la base de datos

In [5]:
# Preview the first rows of the freshly loaded table.
dt.head()

Unnamed: 0,year,title,plot,genres,rating
0,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
1,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
2,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
3,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
4,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


## Tamaño de la  base de datos

In [6]:
# (number of rows, number of columns) of the loaded table.
dt.shape

(7895, 5)

Se descartan las variables que no aportan al proyecto

In [7]:
# Keep only the model input (plot) and the target (genres). The explicit
# .copy() makes `dt` an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
dt = dt[["plot", "genres"]].copy()
dt.head()

Unnamed: 0,plot,genres
0,most is the story of a single father who takes...,"['Short', 'Drama']"
1,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']"
2,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']"
3,"in a friday afternoon in new york , the presi...",['Drama']
4,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']"


# Tipos de variables

In [8]:
# Data type of each remaining column.
dt.dtypes

plot      object
genres    object
dtype: object

# Peso de la base de datos

In [9]:
import sys
# Report the size in bytes that sys.getsizeof gives for the DataFrame object.
print("La base de datos tiene un peso de ",sys.getsizeof(dt)," bytes")

La base de datos tiene un peso de  7045029  bytes


# Cantidad de valores nulos por columna en la base de datos

In [10]:
# Number of missing values in each column.
dt.isnull().sum()

plot      0
genres    0
dtype: int64

## Preprocesamiento

In [11]:
def preprocess(text,min_len=2, max_len=23):
    """Turn a raw plot into a space-separated string of lowercase stems.

    The text is transliterated with ``unidecode`` and parsed with the shared
    ``nlp`` pipeline. Stop-word tokens and tokens whose length falls outside
    ``[min_len, max_len]`` are discarded; each surviving token's lemma is
    stemmed with ``snow_stemmer``. The stems are joined with single spaces,
    lowercased, stripped of every character other than ``a-z`` and space, and
    runs of whitespace are collapsed.

    Args:
        text: Raw plot text.
        min_len: Minimum token length (inclusive) to keep.
        max_len: Maximum token length (inclusive) to keep.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    parsed = nlp(unidecode(text))

    # Stop-word and length filters are applied to the token itself; the
    # lemma is only taken (and stemmed) for tokens that pass both.
    stems = [
        snow_stemmer.stem(tok.lemma_)
        for tok in parsed
        if not tok.is_stop and min_len <= len(tok) <= max_len
    ]

    lowered = " ".join(stems).lower()
    # Drop everything that is not a lowercase ASCII letter or a space ...
    letters_only = re.sub(r"[^a-z ]", "", lowered)
    # ... then squeeze the gaps left behind by the removed characters.
    return re.sub(r"\s{2,}", " ", letters_only).strip()
# Clean every plot with preprocess(), then run the cleaned strings through the
# spaCy pipeline again with n_process=-1 and keep the resulting docs in a list.
# NOTE(review): the following cell only reads `.text` from these docs, so this
# second parse looks redundant — confirm before simplifying.
prepo_text=list(nlp.pipe(dt["plot"].apply(preprocess).tolist(), n_process=-1))

In [12]:
# Store the text of each parsed document as the cleaned-plot column.
dt['preprocessed_plot'] = [doc.text for doc in prepo_text]

#  Preprocesamiento Variable objetivo

In [13]:
def _strip_list_markup(raw_genres):
    """Remove the brackets and single quotes from a stringified genre list."""
    for mark in ('[', ']', "\'"):
        raw_genres = raw_genres.replace(mark, '')
    return raw_genres

# "['Short', 'Drama']" -> "Short, Drama"
dt["genres"] = dt["genres"].apply(_strip_list_markup)
dt.head()

Unnamed: 0,plot,genres,preprocessed_plot
0,most is the story of a single father who takes...,"Short, Drama",stori singl father take year old son work rail...
1,a serial killer decides to teach the secrets o...,"Comedy, Crime, Horror",serial killer decid teach secret satisfi caree...
2,"in sweden , a female blackmailer with a disfi...","Drama, Film-Noir, Thriller",sweden femal blackmail disfigur facial scar me...
3,"in a friday afternoon in new york , the presi...",Drama,friday afternoon new york presid tredway corpo...
4,"in los angeles , the editor of a publishing h...","Action, Crime, Thriller",los angel editor publish hous carol hunnicut g...


In [14]:
# One 0/1 indicator column per genre, splitting the label string on ", ".
genres_df = dt['genres'].str.get_dummies(sep=', ')
# `data` is the full frame (text columns included) plus the indicator columns.
# The text columns are kept on purpose: a later cell reads `data.genres`.
# An intermediate `data = dt.drop(...)` used to sit here, but it was dead code:
# the concat reads `dt`, not `data`, and overwrote it immediately.
data = pd.concat([dt, genres_df], axis=1)
data.head()

Unnamed: 0,plot,genres,preprocessed_plot,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,most is the story of a single father who takes...,"Short, Drama",stori singl father take year old son work rail...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,a serial killer decides to teach the secrets o...,"Comedy, Crime, Horror",serial killer decid teach secret satisfi caree...,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"in sweden , a female blackmailer with a disfi...","Drama, Film-Noir, Thriller",sweden femal blackmail disfigur facial scar me...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,"in a friday afternoon in new york , the presi...",Drama,friday afternoon new york presid tredway corpo...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"in los angeles , the editor of a publishing h...","Action, Crime, Thriller",los angel editor publish hous carol hunnicut g...,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Embedding de la variable plot con limpieza de tokens

In [15]:
# TF-IDF representation of the cleaned plots. Terms present in fewer than 3%
# or more than 95% of the documents are ignored (min_df / max_df) and term
# frequencies are log-scaled (sublinear_tf).
# NOTE(review): the vocabulary and IDF weights are fitted on all rows, before
# the train/test split made later — confirm this is acceptable for evaluation.
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.95, min_df=0.03)
# fit_transform learns the vocabulary and vectorises in a single pass instead
# of tokenising the whole corpus twice (fit, then transform).
features = vect.fit_transform(dt.preprocessed_plot).toarray()
display(vect)

display(features.shape)


(7895, 280)

# Generación de pesos de las clases según su frecuencia

In [17]:
# Expand the multi-label rows into one (feature vector, genre) pair per genre
# a movie carries, skipping the "News" genre.
emb, target = [], []
for plot_vector, genre_string in zip(features, data.genres):
    for genre in genre_string.split(', '):
        if genre == "News":
            continue
        emb.append(plot_vector)
        target.append(genre)

emb = np.array(emb)
target = np.array(target)
print(emb.shape, target.shape)

(21569, 280) (21569,)


In [18]:
from sklearn.utils import class_weight

# Distinct genre names, sorted; the weights below follow this same order.
genre_names = np.unique(target)
# "balanced" weights: rarer genres receive larger weights.
classes_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=genre_names, y=target
)
dict(zip(genre_names, classes_weights))

{'Action': 0.7197103673796256,
 'Adventure': 0.9158033288043478,
 'Animation': 3.60685618729097,
 'Biography': 2.5141624898006762,
 'Comedy': 0.3078734762625253,
 'Crime': 0.6480874973708722,
 'Documentary': 2.2381446508249456,
 'Drama': 0.23651515982235868,
 'Family': 1.3750478133367334,
 'Fantasy': 1.3264251891027612,
 'Film-Noir': 5.58203933747412,
 'History': 3.4351011307533046,
 'Horror': 0.983000638045757,
 'Music': 2.750095626673467,
 'Musical': 3.4604524306112627,
 'Mystery': 1.2355502090851807,
 'Romance': 0.4956567699237062,
 'Sci-Fi': 1.2970713813217873,
 'Short': 10.193289224952741,
 'Sport': 3.5930368149258705,
 'Thriller': 0.4633313284069428,
 'War': 2.694777611194403,
 'Western': 3.9568886442854523}

In [19]:
# One-hot encode the expanded genre labels: `gen` keeps the column (genre)
# order and `target` is replaced by the encoded matrix as a NumPy array.
genre_dummies = pd.get_dummies(target)
gen = genre_dummies.columns
target = genre_dummies.to_numpy()

# Separación de la base de datos en entrenamiento y testeo

In [35]:
# Multi-hot label matrix: every genre indicator column except "News".
label_matrix = genres_df.drop(["News"],axis=1).to_numpy()

# Hold out 15% of the rows for testing; the split is seeded and not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label_matrix,
    test_size=0.15,
    random_state=42,
)

# Configuración y cargue de librerías de Deep Learning

In [21]:
import locale
# Replace locale.getpreferredencoding with a stub that always answers "UTF-8".
# NOTE(review): presumably a workaround so the shell command below can run in
# this environment — confirm it is still needed.
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
# Install (or upgrade to) the latest keras-tuner release.
!pip install keras-tuner --upgrade

Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from keras_tuner import HyperModel, RandomSearch
import keras_tuner

In [36]:
# Batch size chosen so that one epoch is roughly 64 training steps; never
# allow it to reach 0 when the training set has fewer than 64 rows.
batch_size = max(1, X_train.shape[0] // 64)

# Training pipeline. A single shuffle with a buffer as large as the dataset
# already yields a uniform permutation (re-drawn every epoch by default), so
# the second shuffle that used to follow it added nothing. The incomplete last
# batch is dropped so every training step sees the same batch size.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(len(X_train))
    .batch(batch_size, drop_remainder=True)
)

# Evaluation pipeline. The final partial batch is kept so that no held-out
# sample is silently discarded.
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

# Definición de la función de pérdida acorde a un problema de clasificación desbalanceado

In [37]:
class WeightedCrossEntropy(tf.keras.losses.Loss):
  """Binary cross-entropy whose positive term is scaled by a class weight.

  For every label the element-wise loss is
  ``-(weight * y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))``.

  Args:
    weight: Scalar or array broadcastable against the predictions; it
      multiplies the positive (``y_true == 1``) term only.
    epsilon: Predictions are clipped to ``[epsilon, 1 - epsilon]`` before the
      logarithms are taken, which keeps them finite.
    name: Name of the loss instance.
    **kwargs: Forwarded to ``tf.keras.losses.Loss`` (e.g. ``reduction``).
  """

  def __init__(self, weight, epsilon=1e-7, name="weighted_cross_entropy", **kwargs):
    super().__init__(name=name, **kwargs)
    self.weight = weight
    self.epsilon = epsilon

  def call(self, y_true, y_pred):
    """Return one loss value per sample (mean over the label axis).

    Keeping the batch axis lets the base class apply ``sample_weight`` and the
    configured ``reduction``; with the default reduction and equally sized
    rows the batch loss equals the mean over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_pred = tf.clip_by_value(y_pred, self.epsilon, 1 - self.epsilon)
    # Work in the dtype of the predictions so targets, weights and outputs
    # always agree (the weights may arrive as a float64 NumPy array).
    y_true = tf.cast(y_true, y_pred.dtype)
    weight = tf.cast(self.weight, y_pred.dtype)
    loss = -(weight * y_true * tf.math.log(y_pred) + (1 - y_true) * tf.math.log(1 - y_pred))
    return tf.math.reduce_mean(loss, axis=-1)

  def get_config(self):
    """Return a serialisable config that ``from_config`` can rebuild from."""
    weight = self.weight
    # NumPy arrays/scalars are not JSON-serialisable; store plain Python values.
    if hasattr(weight, "tolist"):
      weight = weight.tolist()
    config = {
      'weight': weight,
      'epsilon': self.epsilon
      }
    base_config = super().get_config()
    return {**base_config, **config}

# Arquitectura de la red neuronal

In [38]:
# Positive-class weights per genre, taken from the class-frequency weights.
loss_fn = WeightedCrossEntropy(weight=classes_weights)

# Layer sizes are derived from the data instead of being hard-coded.
n_features = X_train.shape[1]
n_genres = y_train.shape[1]

model = keras.Sequential()
# `input_shape` must not include the batch axis: each sample is one flat
# feature vector. The former (None, n_features) declared an extra leading axis
# that the 2-D training batches do not have.
# NOTE(review): this hidden layer has no activation, so it is linear — confirm
# that this is intended.
model.add(Dense(units=328, input_shape=(n_features,)))
model.add(Dropout(0.3))
# One independent sigmoid output per genre (multi-label prediction).
model.add(Dense(n_genres, activation='sigmoid'))
model.compile(optimizer=Adam(0.001),
              loss=loss_fn,
              metrics=["accuracy"])


# Entrenamiento de modelo con los mejores hiperparámetros

In [39]:
# `cnn` is a second name for the same `model` object (no copy is made); the
# following cells train and evaluate the network through this name.
cnn = model


In [42]:
# `train_dataset` is already batched, so no `batch_size` is passed: for
# tf.data.Dataset inputs Keras expects batching to come from the dataset itself.
hist = cnn.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluación ROC AUC

In [43]:
from sklearn.metrics import roc_auc_score

# Per-genre scores predicted for the held-out rows, then the macro-averaged
# ROC AUC of those scores against the true multi-hot labels.
test_scores = cnn.predict(X_test)
roc_auc_score(y_test, test_scores, average="macro", multi_class='ovr')



0.804748916581757

# Reporte de Clasificación

In [44]:
from sklearn.metrics import classification_report
predictions = cnn.predict(X_test)
# Take the label names from the same indicator columns that produced y_test
# (every genre except "News"), so names and label columns cannot drift apart
# the way a hand-maintained list can.
generos = genres_df.columns.drop("News").tolist()
# A genre is predicted when its score exceeds 0.5. zero_division=0 keeps the
# value that is reported for undefined precision/recall (0) while silencing
# the UndefinedMetricWarning.
print(classification_report(y_test, (predictions > 0.5), target_names=generos, zero_division=0))

              precision    recall  f1-score   support

      Action       0.65      0.16      0.25       198
   Adventure       0.76      0.13      0.23       164
   Animation       0.37      0.20      0.26        49
   Biography       0.29      0.06      0.10        68
      Comedy       0.66      0.05      0.10       459
       Crime       0.71      0.31      0.43       212
 Documentary       0.38      0.31      0.34        55
       Drama       0.79      0.09      0.15       584
      Family       0.55      0.14      0.22       124
     Fantasy       0.47      0.06      0.11       110
   Film-Noir       0.30      0.17      0.22        35
     History       0.23      0.17      0.19        36
      Horror       0.53      0.18      0.27       130
       Music       0.24      0.06      0.10        64
     Musical       0.16      0.08      0.10        39
     Mystery       0.51      0.18      0.27       105
     Romance       0.75      0.23      0.36       291
      Sci-Fi       0.65    

  _warn_prf(average, modifier, msg_start, len(result))
