# Importar Librerias

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_sm
#!pip install astropy
#!pip install python-math
#!pip install missingno
#!pip install regex

In [9]:
import regex as re
import astropy as ast
import python_math as math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
from wordcloud import WordCloud
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.corpora.textcorpus import strip_multiple_whitespaces

# For word lemmitization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore")

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

# Load Data

In [None]:
movies_training = pd.read_csv("/Users/marioriveravargas/Downloads/Machine-Learning-NLP-main/Competencia/datos/dataTraining.csv", sep="," , index_col=0)
movies_training.head()

In [None]:
movies_training.info()

# Pre-Processing - Label Data

In [None]:
movies_training['genres_process']=movies_training['genres'].apply(lambda x: eval(x))
movies_training

In [None]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(movies_training['genres_process'])
y

In [None]:
multilabel.classes_

In [None]:
tags = pd.DataFrame(y, columns= multilabel.classes_)
tags

In [None]:
data = movies_training[['title', 'plot']].reset_index(drop=True)
data_train = pd.concat([data, tags], axis=1)
data_train.head()


## Tipos de datos - Null Data

In [None]:
data_train.info()

In [None]:
msno.matrix(data_train)


Se verifica que no hay valores perdidos

## Conversión de columnas categóricas

Convertimos todo el conjunto de características de género(salidas) en tipos categóricos.

In [None]:
category = data_train.columns.drop(['title', 'plot'])
for col in tags:
    data_train[col] = data_train[col].astype('category')

data_train.info()

## Exploración Data

In [None]:
sum_genre = pd.DataFrame(tags[data_train.columns.drop(['title', 'plot'])].sum()).reset_index()
sum_genre.columns = ['Genre', 'Total']
sum_genre

In [None]:
f, ax = plt.subplots(1, 1, figsize=(12, 10))
sns.barplot(data=sum_genre, x='Genre', y='Total', axes=ax)
ax.set(ylabel='Número de películas')
plt.title('Número de películas por género', loc='center', fontdict={'fontsize':16})
plt.xticks(rotation=90)
plt.show()


Observaciones

* Las películas de género más bajas son Animation, Film-noir, History, Short and News.
* Las películas de género más altas son Drama (3965 películas), seguidas de Comedia (3046 películas) y Thriller (2024 películas )

## Número de géneros por película

In [None]:
sum_movie = data_train[category].sum(axis=1)
df_genres_per_movie = pd.DataFrame({'title': data_train.title, 'num_genres':sum_movie}).groupby('num_genres').count()
f, ax = plt.subplots(1, 1, figsize=(12, 10))
sns.barplot(data=df_genres_per_movie, x=df_genres_per_movie.index, y='title', axes=ax)
ax.set(xlabel='Número de géneros para una película', ylabel='Número de películas')
plt.title('Número de géneros por película', loc='center', fontdict={'fontsize':16})
plt.xticks(rotation=90)
plt.show()

In [None]:
print('En promedio, las películas se clasifican en {0:.2f} géneros'.format(sum_movie.mean()))
print('Número de películas con mas de 4 géneros. = {0}'.format(len(sum_movie[sum_movie==4])))

## WordCloud

In [None]:
import math 
fig = plt.figure(figsize=(18, 70))
num_cols = 2
num_rows = math.ceil(len(category)/num_cols)
for idx, col in enumerate(category):
    wordcloud = WordCloud(max_font_size=50).generate(' '.join(data_train[data_train[col]==1]['plot']))
    ax = fig.add_subplot(num_rows, num_cols, idx+1)
    ax.imshow(wordcloud)
    ax.axis("off")
    ax.set(title='Movie Genre: {0}'.format(col))
plt.show()

## Análisis de correlación de género
Ahora veamos qué géneros están mayormente correlacionados. En otras palabras, si una película es, por ejemplo, Acción, ¿cuáles son los otros géneros en los que puede caer?

In [None]:
# Heatmap relative to all numeric columns
corr_matrix = (data_train[category].astype('int')).corr()
mask = np.array(corr_matrix)
mask[np.tril_indices_from(mask)] = False
corr_matrix = (100*corr_matrix//1)/10

fig = plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, mask=mask, annot=True, cbar=True, vmax=7, vmin=-7, cmap='RdYlGn')
plt.show()

Las siguientes categorías de género muestran una fuerte correlación positiva entre sí
* Acción, Aventura y Ciencia Ficción
* Animación, Fantasía y Familia
* Crimen, Thriller, Misterio y Drama
* Biografía, Documental e Historia
* drama y romanticismo
* Programa de juegos y telerrealidad
* Terror, Thriller y Fantasia
* Programa de entrevistas y noticias
* Guerra e Historia

Las siguientes categorías de género muestran una fuerte correlación negativa entre sí
* Animación y Teatro
* Comedia con Documental y Reality-TV
* Documental con Comedia, Drama y Romance
* Drama con Animación, Reality TV y Comedia

## Preprocesamiento de los datos

Aquí limpiamos nuestros datos (trama de la película) usando las siguientes técnicas de preprocesamiento

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U 'spacy[apple]'
#!python -m spacy download en_core_web


In [4]:
#!pip install strings
#nlp = spacy.load('en_core_web_trf')

In [None]:
def preprocess_text(text):
    """
    Function to process the the text text and tranform it into format usable by Machine learning models
    """
    # remove line breaks
    text = text.replace('\n', ' ')
    # to convert all the characters of the text into lower case alphabets
    text = text.lower()
    #remove numbers
    text = re.sub(r'[0-9]+', '', text) #remove numbers
    # Remove urls from the texts
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    #remove emails
    text = re.sub('\S*@\S*\s?', '', text)
    # Remove user related references from the texts:: '@' and '#' 
    text = re.sub(r'\@\w+|\#','', text)
    #remove spaces
    text = strip_multiple_whitespaces(text) 
    # Remove punctuations from the texts
    #text = text.translate(str.maketrans('', '', string.punctuation))

    # Lemmatizerr
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='v') for w in text.split()]

    joined_text = " ".join(lemma_words)
    return joined_text

In [None]:
data_train['plot_clean'] = data_train['plot'].apply(preprocess_text)

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data_train, random_state=42, test_size=0.15, shuffle=True)

print(train.shape)
print(test.shape)

## TF-IDF

In [None]:
train_text = train['plot_clean']
test_text = test['plot_clean']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

In [None]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['title','plot','plot_clean'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['title','plot','plot_clean'], axis=1)

## Multi-Label Classification

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [7]:
%%time

# Using pipeline for applying logistic regression and one vs rest classifier
accuracy=0
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for cat in category.tolist():
    printmd('**Processing {} Multilabel Clasification...**'.format(cat))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[cat])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[cat], prediction)))
    print('Test Recall is {}'.format(recall_score(test[cat], prediction)))
    print('Test f1_score is {}'.format(f1_score(test[cat], prediction)))
    print("\n")



NameError: name 'category' is not defined

In [None]:
%%time

from sklearn.linear_model import SGDClassifier

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(SGDClassifier(), n_jobs=-1)),
            ])

for cat in category.tolist():
    printmd('**Processing {} Multilabel Clasification...**'.format(cat))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[cat])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[cat], prediction)))
    print('Test Recall is {}'.format(recall_score(test[cat], prediction)))
    print('Test f1_score is {}'.format(f1_score(test[cat], prediction)))
    print("\n")

In [None]:
%%time

from sklearn.naive_bayes import MultinomialNB

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None), n_jobs=-1)),
            ])

for cat in category.tolist():
    printmd('**Processing {} Multilabel Clasification...**'.format(cat))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[cat])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[cat], prediction)))
    print('Test Recall is {}'.format(recall_score(test[cat], prediction)))
    print('Test f1_score is {}'.format(f1_score(test[cat], prediction)))
    print("\n")

## Red Neuronal

In [6]:
maxlen = 200
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(data_train['plot_clean'])
sequences = tokenizer.texts_to_sequences(data_train['plot_clean'])
x = pad_sequences(sequences, maxlen=maxlen)

NameError: name 'Tokenizer' is not defined

In [None]:
data_n = data_train[['plot','plot_clean', 'Action', 'Adventure', 'Animation', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western']]
data_n.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [None]:
num_classes = y_train.shape[1] 
Y_train = to_categorical(y_train, num_classes)
Y_test = to_categorical(y_train, num_classes)

max_words = len(tokenizer.word_index) + 1
print(f"Numero de clases:{num_classes}")
print(f"Max Words:{max_words}")


In [None]:
# Modelado
# ==============================================================================
from sklearn.datasets import make_blobs
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import multiprocessing


In [None]:
# Modelos
# ==============================================================================
modelo_1 = MLPClassifier(
                hidden_layer_sizes=(5),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 1000,
                random_state = 123
            )

modelo_2 = MLPClassifier(
                hidden_layer_sizes=(10),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 1000,
                random_state = 123
            )

modelo_3 = MLPClassifier(
                hidden_layer_sizes=(20, 20),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 5000,
                random_state = 123
            )

modelo_4 = MLPClassifier(
                hidden_layer_sizes=(50, 50, 50),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 5000,
                random_state = 123
            )

modelo_1.fit(X=X, y=y)
modelo_2.fit(X=X, y=y)
modelo_3.fit(X=X, y=y)
modelo_4.fit(X=X, y=y)

In [None]:
# Gráfico de predicciones
# ==============================================================================
fig, axs = plt.subplots(2, 2, figsize=(12,8))
axs = axs.flatten()
grid_x1 = np.linspace(start=min(X[:, 0]), stop=max(X[:, 0]), num=100)
grid_x2 = np.linspace(start=min(X[:, 1]), stop=max(X[:, 1]), num=100)
xx, yy = np.meshgrid(grid_x1, grid_x2)
X_grid = np.column_stack([xx.flatten(), yy.flatten()])

for i, modelo in enumerate([modelo_1, modelo_2, modelo_3, modelo_4]):
    
    predicciones = modelo.predict(X_grid)
    
    for j in np.unique(predicciones):
        axs[i].scatter(
            x = X_grid[predicciones == j, 0],
            y = X_grid[predicciones == j, 1], 
            c = plt.rcParams['axes.prop_cycle'].by_key()['color'][j],
            #marker = 'o',
            alpha = 0.3,
            label= f"Grupo {j}"
        )

    for j in np.unique(y):
        axs[i].scatter(
            x = X[y == j, 0],
            y = X[y == j, 1], 
            c = plt.rcParams['axes.prop_cycle'].by_key()['color'][j],
            marker    = 'o',
            edgecolor = 'black'
        )
        
    axs[i].set_title(f"Capas ocultas: {modelo.hidden_layer_sizes}")
    axs[i].axis('off')
axs[0].legend();

In [None]:
# Número de neuronas
# ==============================================================================
param_grid = {'hidden_layer_sizes':[1, 5, 10, 15, 25, 50]}

grid = GridSearchCV(
        estimator = MLPClassifier(
                        learning_rate_init=0.01,
                        solver = 'lbfgs',
                        alpha  = 0,
                        max_iter = 5000,
                        random_state = 123
                    ),
        param_grid = param_grid,
        scoring    = 'accuracy',
        cv         = 5,
        refit      = True,
        return_train_score = True
      )

_ = grid.fit(X, y)

In [None]:
fig, ax = plt.subplots(figsize=(6, 3.84))
scores = pd.DataFrame(grid.cv_results_)
scores.plot(x='param_hidden_layer_sizes', y='mean_train_score', yerr='std_train_score', ax=ax)
scores.plot(x='param_hidden_layer_sizes', y='mean_test_score', yerr='std_test_score', ax=ax)
ax.set_ylabel('accuracy')
ax.set_xlabel('número de neuronas')
ax.set_title('Error de validacion cruzada');


In [None]:
# learning rate
# ==============================================================================
param_grid = {'learning_rate_init':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(
        estimator = MLPClassifier(
                        hidden_layer_sizes=(10),
                        solver = 'adam',
                        alpha  = 0,
                        max_iter = 5000,
                        random_state = 123
                    ),
        param_grid = param_grid,
        scoring    = 'accuracy',
        cv         = 5,
        refit      = True,
        return_train_score = True
      )

_ = grid.fit(X, y)

In [None]:
fig, ax = plt.subplots(figsize=(6, 3.84))
scores = pd.DataFrame(grid.cv_results_)
scores.plot(x='param_learning_rate_init', y='mean_train_score', yerr='std_train_score', ax=ax)
scores.plot(x='param_learning_rate_init', y='mean_test_score', yerr='std_test_score', ax=ax)
ax.set_xscale('log')
ax.set_xlabel('log(learning rate)')

In [None]:
# Espacio de búsqueda de cada hiperparámetro
# ==============================================================================
param_distributions = {
    'hidden_layer_sizes': [(10), (10, 10), (20, 20)],
    'alpha': np.logspace(-3, 3, 7),
    'learning_rate_init': [0.001, 0.01, 0.1],
}

# Búsqueda por validación cruzada
# ==============================================================================
grid = RandomizedSearchCV(
        estimator  = MLPClassifier(solver = 'lbfgs', max_iter= 2000),
        param_distributions = param_distributions,
        n_iter     = 50, # Número máximo de combinaciones probadas
        scoring    = 'accuracy',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = 3, 
        verbose    = 0,
        random_state = 123,
        return_train_score = True
       )

grid.fit(X = X, y = y)

# Resultados del grid
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)')\
    .drop(columns = 'params')\
    .sort_values('mean_test_score', ascending = False)\
    .head(10)

In [None]:
modelo = grid.best_estimator_
modelo

In [None]:
grid_x1 = np.linspace(start=min(X[:, 0]), stop=max(X[:, 0]), num=100)
grid_x2 = np.linspace(start=min(X[:, 1]), stop=max(X[:, 1]), num=100)
xx, yy = np.meshgrid(grid_x1, grid_x2)

X_grid = np.column_stack([xx.flatten(), yy.flatten()])
predicciones = modelo.predict(X_grid)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

for i in np.unique(predicciones):
    ax.scatter(
        x = X_grid[predicciones == i, 0],
        y = X_grid[predicciones == i, 1], 
        c = plt.rcParams['axes.prop_cycle'].by_key()['color'][i],
        #marker = 'o',
        alpha = 0.3,
        label= f"Grupo {i}"
    )

for i in np.unique(y):
    ax.scatter(
        x = X[y == i, 0],
        y = X[y == i, 1], 
        c = plt.rcParams['axes.prop_cycle'].by_key()['color'][i],
        marker    = 'o',
        edgecolor = 'black'
    )
    

ax.set_title('Regiones de clasificación')
ax.legend();


In [None]:
history = model.fit(X_train, y_train,
                    epochs=30,
                    batch_size=32,
                    validation_split=0.3,
                    callbacks=callbacks)

In [None]:
cnn_model = model
metrics = cnn_model.evaluate(X_test, y_test)
# evaluate model
training_report_Keras = 'Training Report CNN: ' + '\n'
test_loss, test_accuracy = model.evaluate(X_test, y_test)

training_report_Keras += f'\nTest accuracy:\t\t{test_accuracy:.4f}\t- Test loss:\t\t{test_loss:.4f}\n'

preds = model.predict(X_test)
y_true = np.array([np.argmax(y) for y in y_test])
y_pred = np.array([np.argmax(y) for y in preds])
y_prob = np.array([np.amax(y) for y in preds])


training_report_Keras += classification_report(y_true, y_pred, digits=4)

print(training_report_Keras)

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()


glove_file = open('/Users/natalia/Downloads/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((max_words, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
from keras.layers import Input
from keras.layers import Flatten, LSTM
from keras.models import Model

deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(max_words, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(num_classes, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

callbacks = [
    ReduceLROnPlateau(),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])
history = model.fit(X_train, y_train,
                    batch_size=32, 
                    epochs=30, 
                    validation_split=0.3,
                    callbacks=callbacks)

In [None]:
lstm_model = model
metrics = lstm_model.evaluate(X_test, y_test)
# evaluate model
training_report_Keras = 'Training Report CNN: ' + '\n'
test_loss, test_accuracy = model.evaluate(X_test, y_test)

training_report_Keras += f'\nTest accuracy:\t\t{test_accuracy:.4f}\t- Test loss:\t\t{test_loss:.4f}\n'

preds = model.predict(X_test)
y_true = np.array([np.argmax(y) for y in y_test])
y_pred = np.array([np.argmax(y) for y in preds])
y_prob = np.array([np.amax(y) for y in preds])


training_report_Keras += classification_report(y_true, y_pred, digits=4)

print(training_report_Keras)