# Introducción a la Ciencia de Datos: Tarea 2

Este notebook contiene el código de base para realizar la Tarea 2 del curso. Puede copiarlo en su propio repositorio y trabajar sobre el mismo.
Las **instrucciones para ejecutar el notebook** están en la [página inicial del repositorio](https://gitlab.fing.edu.uy/maestria-cdaa/intro-cd/).

**Se espera que no sea necesario revisar el código para corregir la tarea**, ya que todos los resultados y análisis relevantes deberían estar en el **informe en formato PDF**.

## Cargar dependencias
Para esta tarea se han agregado algunos requerimientos; asegúrese de instalarlos (puede usar el mismo entorno virtual de la Tarea 1).

In [None]:
from time import time
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import PCA
from sklearn.feature_extraction import text 
from sklearn import preprocessing
from sklearn.neighbors import NeighborhoodComponentsAnalysis, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


## Conexión a la Base y Lectura de Datos

In [None]:
# Local directory where the Shakespeare tables are cached as CSV files;
# it is created (including parents) if it does not exist yet.
data_dir = Path("data") / "shakespeare"
data_dir.mkdir(parents=True, exist_ok=True)


def load_table(table_name, engine):
    """Return a table as a DataFrame, using a CSV file in ``data_dir`` as cache.

    If ``<data_dir>/<table_name>.csv`` already exists it is read back (first
    column as index). Otherwise the whole table is queried through ``engine``,
    the query time is printed, and the result is written to that CSV file
    before being returned.
    """
    csv_path = data_dir / f"{table_name}.csv"

    # Fast path: the table was already downloaded on a previous run.
    if csv_path.exists():
        print(f"Cargando tabla desde CSV: {csv_path}")
        return pd.read_csv(csv_path, index_col=[0])

    print(f"Consultando tabla con SQL: {table_name}")
    started = time()
    table = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    elapsed = time() - started
    print(f"Tiempo: {elapsed:.1f} segundos")

    print(f"Guardando: {csv_path}\n")
    table.to_csv(csv_path)
    return table


print("Conectando a la base...")
# SQLAlchemy URL for the "Shakespeare" MySQL database, using the pymysql driver.
conn_str = "mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/Shakespeare"
engine = create_engine(conn_str)

# All paragraphs of all works
df_paragraphs = load_table("paragraphs", engine)

df_characters = load_table("characters", engine)

df_works = load_table("works", engine)

df_chapters = load_table("chapters", engine)

In [None]:
# Display the paragraphs table
df_paragraphs

## Limpieza de Texto

In [None]:
# TODO: Actualizar con su versión de clean_text() de la Tarea_1

def clean_text(df, column_name):
    """Return a lower-cased text column with punctuation replaced by spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        The column converted to lower case, with every character listed in
        ``signos`` replaced by a single space. The apostrophe is not in the
        list and is therefore kept.
    """
    # Punctuation and whitespace characters to replace with a space (" ").
    signos = ["[", "\n", ",", "]", ".", ";", "?", "!", ":", "-", "(", ")", "&", '"', "\t"]

    # A translation table substitutes the characters literally and in a single
    # pass. Calling Series.str.replace without regex=False treats each item as
    # a regular expression on pandas < 2.0, where "[" or "?" are invalid
    # patterns and "." matches every character.
    table = str.maketrans({punc: " " for punc in signos})
    return df[column_name].str.lower().str.translate(table)

# Disabled: helper that would expand contractions.
#def expand_contractions(text):
#    out = con.fix(text, slang=False)
#    return out

# Create a new CleanText column from PlainText
df_paragraphs["CleanText"] = clean_text(df_paragraphs, "PlainText")

# Disabled: remove contractions into a separate CleanContractions column.
#df_paragraphs['CleanContractions'] = df_paragraphs['CleanText'] #.apply(expand_contractions)
#df_paragraphs['CleanContractions']= df_paragraphs['CleanContractions'].str.lower()

# Show the original text next to the cleaned text
df_paragraphs[["PlainText", "CleanText"]]

In [None]:
# Attach the work, its title/genre and the character name to every paragraph
df_dataset = (
    df_paragraphs
    .merge(df_chapters.set_index("id")["work_id"], left_on="chapter_id", right_index=True)
    .merge(df_works.set_index("id")[["Title", "GenreType"]], left_on="work_id", right_index=True)
    .merge(df_characters.set_index('id')["CharName"], left_on="character_id", right_index=True)
    .sort_index()
)

# Only these characters are used; keep just the columns needed downstream
characters = ["Antony", "Cleopatra", "Queen Margaret"]
df_dataset = df_dataset.loc[
    df_dataset["CharName"].isin(characters),
    ["CleanText", "CharName", "Title", "GenreType"],
]

df_dataset

In [None]:
# Number of paragraphs for each selected character
df_dataset["CharName"].value_counts()

## Dataset y Features de texto

In [None]:
# Features are the cleaned paragraphs; labels are the character names
X = df_dataset["CleanText"].to_numpy()
y = df_dataset["CharName"].to_numpy()

In [None]:
# Stratified train/test split with 30% of the data held out for test
# -> defines X_train, X_test, y_train, y_test (fixed seed for reproducibility)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)


In [None]:
# Sanity check: the test set should hold 30% of the dataset
print(f"Tamaño DataSet: {len(X)}")
print(f"Tamaños de Train/Test: {len(X_train)}/{len(X_test)}")
print(f"Porcentaje Test: {len(X_test) * 100 / len(X):.2f} %")


In [None]:
# Paragraph count and share of the full dataset for each selected character.
# (The former bare value_counts() call was dropped: its result was discarded.)
totalSize = len(X)
totalCharacters = dict(Counter(df_dataset["CharName"]))

for key, value in totalCharacters.items():
    print(f"{key} : {value}")
    print(f"Porcentaje Character {key}:  {value * 100 / totalSize:.2f} %")

In [None]:
# Stratified-sampling check: the character proportions should be preserved
# in the training set.
cols = ['CharName', 'Percent', 'Type']
me_y_train = dict(Counter(y_train))

train_rows = []
for key, value in me_y_train.items():
    pct = value * 100 / len(X_train)
    print(f"{key} : {value}")
    print(f"Porcentaje en muestra Character {key}:  {pct:.2f} %")
    # Keep Percent numeric (rounded to two decimals) so it can be plotted
    # or aggregated directly; formatting is only applied when printing.
    train_rows.append({'CharName': key, 'Percent': round(pct, 2), 'Type': 'Train'})

# Build the frame once instead of appending row by row.
df_train = pd.DataFrame(train_rows, columns=cols)


In [None]:
# Stratified-sampling check: the character proportions should be preserved
# in the test set.
me_y_test = dict(Counter(y_test))

test_rows = []
for key, value in me_y_test.items():
    pct = value * 100 / len(X_test)
    print(f"{key} : {value}")
    print(f"Porcentaje en test Character {key}:  {pct:.2f} %")
    # Keep Percent numeric (rounded to two decimals) so it can be plotted
    # or aggregated directly; formatting is only applied when printing.
    test_rows.append({'CharName': key, 'Percent': round(pct, 2), 'Type': 'Test'})

# Build the frame once instead of appending row by row.
df_test = pd.DataFrame(test_rows, columns=cols)


### Visualización de train vs test

In [None]:
# Pie chart of the share of each character in the training set
sns.set_style('darkgrid')
sns.set_palette('pastel')
# Sort the rows by character name before plotting
df_train = df_train.sort_values('CharName', ascending=True)
# Disabled alternative: categorical plot of the percentages per split
#sns.catplot(data=final_df, x='CharName', y='Percent', hue='Type')
plt.pie(df_train['Percent'], labels=df_train['CharName'], autopct = '%0.0f%%')
plt.show()

In [None]:
# Sort the rows by character name before plotting
df_test = df_test.sort_values('CharName', ascending=True)
# Pie chart of the share of each character in the test set
sns.set_style('darkgrid')
sns.set_palette('pastel')
# Disabled alternative: categorical plot of the percentages per split
#sns.catplot(data=final_df, x='CharName', y='Percent', hue='Type')
plt.pie(df_test['Percent'], labels=df_test['CharName'], autopct = '%0.0f%%')
plt.show()

In [None]:
# Stack the train and test samples into a single frame, tagging each row
# with the split it belongs to.

df_alltrain = pd.DataFrame({'CharName': y_train, 'CleanText': X_train, 'Type': 'Train'})
df_alltest = pd.DataFrame({'CharName': y_test, 'CleanText': X_test, 'Type': 'Test'})

final_df = pd.concat([df_alltrain, df_alltest], ignore_index=True)

final_df

In [None]:
# Histogram of paragraphs per character, coloured by split (Train/Test)
sns.set_style('darkgrid')
sns.set_palette('pastel')
# Disabled alternative: categorical plot of the percentages per split
#sns.catplot(data=final_df, x='CharName', y='Percent', hue='Type')
sns.histplot(data=final_df,  x='CharName', hue='Type', binwidth=0.05)
plt.show()

In [None]:
# One labelled frame per subset: train, test and the complete dataset
df_train = pd.DataFrame({'Paragraphs': X_train, 'CharName': y_train}).assign(Tipo='Train')
df_test = pd.DataFrame({'Paragraphs': X_test, 'CharName': y_test}).assign(Tipo='Test')
df_tot = pd.DataFrame({'Paragraphs': X, 'CharName': y}).assign(Tipo='Total')
df_TT = pd.concat([df_train, df_test, df_tot], ignore_index=True)

# Side-by-side bars per character, each subset normalised on its own
ax = sns.histplot(
    x=df_TT["CharName"],
    hue=df_TT["Tipo"],
    multiple="dodge",
    shrink=0.9,
    stat='density',
    common_norm=False,
)
ax.set_xlabel('Nombre del personaje')
ax.set_ylabel('Porcentaje de párrafos asignado')


### Extracting features from text

#### Bag of words

In [None]:
# Fit a bag-of-words vocabulary on the training texts and count the tokens
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

In [None]:
# Feature names (vocabulary terms) learned by the vectorizer
words = count_vect.get_feature_names_out()
words

In [None]:
# Look up the term 'algorithm' in the fitted vocabulary mapping
count_vect.vocabulary_.get(u'algorithm')

#### Term Frequencies

In [None]:
# Fit a term-frequency transformer (IDF weighting disabled) on the counts
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [None]:
# Apply the fitted transformer to the training counts and check the shape
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

### Conteo de palabras y TF-IDF

In [None]:
# Unigram counts on the training texts, without stop-word filtering
count_vect = CountVectorizer(stop_words=None, ngram_range=(1,1))
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

In [None]:
# Feature names (vocabulary terms) of the unigram vectorizer
words = count_vect.get_feature_names_out()

In [None]:
# Print the stop-word list configured on the vectorizer
print(count_vect.get_stop_words())

In [None]:
# Print the count matrix converted to a dense array
print(X_train_counts.toarray())

In [None]:
# Term-frequency features only: use_idf=False, so no inverse
# document-frequency weighting is applied despite the variable name.
tf_idf = TfidfTransformer(use_idf=False)
X_train_tf = tf_idf.fit_transform(X_train_counts)
X_train_tf

In [None]:
# Print the term-frequency matrix converted to a dense array
print(X_train_tf.toarray())

### Reducción de dimensionalidad

In [None]:
# PCA on the training data, keeping 5 components
reductor = PCA(n_components = 5)

# Fit on train and transform it (the matrix is made dense with toarray() first)
X_train_red = reductor.fit_transform(X_train_tf.toarray())


In [None]:
# Scatter plot of the first two PCA components, one colour per character
fig, ax = plt.subplots(figsize=(6, 6))
for character in np.unique(y_train):
    # Boolean mask selecting the training rows of this character
    mask_train = y_train == character
    ax.scatter(X_train_red[mask_train, 0], X_train_red[mask_train, 1], label=character)

ax.set_title("PCA por personaje")
ax.legend()

In [None]:
# Variance explained by each fitted PCA component
reductor.explained_variance_

In [None]:
# Comparison of the variance explained by the first PCA components under
# different text featurizations.


def _pca_explained_variance(texts, stop_words=None, ngram_range=(1, 1), use_idf=False, n_components=10):
    """Return the PCA explained variance for one featurization of ``texts``.

    The texts are vectorized with ``CountVectorizer(stop_words, ngram_range)``,
    normalized with ``TfidfTransformer(use_idf)``, made dense and reduced
    with ``PCA(n_components)``; the ``explained_variance_`` array of the
    fitted PCA is returned.
    """
    counts = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range).fit_transform(texts)
    features = TfidfTransformer(use_idf=use_idf).fit_transform(counts)
    pca = PCA(n_components=n_components)
    pca.fit_transform(features.toarray())
    return pca.explained_variance_


# Every variant is computed from X_train, so the cell no longer depends on
# whichever X_train_tf an earlier cell left behind and can be re-run safely.

# Baseline: unigrams, no stop words removed, term frequencies only
VE_control = _pca_explained_variance(X_train)

# Without stop words (English list plus archaic pronouns)
additional_stop_words = ['thou', 'thee', 'thy', 'ye', 'thine']
VE_SW = _pca_explained_variance(
    X_train,
    stop_words=list(text.ENGLISH_STOP_WORDS.union(additional_stop_words)),
)

# Using IDF weighting
VE_IDF = _pca_explained_variance(X_train, use_idf=True)

# With unigrams + bigrams
VE_BG = _pca_explained_variance(X_train, ngram_range=(1, 2))

VE = [VE_control, VE_SW, VE_IDF, VE_BG]

In [None]:
# Explained-variance arrays of all compared variants
VE

In [None]:
# Long-format table with the explained variance of every PCA component,
# one group of rows per featurization variant.
df_PCA = pd.DataFrame(VE_control, columns=['VarExplicada'])
df_PCA['Tipo']= 'Simple'

auxSW = pd.DataFrame(VE_SW, columns=['VarExplicada'])
auxSW['Tipo']= 'Sin Stopwords'

auxIDF = pd.DataFrame(VE_IDF, columns=['VarExplicada'])
auxIDF['Tipo']= 'TF-IDF'

# Fixed: the bigram rows must come from VE_BG (they were built from VE_SW,
# which duplicated the stop-word results under the 'Bigrama' label).
auxBG = pd.DataFrame(VE_BG, columns=['VarExplicada'])
auxBG['Tipo']= 'Bigrama'

# Keep each frame's own 0-based index so that, after reset_index(), the
# 'index' column holds the component number; shift it to start at 1.
df_PCA = pd.concat([df_PCA, auxSW, auxIDF, auxBG], ignore_index=False).reset_index()
df_PCA['index'] = df_PCA['index']+1
df_PCA


In [None]:
# Bar plot of the explained variance per PCA component, grouped by variant
ax = sns.barplot(x=df_PCA["index"], y=df_PCA["VarExplicada"], hue=df_PCA["Tipo"])
ax.set_xlabel('Componentes del PCA')
ax.set_ylabel('Varianza explicada')

## Modelos de Clasificación

In [None]:
# Disabled: integer-encode the labels before training
#le = preprocessing.LabelEncoder()
#y_train_enc = le.fit_transform(y_train)

# Features: unigram + bigram counts followed by term-frequency normalization
count_vect = CountVectorizer(stop_words=None, ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(X_train)
tf = TfidfTransformer(use_idf=False)
X_train_tf = tf.fit_transform(X_train_counts)

# Train a Multinomial Naive Bayes classifier on the training features
bayes_clf = MultinomialNB().fit(X_train_tf, y_train)

# Show the first 10 predictions on train
y_pred_train = bayes_clf.predict(X_train_tf)
y_pred_train[:10]

In [None]:
# Shape of the raw training texts
X_train.shape

In [None]:
# Shape of the training count matrix
X_train_counts.shape

In [None]:
# Shape of the training term-frequency matrix
X_train_tf.shape

In [None]:
def get_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples);
    they are converted with ``np.asarray`` so the comparison is always
    element-wise. The result is the number of matches divided by
    ``len(y_true)``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return (y_true == y_pred).sum() / len(y_true)

# Accuracy of the classifier on the training set
get_accuracy(y_train, y_pred_train)

In [None]:
# Predict on test, show the confusion matrix and report accuracy.
# First step: vectorize the test texts with the already fitted vectorizer.
X_test_counts = count_vect.transform(X_test)
X_test_counts.shape

In [None]:
# Apply the term-frequency transformer fitted on train to the test counts
X_test_tf = tf.transform(X_test_counts)

In [None]:
# Predict the test labels and compute the test accuracy
y_test_pred = bayes_clf.predict(X_test_tf)
get_accuracy(y_test, y_test_pred)

In [None]:
# Confusion matrix of the Naive Bayes predictions on test.
# from_predictions() already draws the matrix, so calling disp.plot()
# afterwards would render the same figure a second time.
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred)
plt.show()

In [None]:
def precision(y_true, y_pred, label):
    """Return the precision for ``label``.

    Computed as the number of positions where both ``y_true`` and ``y_pred``
    equal ``label`` divided by the number of positions where ``y_pred``
    equals ``label``.
    """
    predicted = y_pred == label
    hits = predicted & (y_true == label)
    return hits.sum() / predicted.sum()

def recall(y_true, y_pred, label):
    """Return the recall for ``label``.

    Computed as the number of positions where both ``y_true`` and ``y_pred``
    equal ``label`` divided by the number of positions where ``y_true``
    equals ``label``.
    """
    actual = y_true == label
    hits = actual & (y_pred == label)
    return hits.sum() / actual.sum()

In [None]:
# Precision and recall per character, printed as a small aligned table
print('\t \t Antony | Queen Margaret | Cleopatra')
precision_row = tuple(
    precision(y_test, y_test_pred, name)
    for name in ('Antony', 'Queen Margaret', 'Cleopatra')
)
print('Precision \t  %.2f \t       %.2f  \t     %.2f ' % precision_row)
recall_row = tuple(
    recall(y_test, y_test_pred, name)
    for name in ('Antony', 'Queen Margaret', 'Cleopatra')
)
print('Recall \t \t  %.2f \t       %.2f  \t     %.2f ' % recall_row)

### Búsqueda de hiper-parámetros con Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

# Hyper-parameter variants to evaluate. Each dict configures the vectorizer
# ("stop_words", "ngram") and the TF/TF-IDF transformer ("idf").
# TODO: add more parameter variants that seem relevant
param_sets = [{"stop_words": None, "ngram": (1,2), "idf": True},
             {"stop_words": None, "ngram": (1,1), "idf": False}]

# Stratified 4-fold splitter, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# From here on we use train/validation/test, so the previous training set
# (train + validation) is referred to as the dev(elopment) dataset.
X_dev = X_train
y_dev = y_train
# Validation accuracies of every fold, appended in order for all parameter
# sets (the list does not record which parameter set each value belongs to).
acc_dev = []

# # To avoid mistakes
# del X_train
# del y_train

# NOTE(review): this loop rebinds module-level names such as count_vect,
# tf_idf, X_train_counts, X_train_tf and bayes_clf; after the cell runs they
# hold the objects of the last fold of the last parameter set.
for params in param_sets:
    
    # Transformations to apply (featurizers)
    count_vect = CountVectorizer(stop_words=params["stop_words"], ngram_range=params["ngram"])
    tf_idf = TfidfTransformer(use_idf=params["idf"])
    
    for train_idxs, val_idxs in skf.split(X_dev, y_dev):
        
        # Train and validation subsets for the current split
        X_train_ = X_dev[train_idxs]
        y_train_ = y_dev[train_idxs]
        X_val = X_dev[val_idxs]
        y_val = y_dev[val_idxs]
        
        # Fit the featurizers on Train and transform it
        X_train_counts = count_vect.fit_transform(X_train_)
        X_train_tf = tf_idf.fit_transform(X_train_counts)
        
        # Train on the Train subset
        bayes_clf = MultinomialNB().fit(X_train_tf, y_train_)

        # Transform Validation with the featurizers fitted on Train
        X_val_counts = count_vect.transform(X_val)
        X_val_tfidf = tf_idf.transform(X_val_counts)
        
        # Predict and evaluate on Validation
        y_pred_val = bayes_clf.predict(X_val_tfidf)
        acc = get_accuracy(y_val, y_pred_val)
        acc_dev.append(acc)
        #print(f"{acc=:.4f} {params=}")
        print(acc)


### KNN

In [None]:
# KNN-specific names for the train/test data. These are plain assignments,
# so each name refers to the same array object as the original (no copy).
X_train_knn = X_train
y_train_knn = y_train
X_test_knn = X_test
y_test_knn = y_test

In [None]:
# Feature extraction for KNN
# Transformations to apply (featurizers): unigram + bigram counts followed
# by term-frequency normalization (use_idf=False)
count_vect = CountVectorizer(stop_words=None, ngram_range=(1,2))
tf_idf = TfidfTransformer(use_idf=False)

# Fit the featurizers on Train and transform it
X_train_knn_counts = count_vect.fit_transform(X_train_knn)
X_train_knn_tf = tf_idf.fit_transform(X_train_knn_counts)


# Transform Test with the featurizers fitted on Train
X_test_knn_counts = count_vect.transform(X_test_knn)
X_test_knn_tfidf = tf_idf.transform(X_test_knn_counts)


In [None]:
# Train a 3-nearest-neighbours classifier and report its score on train
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_knn_tf, y_train_knn)
score = knn.score(X_train_knn_tf, y_train_knn)
print("Training score: ", score) 

In [None]:
# Predict the test labels with the fitted KNN classifier
y_pred_knn = knn.predict(X_test_knn_tfidf)

In [None]:
# Evaluate: accuracy of the KNN predictions on test
acc_knn = get_accuracy(y_test_knn, y_pred_knn)
print("Acc knn:", acc_knn)

In [None]:
# Confusion matrix of the KNN predictions on test.
# from_predictions() already draws the matrix, so calling disp.plot()
# afterwards would render the same figure a second time.
disp = ConfusionMatrixDisplay.from_predictions(y_test_knn, y_pred_knn)
plt.show()

In [None]:
# Per-class classification report for the KNN predictions on test
cr = classification_report(y_test_knn, y_pred_knn)
print(cr)

#### NCA y KNN

In [None]:
# NCA-specific names for the training data (same objects, not copies)
X_train_knn_nca = X_train
y_train_knn_nca = y_train

In [None]:
# Neighborhood Components Analysis (fixed seed) and the 3-NN classifier
# intended to be trained on its output
nca = NeighborhoodComponentsAnalysis(random_state=1)
knn_nca = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the featurizers on Train and transform it
X_train_knn_nca_counts = count_vect.fit_transform(X_train_knn_nca)
X_train_knn_nca_tf = tf_idf.fit_transform(X_train_knn_nca_counts)

# Transform Test with the featurizers fitted just above.
# Fixed: the TF matrix is now built from X_test_knn_nca_counts; it was built
# from X_test_knn_counts, a matrix produced by an earlier cell.
X_test_knn_nca_counts = count_vect.transform(X_test_knn)
X_test_knn_nca_tfidf = tf_idf.transform(X_test_knn_nca_counts)

# Disabled: fit NCA, train KNN on the embedded data and compute the
# nearest-neighbour accuracy on the embedded test set.
#nca.fit(X_train_knn_nca_tf, y_train_knn_nca)
#knn_nca.fit(nca.transform(X_train_knn_nca_tf), y_train_knn_nca)
#acc_knn_nca = knn_nca.score(nca.transform(X_test_knn_nca_tfidf), y_test_knn)
#acc_knn_nca


### (Opcional) Comparativa con Fasttext

In [None]:
!pip install fasttext

In [None]:
import fasttext

# Replace spaces in the labels with underscores so each label is a single
# whitespace-free token
y_train_s = np.char.replace(y_train.astype(str), " ", "_").astype(object)
y_test_s = np.char.replace(y_test.astype(str), " ", "_").astype(object)

# Convert to the fasttext format: a text file where each line is:
# __label__<label> TEXT
Xytrains = "__label__" + y_train_s.astype(object) + " " + X_train
Xytests = "__label__" + y_test_s.astype(object) + " " + X_test
np.savetxt(data_dir / "train.txt", Xytrains, fmt="%s")
np.savetxt(data_dir / "test.txt", Xytests, fmt="%s")

# Show the first formatted test line as a sanity check
Xytests[0]

In [None]:
# Train a supervised fastText model on the training file (100 epochs,
# word n-grams up to 2) and evaluate it on the test file
model = fasttext.train_supervised(input=str(data_dir / "train.txt"), epoch=100, wordNgrams=2)
model.test(str(data_dir / "test.txt"))

In [None]:
# Predict the test texts with fastText
y_out = model.predict(list(X_test))
# Take the first label of each entry in y_out[0] and strip the "__label__"
# prefix (presumably y_out[0] holds the predicted labels per text - confirm
# against the fastText API)
y_pred_test = [y[0].replace("__label__", "") for y in y_out[0]]
    
# Compare against the underscore-normalized test labels
print(get_accuracy(y_test_s, y_pred_test))