# CP 4 Aprendizaje de Máquinas
---

## Regresión Logística

In [None]:
from pathlib import Path

# Folders of the polarity corpus, one per sentiment label.
path_p = Path("txt_sentoken/pos")
path_n = Path("txt_sentoken/neg")

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting fixes the document order so that everything derived from it
# (feature matrix rows, train/test split) is repeatable across runs and machines.
ds_p = sorted(path_p.iterdir())     # files holding the positive reviews
ds_n = sorted(path_n.iterdir())     # files holding the negative reviews

def convert_file_to_text(file_path: Path) -> str:
    """Return the complete content of ``file_path`` as a single string.

    The file is opened in text mode with the platform's default encoding and
    is closed again before the function returns.
    """
    with open(file_path) as handle:
        content = handle.read()
    return content
    
texts_p = list(map(convert_file_to_text, ds_p))    # positive reviews, one string per file
texts_n = list(map(convert_file_to_text, ds_n))    # negative reviews, one string per file

### Ejercicio 1: Regresión Logística aplicada al dataset de _Rotten Tomatoes_

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 

# Bag-of-words model: one row per review, one column per vocabulary term.
cv = CountVectorizer()

# Keep the document-term matrix sparse. Almost every entry is zero, and both
# train_test_split and LogisticRegression accept SciPy sparse input directly,
# so a dense copy (.toarray()) only multiplies memory use.
X = cv.fit_transform(texts_p + texts_n)

# Labels follow the concatenation order above: 1 = positive, 0 = negative.
# The counts come from the data rather than a hard-coded 1000, so X and y
# cannot silently fall out of alignment if the corpus folders change.
y = [1] * len(texts_p) + [0] * len(texts_n)

In [None]:
# Dimensions of the document-term matrix: (number of reviews, vocabulary size).
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# 70% of the reviews are used for training, the rest for evaluation.
# random_state pins the shuffle so the reported scores are repeatable;
# stratify=y keeps the positive/negative ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.70, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the held-out reviews (displayed as the cell output).
lr_model.score(X_test, y_test)

### Ejercicio 2: Aplicando la Matriz de Confusión

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # Removed in scikit-learn 1.2; ConfusionMatrixDisplay above supersedes it.
    plot_confusion_matrix = None

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay.from_estimator is its replacement and accepts the same
# arguments used here.
disp = ConfusionMatrixDisplay.from_estimator(
    lr_model, X_test, y_test,
    display_labels=['Negative Class', 'Positive Class'],
    cmap=plt.cm.Blues,
    normalize='true',  # normalise over the true labels (rows)
)
disp.ax_.set_title('Logistic Regression Confusion matrix, with normalization');

### Ejercicio 3: Probando la clasificación de un texto en específico

In [None]:
# Class labels known to the model; the columns of predict_proba follow this order.
lr_model.classes_

In [None]:
# Encode one hand-written review with the vocabulary already learned by cv
# (transform only, no re-fitting), then display the predicted class probabilities.
test_review = cv.transform(["The movie was not fun to watch"])
lr_model.predict_proba(test_review)

### Ejercicio 4: Generación de Bigramas 

In [None]:
# Bigram-only bag of words: each feature is a pair of consecutive tokens.
cv = CountVectorizer(ngram_range=(2,2))

# The result is deliberately left as a SciPy sparse matrix. A bigram vocabulary
# is typically far larger than a unigram one, so a dense copy (.toarray()) can
# require gigabytes of RAM, while train_test_split and LogisticRegression
# work on the sparse form directly.
X = cv.fit_transform(texts_p + texts_n)

In [None]:
# Dimensions of the bigram matrix: (number of reviews, number of distinct bigrams).
X.shape

### Ejercicio 5: Ideas para Mejorar: Utilizar Bigramas con Regresión Logística

In [None]:
from sklearn.model_selection import train_test_split

# Re-split because X now holds the bigram features (y is unchanged).
# random_state pins the shuffle so the score is repeatable; stratify=y keeps
# the positive/negative ratio identical in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.70, random_state=42, stratify=y
)

In [None]:
# Same classifier settings as before, trained on the current split;
# fit() returns the estimator, which allows the chained form.
lr_model = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the held-out reviews (displayed as the cell output).
lr_model.score(X_test, y_test)

### Ejercicio 6: Visualizando los Bigramas más Importantes

In [None]:
import numpy as np
import seaborn as sns

In [None]:
# First row of the learned coefficient matrix: one weight per vocabulary column.
feature_importance = lr_model.coef_[0]
# Column indices ordered from the most negative weight to the most positive one.
sorted_idx = np.argsort(feature_importance)

In [None]:
# vocabulary_ maps term -> column index. Invert it once, instead of rebuilding
# two full lists of the vocabulary and scanning them linearly for every lookup.
index_to_term = {index: term for term, index in cv.vocabulary_.items()}

# sorted_idx is ascending, so reading its last ten entries backwards yields the
# terms with the largest weights first. (A slice also copes with a vocabulary
# of fewer than ten terms, where explicit negative indices would raise.)
top_10_pos_w = [index_to_term[w] for w in sorted_idx[-1:-11:-1]]
print(top_10_pos_w)

In [None]:
# Weights at the last ten positions of sorted_idx, taken from the end backwards
# (largest weight first).
top_10_pos_importance = feature_importance[sorted_idx[range(-1,-11, -1)]]
print(top_10_pos_importance)

In [None]:
# Explicit Figure/Axes interface: every call below targets this Axes directly
# instead of whichever one pyplot currently treats as "current".
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_10_pos_w, y=top_10_pos_importance, ax=ax)
ax.set_title("Most Important Words Used for Positive Sentiment", fontsize=13)
# Tilt the tick labels so that neighbouring terms do not overlap.
plt.setp(ax.get_xticklabels(), rotation=40)
ax.set_ylabel('Feature Importance', fontsize=12)
ax.set_xlabel('Word', fontsize=12);

In [None]:
# vocabulary_ maps term -> column index. Invert it once, instead of rebuilding
# two full lists of the vocabulary and scanning them linearly for every lookup.
index_to_term = {index: term for term, index in cv.vocabulary_.items()}

# sorted_idx is ascending, so its first ten entries are the most negative weights.
top_10_neg_w = [index_to_term[w] for w in sorted_idx[:10]]
print(top_10_neg_w)

In [None]:
# Weights at the first ten positions of sorted_idx (most negative weight first).
top_10_neg_importance = feature_importance[sorted_idx[:10]]
print(top_10_neg_importance)

In [None]:
# Explicit Figure/Axes interface: every call below targets this Axes directly
# instead of whichever one pyplot currently treats as "current".
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_10_neg_w, y=top_10_neg_importance, ax=ax)
ax.set_title("Most Important Words Used for Negative Sentiment", fontsize=13)
# Tilt the tick labels so that neighbouring terms do not overlap.
plt.setp(ax.get_xticklabels(), rotation=40)
ax.set_ylabel('Feature Importance', fontsize=12)
ax.set_xlabel('Word', fontsize=12);

In [None]:
# Same sentence as in the earlier exercise, now encoded with whichever vectoriser
# cv currently refers to (the bigram one, assuming the cells ran top to bottom).
test_review = cv.transform(["The movie was not fun to watch"])
lr_model.predict_proba(test_review)

### Ejercicio 7: Ideas para Mejorar: Eliminar Ruido

In [None]:
# Noise reduction: ignore terms found in fewer than 10 reviews or in more than
# 95% of them, record presence/absence instead of counts, and use unigrams
# together with bigrams.
cv = CountVectorizer(min_df=10, max_df=0.95, binary=True, ngram_range=(1,2))

# Left as a SciPy sparse matrix: the estimator and train_test_split accept it
# as is, so a dense copy (.toarray()) would only cost memory.
X = cv.fit_transform(texts_p + texts_n)
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Re-split because X now holds the noise-filtered features (y is unchanged).
# random_state pins the shuffle so the score is repeatable; stratify=y keeps
# the positive/negative ratio identical in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.70, random_state=42, stratify=y
)

In [None]:
# Same classifier settings as before, trained on the current split;
# fit() returns the estimator, which allows the chained form.
lr_model = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the held-out reviews (displayed as the cell output).
lr_model.score(X_test, y_test)

### Ejercicio 8: Ideas para Mejorar: Disminuir el umbral de corte de probabilidad

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
# One row per test review, one column per class. The labels are 0 and 1, so
# column 1 holds the predicted probability of the positive class.
pred_proba_df = pd.DataFrame(lr_model.predict_proba(X_test))
positive_proba = pred_proba_df[1].to_numpy()

# Decision thresholds to compare; 0.5 reproduces the default predict() cut-off.
threshold_list = [0.3, 0.4, 0.45, 0.5]
for threshold in threshold_list:
    print('\n******** For i = {} ******'.format(threshold))
    # A review is labelled positive when its probability exceeds the threshold.
    # A single vectorised comparison on the positive-class column replaces the
    # element-wise DataFrame.applymap call (deprecated since pandas 2.1), which
    # also thresholded the unused column 0.
    y_pred_at_threshold = (positive_proba > threshold).astype(int)
    test_accuracy = round(accuracy_score(y_test, y_pred_at_threshold), 3)
    print('Accuracy: {}'.format(test_accuracy))