In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os
import re
from sklearn import svm
import warnings
warnings.filterwarnings('ignore')

In [2]:
reviews_train = []
for line in open('data/movie_data/full_train.txt', 'r'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('data/movie_data/full_test.txt', 'r'):
    
    reviews_test.append(line.strip())

len(reviews_train)

25000

In [3]:
reviews_train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

### Limpieza de Datos y Tokenizacion

In [4]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [5]:
reviews_train_clean[5]

'this isnt the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robins new love of the thriller but this isnt a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until williams character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all its worth a watch though its definitely not friday saturday night fare it rates a   from the fiend '

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

X.shape

(25000, 90860)

### Entrenamiento con Logisitc Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# los primeros 12500 son comentarios negativos, el resto son positivos.
y = [1 if i < 12500 else 0 for i in range(25000)]

# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.75
)

# vamos a probar LR con varios valores de regularizacion.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='lbfgs')
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_test, lr.predict(X_test))))

Accuracy for C=0.01: 0.87136
Accuracy for C=0.05: 0.88448
Accuracy for C=0.25: 0.88032
Accuracy for C=0.5: 0.8784
Accuracy for C=1: 0.87648


In [8]:
from sklearn.model_selection import cross_val_score

final_model = LogisticRegression(C=0.05, solver='lbfgs')
print("Cross-Validation Score:",cross_val_score(final_model, X, y, cv=5).mean())

Cross-Validation Score: 0.86884


### Coding Time! - Observe algunos hiperparametros de CountVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

**max_df : float in range [0.0, 1.0] or int, default=1.0**
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None

**min_df : float in range [0.0, 1.0] or int, default=1**
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

**stop_words : string {‘english’}, list, or None (default)**
If ‘english’, a built-in stop word list for English is used. There are several known issues with ‘english’ and you should consider an alternative.

**Use SVM en lugar de Logisitc Regression**
utilize diferentes tipos de kernel e hiperparametros para mejorar su modelo.

In [9]:
# ANTORCHA!

In [10]:
svm_model_linear = svm.SVC(kernel='linear')
clf = svm_model_linear.fit(X_train, y_train)
print ("Accuracy Linear",accuracy_score(y_test, svm_model_linear.predict(X_test)))

Accuracy Linear 0.85008


In [11]:
svm_model_poly = svm.SVC(kernel='poly',degree=3)
clf = svm_model_poly.fit(X_train, y_train)
print ("Accuracy",accuracy_score(y_test, svm_model_poly.predict(X_test)))

Accuracy 0.81344


In [None]:
svm_model_radial = svm.SVC(kernel='rbf',gamma='auto')
clf = svm_model_radial.fit(X_train, y_train)
print ("Accuracy",accuracy_score(y_test, svm_model_radial.predict(X_test)))