# Laboratorio final: Sentiment Analysis 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from util import print_eval
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from util import print_short_eval
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

## Load Data

In [3]:
# Load the labelled reviews; one sub-folder per class, kept in on-disk order.
dataset = load_files(container_path='reviews_sentoken', shuffle=False)

## División: Train y Dev

Haremos el siguiente split:
  - Train: 75% 
  - Dev: 25% 

Separamos Dev (el conjunto de Test se entrega aparte):

In [4]:
# Hold out 25% of the labelled reviews as the development set.
# random_state is fixed so the partition is the same on every run.
X_train, X_dev, y_train, y_dev = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=42,
)

In [5]:
len(X_train), len(X_dev)

(802, 268)

In [6]:
Counter(y_train), Counter(y_dev)

(Counter({0: 389, 1: 413}), Counter({0: 146, 1: 122}))

## CountVectorizer +  LinearSVC

In [7]:
# Baseline: raw token counts fed to a linear SVM.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LinearSVC(random_state=0)),
])

In [8]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..., max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))])

In [9]:

print_eval(pipeline, X_train, y_train)

accuracy	1.00

             precision    recall  f1-score   support

        neg       1.00      1.00      1.00       389
        pos       1.00      1.00      1.00       413

avg / total       1.00      1.00      1.00       802

[[389   0]
 [  0 413]]


In [10]:

print_eval(pipeline, X_dev, y_dev)

accuracy	0.82

             precision    recall  f1-score   support

        neg       0.82      0.86      0.84       146
        pos       0.82      0.77      0.79       122

avg / total       0.82      0.82      0.82       268

[[125  21]
 [ 28  94]]


## Test con algo nuestro...

In [11]:
pipeline.predict(['this bad'])

array([0])

In [12]:
pipeline.predict(['this is good'])

array([1])

In [13]:
pipeline.predict(['this is very very good'])

array([0])

In [14]:
pipeline.predict(['this movie is not bad, it is good'])

array([0])

## 1er Experimento: Binarizar Conteos

Probemos con **binary=True** a ver si se arregla.

In [15]:
# Same baseline, but with binary (presence/absence) term features
# instead of raw counts. Pipeline.fit returns the fitted pipeline itself.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.80

             precision    recall  f1-score   support

        neg       0.81      0.84      0.82       146
        pos       0.79      0.76      0.78       122

avg / total       0.80      0.80      0.80       268

[[122  24]
 [ 29  93]]


## Test con algo nuestro nuevamente...

In [16]:
pipeline.predict(['this bad'])

array([0])

In [17]:
pipeline.predict(['this is good'])

array([1])

In [18]:
pipeline.predict(['this is very very good'])

array([1])

In [19]:
pipeline.predict(['this movie is not bad, it is good'])

array([0])

## Distintos Modelos de Clasificación

Probamos distintos modelos de clasificación usando los valores por defecto.

Evaluamos en train (bias) y en dev (variance).

In [20]:
# Candidate classifiers with default hyper-parameters. random_state is pinned
# to 0 on every estimator that is constructed with one.
seeded_models = (
    DecisionTreeClassifier,
    LogisticRegression,
    LinearSVC,
    SVC,
    RandomForestClassifier,
)
clfs = [KNeighborsClassifier(), MultinomialNB()]
clfs.extend(model(random_state=0) for model in seeded_models)

In [21]:
vect = CountVectorizer(binary=True)

# For each candidate, print its class and then two score lines:
# first on train, then on dev.
for clf in clfs:
    print(clf.__class__)
    pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)]).fit(X_train, y_train)
    for X_part, y_part in ((X_train, y_train), (X_dev, y_dev)):
        print_short_eval(pipeline, X_part, y_part)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.74	macro f1	0.73
accuracy	0.56	macro f1	0.50
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.98	macro f1	0.98
accuracy	0.82	macro f1	0.82
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.66	macro f1	0.66
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	1.00	macro f1	1.00
accuracy	0.83	macro f1	0.83
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.80	macro f1	0.80
<class 'sklearn.svm.classes.SVC'>


  'precision', 'predicted', average, warn_for)


accuracy	0.51	macro f1	0.34
accuracy	0.46	macro f1	0.31
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.74	macro f1	0.74


Los mejores modelos parecen ser la regresión logística y la SVM con kernel lineal.

## Vectorizador TF-IDF

In [22]:
vect = TfidfVectorizer(binary=True)

# Same comparison as with the count vectorizer, now on TF-IDF features:
# class name, then the train score line, then the dev score line.
for clf in clfs:
    print(clf.__class__)
    pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)]).fit(X_train, y_train)
    for X_part, y_part in ((X_train, y_train), (X_dev, y_dev)):
        print_short_eval(pipeline, X_part, y_part)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.87	macro f1	0.87
accuracy	0.76	macro f1	0.76
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.99	macro f1	0.99
accuracy	0.81	macro f1	0.81
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.56	macro f1	0.56
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	0.99	macro f1	0.99
accuracy	0.82	macro f1	0.82
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.84	macro f1	0.84
<class 'sklearn.svm.classes.SVC'>


  'precision', 'predicted', average, warn_for)


accuracy	0.51	macro f1	0.34
accuracy	0.46	macro f1	0.31
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.65	macro f1	0.63


Nos quedaremos con el TfidfVectorizer y la SVM con kernel lineal.

In [23]:
# Chosen combination: binary TF-IDF features with a linear SVM.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.88      0.82      0.85       146
        pos       0.80      0.87      0.83       122

avg / total       0.84      0.84      0.84       268

[[119  27]
 [ 16 106]]


## Test con algo nuestro nuevamente...

In [24]:
pipeline.predict(['this is bad'])

array([0])

In [25]:
pipeline.predict(['this is good'])

array([1])

In [26]:
pipeline.predict(['this is very very good'])

array([1])

In [27]:
pipeline.predict(['this movie is not bad, it is good'])

array([0])

## Vectorizador

Primero hagamos un estudio superficial para ver qué parámetros vale la pena analizar.

### Rango de n-gramas

In [30]:
# Quick probe: add bigrams on top of unigrams.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, ngram_range=(1, 2))),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.88      0.82      0.84       146
        pos       0.80      0.86      0.83       122

avg / total       0.84      0.84      0.84       268

[[119  27]
 [ 17 105]]


### Min Frequency

In [37]:
# Quick probe: drop terms that appear in fewer than 7 training documents.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, min_df=7)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.86      0.85      0.86       146
        pos       0.82      0.84      0.83       122

avg / total       0.84      0.84      0.84       268

[[124  22]
 [ 20 102]]


### Max Frequency

In [34]:
# Quick probe: drop terms that appear in more than 95% of training documents.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.88      0.82      0.85       146
        pos       0.80      0.87      0.83       122

avg / total       0.84      0.84      0.84       268

[[119  27]
 [ 16 106]]


### Stop words

In [33]:
# Quick probe: remove scikit-learn's built-in English stop-word list.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, stop_words='english')),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.90      0.78      0.84       146
        pos       0.77      0.90      0.83       122

avg / total       0.85      0.84      0.84       268

[[114  32]
 [ 12 110]]


### Grid-Search en Development

Probemos muchas de las combinaciones posibles de valores.

In [40]:
from sklearn.model_selection import ParameterGrid

# Keys follow the Pipeline convention "<step name>__<parameter>".
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
    vect__min_df=[6, 7, 14, 15],
    vect__max_df=[0.95, 0.9, 0.8],
    clf__random_state=[0],
)

# Materialise every combination so the search loop can iterate a plain list.
params_list = [*ParameterGrid(param_grid)]

In [41]:
from sklearn import metrics
# NOTE(review): this import shadows the built-in eval() for the rest of the
# session; consider renaming the helper in util.
from util import eval

pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Manual grid search: refit on train for every grid point and score on dev.
# Each stored row merges the scores returned by util.eval with the parameters
# that produced them (presumably a dict of metrics; verify against util).
results = []
for params in params_list:
    # TODO: add progress bar!
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    results.append({**result, **params})

In [42]:
# One row per grid point; show the ten best by accuracy, ties broken by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
1,0.86194,0,0.861243,True,0.95,6,"(1, 2)"
17,0.86194,0,0.861243,True,0.9,6,"(1, 2)"
33,0.86194,0,0.861243,True,0.8,6,"(1, 2)"
35,0.858209,0,0.857566,True,0.8,6,"(1, 4)"
2,0.858209,0,0.857415,True,0.95,6,"(1, 3)"
18,0.858209,0,0.857415,True,0.9,6,"(1, 3)"
21,0.858209,0,0.857415,True,0.9,7,"(1, 2)"
37,0.858209,0,0.857415,True,0.8,7,"(1, 2)"
34,0.854478,0,0.85389,True,0.8,6,"(1, 3)"
5,0.854478,0,0.853742,True,0.95,7,"(1, 2)"


Tomaremos la primera configuración...

In [43]:
# Refit the top-ranked grid-search configuration and show the full report on dev.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, min_df=6, max_df=0.95, ngram_range=(1, 2))),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.86

             precision    recall  f1-score   support

        neg       0.89      0.86      0.87       146
        pos       0.83      0.87      0.85       122

avg / total       0.86      0.86      0.86       268

[[125  21]
 [ 16 106]]


## Test con algo nuestro nuevamente...

In [44]:
pipeline.predict(['this is bad'])

array([0])

In [45]:
pipeline.predict(['this is good'])

array([1])

In [46]:
pipeline.predict(['this is very very good'])

array([1])

In [47]:
pipeline.predict(['this movie is not bad, it is good'])

array([0])

# Incorporar Lexicon

La idea es incorporar información externa acerca de la presencia de palabras positivas y negativas.

Probamos con:
- https://mpqa.cs.pitt.edu/lexicons/subj_lexicon/

In [53]:
# Read the MPQA subjectivity lexicon, one clue per line.
filename = 'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'
# The context manager closes the file even if reading raises.
with open(filename) as f:
    lines = f.readlines()

In [54]:
# Map each prior-polarity label to the coarse tag later used as a feature.
# Labels not listed here ('both', 'neutral', or anything unexpected) are
# skipped rather than being silently counted as positive.
# NOTE(review): assumes 'positive' is the only positive label in the lexicon - confirm.
polarity_tags = {'positive': 'POS', 'negative': 'NEG', 'weakneg': 'NEG'}

words = []
for line in lines:
    sline = line.split()
    # Keep only key=value tokens. Split on the first '=' so a value that itself
    # contains '=' cannot break the dict construction.
    dline = dict(token.split('=', 1) for token in sline if '=' in token)
    word = dline.get('word1')
    pol = polarity_tags.get(dline.get('priorpolarity'))
    if word is None or pol is None:
        # Blank or malformed line, or a polarity that is not modelled.
        continue
    words.append((word, pol))

# If a word is listed more than once, the last entry in the file wins.
word_dict = dict(words)

## Nuevo Tokenizer

In [63]:
# Default scikit-learn word tokenizer; reused so only the lexicon lookup is new.
tkn = TfidfVectorizer().build_tokenizer()


def my_tkn(s):
    """Tokenize ``s`` and swap every lexicon word for its polarity tag.

    Tokens found in ``word_dict`` are replaced by the tag stored there;
    all other tokens are returned unchanged, in their original order.
    """
    tagged = []
    for token in tkn(s):
        tagged.append(word_dict.get(token, token))
    return tagged

In [64]:
my_tkn('bad and good')

['NEG', 'and', 'POS']

In [65]:
# Two-column vectorizer: only the POS and NEG tags produced by my_tkn are kept.
vect = TfidfVectorizer(tokenizer=my_tkn, vocabulary=['POS', 'NEG'])
vect.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tkn at 0x000000090967E6A8>, use_idf=True,
        vocabulary=['POS', 'NEG'])

In [66]:
vect.transform(X_train[:10]).toarray()

array([[0.70022702, 0.71392024],
       [0.6937695 , 0.72019711],
       [0.98918182, 0.14669464],
       [0.63669951, 0.77111201],
       [0.90437497, 0.42673869],
       [0.97911993, 0.20328346],
       [0.887563  , 0.46068637],
       [0.72890887, 0.68461074],
       [0.80326737, 0.59561861],
       [0.77492011, 0.63205919]])

Con FeatureUnion

In [67]:
from sklearn.pipeline import FeatureUnion

# Concatenate the bag-of-n-grams features with the two lexicon polarity
# features, then classify with a linear SVM.
pipeline = Pipeline(steps=[
    ('vect', FeatureUnion(transformer_list=[
        ('bow', TfidfVectorizer(binary=True, min_df=5, max_df=0.95, ngram_range=(1, 3))),
        ('pol', TfidfVectorizer(vocabulary=['POS', 'NEG'], tokenizer=my_tkn)),
    ])),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.87      0.82      0.85       146
        pos       0.80      0.85      0.83       122

avg / total       0.84      0.84      0.84       268

[[120  26]
 [ 18 104]]


In [68]:
pipeline.predict(['They absolutely loved it'])

array([1])

## Incorporar el Lexicon no nos ayudó demasiado, entonces nos quedamos con...

In [69]:
# Import exactly what this final-model cell uses, so it can be copied and run
# on its own (given X_train / y_train / X_dev / y_dev).
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from util import print_eval

# Final model: binary TF-IDF over unigrams and bigrams with a linear SVM.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        min_df=5,
        max_df=0.95,
        ngram_range=(1, 2)
    )),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.86

             precision    recall  f1-score   support

        neg       0.88      0.86      0.87       146
        pos       0.84      0.86      0.85       122

avg / total       0.86      0.86      0.86       268

[[126  20]
 [ 17 105]]


## Predecimos sobre test y guardamos las predicciones...

In [70]:
import glob
import errno

# Parallel lists: path of each test review and its predicted label (as text).
myFiles = []
myPred = []

path = 'test_reviews_sentoken/*.txt'
# glob returns matches in arbitrary order; sort so the output rows are reproducible.
files = sorted(glob.glob(path))
for name in files:
    try:
        with open(name) as f:
            # Read the whole review as ONE document. Passing the file object
            # itself to predict() would treat every line as a separate document.
            text = f.read()
    except IOError as exc:
        # Skip directories that happen to match the pattern; re-raise anything else.
        if exc.errno != errno.EISDIR:
            raise
        continue
    y_pred = pipeline.predict([text])
    myFiles.append(name)
    myPred.append(str(y_pred[0]))

In [71]:
import os

# Build the submission table: one row per test file with its predicted label.
df = pd.DataFrame(data={"Id": myFiles, "Category": myPred})
# Keep only the file name. basename handles the platform's path separator,
# unlike stripping a hard-coded backslash and directory name.
df['Id'] = df['Id'].map(os.path.basename)
df = df[["Id", "Category"]]
df.head()

Unnamed: 0,Id,Category
0,0.txt,0
1,1.txt,1
2,10.txt,0
3,100.txt,1
4,101.txt,0


In [139]:
# Write the predictions as a comma-separated file without the row index.
df.to_csv("./file.csv", index=False)