# Trabalho Prático Aprendizagem Automática

## Introdução

### Outras Cenas tenho preguiça de escrever, nomeadamente introdução teórica

In [1]:
import re, pickle

import numpy as np

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_files
from sklearn.model_selection import cross_val_score , GridSearchCV
from sklearn.metrics import confusion_matrix

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
#Possiveis de serem usadas : numpy, scipy, matplotlib, sklearn, nltk, re e opencv 

In [2]:
with open('imdbCriticas.p', 'rb') as f:
    global D, Docs, y
    D = pickle.load(f)
    Docs = D.data
    y = D.target

Usamos este metodo para préprocessar os dados de texto, e reduzir as palavras tendo em conta os erros de ortografia

O stemmer por defeito é o porter e se o argumento não corresponder a nenhum outro, este é utilizado

In [3]:
def preProcessDoc(Doc, stemmer = 'porter', decode = False):
    stem = {
        'porter'   : PorterStemmer(),
        'snowball' : SnowballStemmer('english'),
        'lancaster': LancasterStemmer()
    }
    stemFunc = stem.get(stemmer, PorterStemmer())
    if(decode):
        Doc = Doc.decode('UTF-8')
    Doc = Doc.replace('<br />', ' ')
    Doc = re.sub(r'[^a-zA-Z\u00C0\u00FF]+', ' ', Doc)
    return Doc

def preProcessDocs(Docs, stemmer='porter', decode = False):
    return [preProcessDoc(doc, stemmer, decode) for doc in Docs]

def text2vector(Docs):
    X = tfidf.transform(Docs)
    return X

In [4]:
# [f(x) if condition else g(x) for x in sequence]
y_boolean = [0 if val<5 else 1 for val in y]

print(y[:10])
print(y_boolean[:10])

x_train,x_test, y_train, y_test = train_test_split(Docs,y_boolean,test_size=0.25)

[10  1 10  1  2  8  8  1 10  2]
[1, 0, 1, 0, 0, 1, 1, 0, 1, 0]


# Classificacao Booleana Uni Gramas

## Comparar classificacao com stopwords e sem

In [5]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b', stop_words='english').fit(x_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(X1,y_train)
y2e = dl.predict(X2)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))
print('Confusion Matrix\n', confusion_matrix(y_test, y2e))

Token len 35751
Train score 0.961833
Test score  0.8949
Confusion Matrix
 [[4474  570]
 [ 481 4475]]


In [6]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

Token len 36027
Train score 0.9591
Test score  0.9005


Com a preservação das stopwords, o classificador obtem melhores resultados, esta diferença será mais visivel com a presença de N-gramas

## Incluir palavras ou palavras e numeros
-- Descricao

In [7]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
dl.fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

Token len 36027
Train score 0.9591
Test score  0.9005


In [8]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(x_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
dl.fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

Token len 35629
Train score 0.959167
Test score  0.9004


Podemos ver que o numero de tokens é semelhante, com cerca de 400 tokens de diferença, e que os resultados são semelhantes pelo que incluir numeros tem pouca relevancia neste classificador

## Comparacao com variacao da min_df

A min_df corresponde ao numero de repeticoes que tem de existir entre os documentos para que a palavra seja adicionada como token.

In [9]:
vals = np.arange(1, 10 , 1)
train_scores = []
test_scores = []
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)

for v in vals:
    tfidf = TfidfVectorizer(min_df=v, token_pattern=r'\b\w\w\w+\b').fit(x_train)
    X1 = text2vector(x_train)
    X2 = text2vector(x_test)
    
    dl.fit(X1,y_train)## Classificacao booleana, uni-gramas
    print('-'*100, '\nV', round(v,2))
    train_score =round(dl.score(X1,y_train),6)
    train_scores.append(train_score)
    print('train' ,train_score)
    test_score = round(dl.score(X2,y_test),6)
    test_scores.append(test_score)
    print('test' ,test_score)
    print('diff' , round(train_score-test_score,6))
    print('num tokens', len(tfidf.get_feature_names()))
    print('-'*100)


---------------------------------------------------------------------------------------------------- 
V 1
train 0.962867
test 0.9014
diff 0.061467
num tokens 77704
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
V 2
train 0.960633
test 0.9019
diff 0.058733
num tokens 45514
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
V 3
train 0.9591
test 0.9005
diff 0.0586
num tokens 36027
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
V 4
train 0.9588
test 0.9008
diff 0.058
num tokens 30833
--------------------------------------------------

Ao aumentar o min_df vemos que a frequencia tem pouca relevancia no algoritmo, no entanto valores pequenos levam a uma expansao enorme no numero de tokens considerados, que por sua vez irá gerar um peso de computação maior em pasos adiante. Como equilibrio entre a velocidade de execução e a fiabilidade do algoritmo escolhemos um **valor de 3.** 

## Comparar diversos criterios de regularizacao (C)

In [10]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train)
tokens = tfidf.get_feature_names()
X1 = text2vector(x_train)
X2 = text2vector(x_test)
vals = np.arange(0.1, 5 , 0.2)
train_scores = []
test_scores = []
for c in vals:
    dl = LogisticRegression(max_iter = 1000, C=c, tol = 1e-3)
    dl.fit(X1,y_train)## Classificacao booleana, uni-gramas
    print('-'*100, '\nC', round(c,2))
    train_score = round(dl.score(X1,y_train),6)
    train_scores.append(train_score)
    print('train' ,train_score)
    test_score  = round(dl.score(X2,y_test),6)
    test_scores.append(test_score)
    print('test' ,test_score)
    print('diff' , round(train_score-test_score,6))
    print('-'*100)

diff = np.array(train_scores) - np.array(test_scores)

---------------------------------------------------------------------------------------------------- 
C 0.1
train 0.8778
test 0.8666
diff 0.0112
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.3
train 0.903733
test 0.8824
diff 0.021333
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.5
train 0.915133
test 0.8876
diff 0.027533
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.7
train 0.9229
test 0.8918
diff 0.0311
----------------------------------------------------------------------------------------------------
--------

In [11]:
print(np.max(test_scores))
print('Melhor C = ', np.argmax(test_scores))

0.901
Melhor C =  20


Com um C mais pequeno, a diferença entre o train e test é menor mas os resultados são piores.

O C que obteve os melhores resultados com menos erros foi o 3.3

## Utilizar cross validation para melhorar o classificador

In [12]:
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
docsVector = text2vector(Docs)
scores = cross_val_score(dl, docsVector, y_boolean, cv=10, scoring='accuracy')
print(scores)

[0.89425 0.894   0.90675 0.901   0.8945  0.908   0.90775 0.902   0.897
 0.904  ]


In [13]:
print(scores.mean())
print(scores.max())

0.900925
0.908


In [14]:
from sklearn.linear_model import LogisticRegressionCV
import time

start = time.time()

lrcv = LogisticRegressionCV(max_iter = 1000, tol=1e-3, cv=10, penalty='l2', n_jobs=-1).fit(X1, y_train)
y2e = lrcv.predict(X2)

print('Exec time ',time.time() - start)


Exec time  112.16232132911682


In [15]:
print(confusion_matrix(y2e, y_test))
print(lrcv.score(X1, y_train))
print(lrcv.score(X2, y_test))

[[4502  453]
 [ 542 4503]]
0.9557333333333333
0.9005


In [16]:
help(lrcv)

Help on LogisticRegressionCV in module sklearn.linear_model._logistic object:

class LogisticRegressionCV(LogisticRegression, sklearn.base.BaseEstimator, sklearn.linear_model._base.LinearClassifierMixin)
 |  LogisticRegressionCV(*, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=None, l1_ratios=None)
 |  
 |  Logistic Regression CV (aka logit, MaxEnt) classifier.
 |  
 |  See glossary entry for :term:`cross-validation estimator`.
 |  
 |  This class implements logistic regression using liblinear, newton-cg, sag
 |  of lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2
 |  regularization with primal formulation. The liblinear solver supports both
 |  L1 and L2 regularization, with a dual formulation only for the L2 penalty.
 |  Elastic-Net penalty is only supported by the saga solver.
 |  





In [17]:
print('hello world')

hello world


## Utilizar o GridSearchCV 

Vamos utilizar o modulo GridSearchCV para determinar quais os melhores parametros para o nosso classificador, tudo o que está em cima pode ser reduzido ao seguinte

## Classificacao booleana, N-gramas

train_scores = []
test_scores = []

for low in range(10):
    new_line = []
    for high in range(10):        
        tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b', ngram_range=(low,high)).fit(x_train)
        X1 = text2vector(x_train)
        X2 = text2vector(x_test)

# Bibliografia

### Geral
- Slides Professor

### Pré processamento do texto
- https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae
- https://medium.com/@wenxuan0923/feature-extraction-from-text-using-countvectorizer-tfidfvectorizer-9f74f38f86cc

### Grid Search
- https://scikit-learn.org/stable/modules/grid_search.html
- https://www.youtube.com/watch?v=Gol_qOgRqfA