# Trabalho Prático Aprendizagem Automática

## Introdução

### Outras Cenas tenho preguiça de escrever, nomeadamente introdução teórica

In [2]:
import re, pickle, time

import numpy as np

from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.datasets import load_files
from sklearn.model_selection import cross_val_score , GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
#Possiveis de serem usadas : numpy, scipy, matplotlib, sklearn, nltk, re e opencv 

In [3]:
with open('imdbCriticas.p', 'rb') as f:
    global D, Docs, y
    D = pickle.load(f)
    Docs = D.data
    y = D.target

Usamos este metodo para préprocessar os dados de texto, e reduzir as palavras tendo em conta os erros de ortografia

O stemmer por defeito é o porter e se o argumento não corresponder a nenhum outro, este é utilizado

In [4]:
def preProcessDoc(Doc, stemmer = 'snowball', decode = False):
    stem = {
        'porter'   : PorterStemmer(),
        'snowball' : SnowballStemmer('english'),
        'lancaster': LancasterStemmer()
    }
    stemFunc = stem.get(stemmer, SnowballStemmer('english'))
    if(decode):
        Doc = Doc.decode('UTF-8')
    Doc = Doc.replace('<br />', ' ')
    Doc = re.sub(r'[^a-zA-Z\u00C0\u00FF]+', ' ', Doc)
    Doc = ' '.join([stemFunc.stem(w) for w in Doc.split()])
    return Doc

def preProcessDocs(Docs, stemmer='snowball', decode = False):
    return [preProcessDoc(doc, stemmer, decode) for doc in Docs]

def text2vector(Docs, preProcess = False, stemmer='snowball', decode=False):
    if(preProcess):
        Docs = preProcessDocs(Docs, stemmer=stemmer, decode=decode)
    X = tfidf.transform(Docs)
    return X

In [5]:
y_boolean = [0 if val<5 else 1 for val in y]

# Comparar os stemmers

In [None]:
X = preProcessDocs(Docs, stemmer='porter')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
tokens = tfidf.get_feature_names()
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tokens))
print(dl.score(vector, y_boolean))

In [None]:
X = preProcessDocs(Docs, stemmer='snowball')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
tokens = tfidf.get_feature_names()
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tokens))
print(dl.score(vector, y_boolean))

In [None]:
X = preProcessDocs(Docs, stemmer='lancaster')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
tokens = tfidf.get_feature_names()
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tokens))
print(dl.score(vector, y_boolean))

In [6]:
print('P' ,26773/0.945525)
print('S' ,26394/0.94535)

P 28315.486105602708
S 27919.818056804357


É possivel ver que o lancaster é o que reduz ao maximo a quantidade de tokens, no entanto vamos optar por utilizar o Snowball pois precisava de uma quantidade de tokens menores para atingir os teoricos 100%

# Classificacao Booleana Uni Gramas

In [7]:
X = preProcessDocs(Docs)
x_train,x_test, y_train, y_test = train_test_split(X,y_boolean,test_size=0.25)

Usar pipeline para fazer tudo o que está abaixo com parametros. Umpa procura exaustiva, com todos os parametros especificados ai, gera $4.3*10^8$ fits possiveis. Pelo que se demorar 1 minuto entre cada fit vamos estar aqui 30000 dias ou 300 anos.

**Descomentar linhas na grelha, leva a melhores estimativas, mas demora muito mais tempo.** 

In [8]:
print(len(np.arange(1,5,1)))
print(len(np.linspace(0.1,5,10)))

4
10


In [25]:
pipeline =Pipeline([
    ('tfidf' , TfidfVectorizer()),
    ('clf' , LogisticRegression(max_iter = 1000))
])

grid_param ={
    #'tfidf__strip_accents' :[None, 'unicode'],
    #'tfidf__stop_words' : [None, 'english'],
    #'tfidf__token_pattern' : [r'\b\w{3,}\b', r'\b[a-zA-Z]{3,}\b'],
    #'tfidf__min_df' : np.arange(1, 5, 1),
    'tfidf__min_df' : [3,4,5],
    #'tfidf__ngram_range' : [(i,j) for i in range(1,5) for j in range(1,5)],
    'tfidf_ngram_range' : [(1,1), (1,2), (1,3)],
    'tfidf__norm' : ['l1', 'l2'],
    
    #'clf__C' : np.linspace(0.1,5,10),
    #'clf_C' :[0.1, 3.3, 10], 
    'clf_C' : [3.3],
    'clf__solver' : ['sag', 'saga'],
    #'clf__tol' : (1e-3, 1e-4, 1e-5)
}

Estamos a usar cross validation com 5 folds, não é preciso dividir em treino e teste pois o algoritmo faz isso sozinho

In [26]:
start = time.time()
grid_search = GridSearchCV(pipeline, grid_param, scoring='accuracy', n_jobs=-1, verbose=2).fit(X, y_boolean)
print('Exec time ',time.time() - start)
with open('dump.p', 'wb') as f:
    pickle.dump({'param' : grid_param , 'out' :grid_search}, f)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter clf_C for estimator Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
print("Best score: %0.6f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Comparar classificacao com stopwords e sem

In [None]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b', stop_words='english').fit(x_train, y_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(X1,y_train)
y2e = dl.predict(X2)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))
print('Confusion Matrix\n', confusion_matrix(y_test, y2e))

In [None]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train, y_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

Com a preservação das stopwords, o classificador obtem melhores resultados, esta diferença será mais visivel com a presença de N-gramas

## Incluir palavras ou palavras e numeros
-- Descricao

In [None]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train, y_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
dl.fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

In [None]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(x_train, y_train)
tokens = tfidf.get_feature_names()
print('Token len' , len(tokens))

X1 = text2vector(x_train)
X2 = text2vector(x_test)

dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
dl.fit(X1,y_train)

print('Train score' , round(dl.score(X1,y_train),6))
print('Test score ' , round(dl.score(X2,y_test), 6))

Podemos ver que o numero de tokens é semelhante, com cerca de 400 tokens de diferença, e que os resultados são semelhantes pelo que incluir numeros tem pouca relevancia neste classificador, no entanto vamos remover os numeros pois os resultados foram semelhantes e reduz a complexidade

## Comparacao com variacao da min_df

A min_df corresponde ao numero de repeticoes que tem de existir entre os documentos para que a palavra seja adicionada como token.

In [None]:
vals = np.arange(1, 10 , 1)
train_scores = []
test_scores = []
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)

for v in vals:
    tfidf = TfidfVectorizer(min_df=v, token_pattern=r'\b\w\w\w+\b').fit(x_train)
    X1 = text2vector(x_train)
    X2 = text2vector(x_test)
    
    dl.fit(X1,y_train)## Classificacao booleana, uni-gramas
    print('-'*100, '\nV', round(v,2))
    train_score =round(dl.score(X1,y_train),6)
    train_scores.append(train_score)
    print('train' ,train_score)
    test_score = round(dl.score(X2,y_test),6)
    test_scores.append(test_score)
    print('test' ,test_score)
    print('diff' , round(train_score-test_score,6))
    print('num tokens', len(tfidf.get_feature_names()))
    print('-'*100)


Ao aumentar o min_df vemos que a frequencia tem pouca relevancia no algoritmo, no entanto valores pequenos levam a uma expansao enorme no numero de tokens considerados, que por sua vez irá gerar um peso de computação maior em pasos adiante. Como equilibrio entre a velocidade de execução e a fiabilidade do algoritmo escolhemos um **valor de 3.** 

## Comparar diversos criterios de regularizacao (C)

In [20]:
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b').fit(x_train)
tokens = tfidf.get_feature_names()
X1 = text2vector(x_train)
X2 = text2vector(x_test)
vals = np.arange(0.1, 5 , 0.2)
train_scores = []
test_scores = []
for c in vals:
    dl = LogisticRegression(max_iter = 1000, C=c, tol = 1e-3)
    dl.fit(X1,y_train)## Classificacao booleana, uni-gramas
    print('-'*100, '\nC', round(c,2))
    train_score = round(dl.score(X1,y_train),6)
    train_scores.append(train_score)
    print('train' ,train_score)
    test_score  = round(dl.score(X2,y_test),6)
    test_scores.append(test_score)
    print('test' ,test_score)
    print('diff' , round(train_score-test_score,6))
    print('-'*100)

diff = np.array(train_scores) - np.array(test_scores)

---------------------------------------------------------------------------------------------------- 
C 0.1
train 0.878367
test 0.8637
diff 0.014667
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.3
train 0.900433
test 0.8784
diff 0.022033
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.5
train 0.9108
test 0.8832
diff 0.0276
----------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------- 
C 0.7
train 0.917633
test 0.8863
diff 0.031333
----------------------------------------------------------------------------------------------------
----

In [21]:
print(np.max(test_scores))
print('Melhor C = ', vals[np.argmax(test_scores)])

0.8918
Melhor C =  3.7000000000000006


Com um C mais pequeno, a diferença entre o train e test é menor mas os resultados são piores.

O C que obteve os melhores resultados com menos erros foi o 3.3

## Utilizar cross validation para melhorar o classificador

In [None]:
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3)
docsVector = text2vector(Docs)
scores = cross_val_score(dl, docsVector, y_boolean, cv=10, scoring='accuracy')
print(scores)

In [None]:
print(scores.mean())
print(scores.max())

In [None]:

start = time.time()

lrcv = LogisticRegressionCV(max_iter = 1000, tol=1e-3, cv=10, penalty='l2', n_jobs=-1).fit(X1, y_train)
y2e = lrcv.predict(X2)

print('Exec time ',time.time() - start)


In [None]:
print(confusion_matrix(y2e, y_test))
print(lrcv.score(X1, y_train))
print(lrcv.score(X2, y_test))

A divisao dos conjuntos usando metodos de cross validating não melhora os resultados para o conjunto de treino, mas adiciona muito peso

## Utilizar o GridSearchCV 

Vamos utilizar o modulo GridSearchCV para determinar quais os melhores parametros para o nosso classificador, tudo o que está em cima pode ser reduzido ao seguinte. 

Usamos a cross validation para ter resultados mais fiaveis

Copiado da documentação


"For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.

For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.

‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.

‘liblinear’ might be slower in LogisticRegressionCV because it does not handle warm-starting."

liblinear - nao usar, 

In [None]:
c_ranges = np.linspace(0.1,10,num=50)
penalties = ['l2']
solvers = ['lbfgs', 'sag']
param_grid = {'solver' : solvers, 'penalty' : penalties}

In [None]:
with open('dump.p' , 'wb') as file:
    X1 = text2vector(x_train)
    X2 = text2vector(x_test)

    start = time.time()
    lrcv = LogisticRegressionCV(max_iter = 1000)
    grid = GridSearchCV(lrcv, param_grid, cv=10, n_jobs=-1).fit(X1, y_train)
    print('Exec time ',time.time() - start)
    pickle.dump({'lrcv' : lrcv , 'grid': grid})

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
penalties = ['l1']
solvers = ['saga']
param_grid = {'solver' : solvers, 'penalty' : penalties}
with open('dump2.p' , 'wb') as file:
    X1 = text2vector(x_train)
    X2 = text2vector(x_test)

    start = time.time()
    lrcv = LogisticRegressionCV(max_iter = 1000)
    grid = GridSearchCV(lrcv, param_grid, cv=10, n_jobs=-1).fit(X1, y_train)
    print('Exec time ',time.time() - start)
    pickle.dump({'lrcv' : lrcv , 'grid': grid})

## Usar clustering

Segundo o SKLearn o K-means é o melhor metodo para os dados disponiveis por isso vamos usar.

### 1º Modelar os dados usando PCA


In [None]:
X = preProcessDocs(Docs)
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b', stop_words='english').fit(X)


In [None]:
pca = PCA().fit(X)


In [None]:
X[1]

In [None]:
Docs[1]

## Classificacao booleana, N-gramas

train_scores = []
test_scores = []

for low in range(10):
    new_line = []
    for high in range(10):        
        tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w\w\w+\b', ngram_range=(low,high)).fit(x_train)
        X1 = text2vector(x_train)
        X2 = text2vector(x_test)

# Bibliografia

### Geral
- Slides Professor

### Pré processamento do texto
- https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae
- https://medium.com/@wenxuan0923/feature-extraction-from-text-using-countvectorizer-tfidfvectorizer-9f74f38f86cc

### Escolher o classificador
- https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Grid Search
- https://scikit-learn.org/stable/modules/grid_search.html
------------------ VER O Proximo para fazer a estimativa do TFIDF
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
- https://www.youtube.com/watch?v=Gol_qOgRqfA

### Pipeline
- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline

### Clustering
- https://medium.com/hanman/data-clustering-what-type-of-movies-are-in-the-imdb-top-250-7ef59372a93b