# Trabalho Prático Aprendizagem Automática


In [1]:
import re, pickle, time

import numpy as np

from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.datasets import load_files

from sklearn.model_selection import cross_val_score , GridSearchCV , RandomizedSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA , TruncatedSVD
from sklearn.pipeline import Pipeline

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
#Possiveis de serem usadas : numpy, scipy, matplotlib, sklearn, nltk, re e opencv 

Carregar os ficheiros

In [2]:
# Load the pickled review dataset.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# open 'imdbCriticas.p' when it comes from a trusted source.
with open('imdbCriticas.p', 'rb') as f:
    D = pickle.load(f)

# Names assigned in a `with` body at module level are already module globals,
# so no `global` declaration is needed.
Docs = D.data    # the documents handed to the text pre-processing below
y = D.target     # the label of each document

Usamos este método para pré-processar os dados de texto e reduzir as palavras, tendo em conta os erros de ortografia.

O stemmer por defeito é o Lancaster, porque teve os melhores resultados; se o argumento não corresponder a nenhum dos outros stemmers, é também este o utilizado.

In [3]:
def preProcessDoc(Doc, stemmer = 'lancaster', decode = False):
    """Clean one document and reduce each of its words to a stem.

    Parameters
    ----------
    Doc : str or bytes
        The raw text. Pass ``decode=True`` when it is UTF-8 encoded bytes.
    stemmer : str
        'porter', 'snowball' or 'lancaster'. Any other value falls back to
        the Lancaster stemmer.
    decode : bool
        When true, ``Doc`` is decoded from UTF-8 before any other step.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Build only the stemmer that was requested instead of all of them.
    if stemmer == 'porter':
        stemFunc = PorterStemmer()
    elif stemmer == 'snowball':
        stemFunc = SnowballStemmer('english')
    else:
        stemFunc = LancasterStemmer()

    if decode:
        Doc = Doc.decode('UTF-8')

    # HTML line breaks become plain spaces.
    Doc = Doc.replace('<br />', ' ')
    # Collapse every run of characters outside a-z, A-Z and U+00C0..U+00FF
    # into one space. The '-' between the two escapes is what makes it a
    # range; without it only the two endpoint characters would be kept.
    Doc = re.sub(r'[^a-zA-Z\u00C0-\u00FF]+', ' ', Doc)
    return ' '.join(stemFunc.stem(w) for w in Doc.split())

def preProcessDocs(Docs, stemmer='lancaster', decode = False):
    """Apply ``preProcessDoc`` to every document and return the results as a list.

    ``stemmer`` and ``decode`` are forwarded unchanged for each document.
    """
    processed = []
    for raw_doc in Docs:
        processed.append(preProcessDoc(raw_doc, stemmer, decode))
    return processed

def text2vector(Docs, preProcess = False, stemmer='lancaster', decode=False):
    """Turn documents into TF-IDF vectors using the module-level ``tfidf``.

    Parameters
    ----------
    Docs : iterable of str (or bytes when ``decode`` is true)
        The documents to vectorize.
    preProcess : bool
        When true, the documents first go through ``preProcessDocs`` with
        the given ``stemmer`` and ``decode``.
    stemmer, decode
        Forwarded to ``preProcessDocs``; ignored when ``preProcess`` is false.

    Returns
    -------
    The result of ``tfidf.transform`` on the (optionally pre-processed) documents.

    Side effects
    ------------
    If the global ``tfidf`` is missing or ``None``, a new vectorizer is
    fitted on these documents and stored in that global.
    """
    global tfidf
    if preProcess:
        Docs = preProcessDocs(Docs, stemmer=stemmer, decode=decode)

    # globals().get() also covers the case where `tfidf` was never assigned,
    # which a plain `tfidf is None` would turn into a NameError.
    if globals().get('tfidf') is None:
        # Fit on exactly the text that is transformed below, so vocabulary
        # and documents went through the same pre-processing.
        tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w{3,}\b', ngram_range=(1,2), norm = 'l2').fit(Docs)
    return tfidf.transform(Docs)

## Classificação Binária

Converter da escala de 0 a 10, para negativos/positivos

In [4]:
# Collapse the numeric rating into a binary label:
# below the threshold -> 0 (negative), otherwise -> 1 (positive).
NEGATIVE_BELOW = 5
y_boolean = [0 if rating < NEGATIVE_BELOW else 1 for rating in y]

### Stemmers

O stemmer reduz uma palavra à sua raiz, ou seja, remove plurais, conjugações de verbos, etc.

#### Porter

In [None]:
# Stem every document with the Porter stemmer.
X = preProcessDocs(Docs, stemmer='porter')
# text2vector() reuses the module-level `tfidf`, so fit it on this text first.
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
# vocabulary_ holds one entry per feature, so its length is the token count.
# It replaces get_feature_names(), which was removed in scikit-learn 1.2.
print('Token len' , len(tfidf.vocabulary_))
# Accuracy measured on the same data the model was trained on.
print(dl.score(vector, y_boolean))

#### Snowball

In [None]:
# Stem every document with the Snowball (English) stemmer.
X = preProcessDocs(Docs, stemmer='snowball')
# text2vector() reuses the module-level `tfidf`, so fit it on this text first.
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
# vocabulary_ holds one entry per feature, so its length is the token count.
# It replaces get_feature_names(), which was removed in scikit-learn 1.2.
print('Token len' , len(tfidf.vocabulary_))
# Accuracy measured on the same data the model was trained on.
print(dl.score(vector, y_boolean))

#### Lancaster

In [None]:
# Stem every document with the Lancaster stemmer.
X = preProcessDocs(Docs, stemmer='lancaster')
# text2vector() reuses the module-level `tfidf`, so fit it on this text first.
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
# vocabulary_ holds one entry per feature, so its length is the token count.
# It replaces get_feature_names(), which was removed in scikit-learn 1.2.
print('Token len' , len(tfidf.vocabulary_))
# Accuracy measured on the same data the model was trained on.
print(dl.score(vector, y_boolean))

In [6]:
# For each stemmer print a token count divided by an accuracy, i.e. the
# number of tokens that would correspond to a score of 1.0.
# NOTE(review): the figures are hard-coded, presumably copied from the
# 'Token len' and score printed by the three stemmer cells -- confirm.
for label, n_tokens, accuracy in (
    ('P', 26773, 0.945525),
    ('S', 26394, 0.94535),
    ('L', 21883, 0.939425),
):
    print(label, n_tokens / accuracy)

P 28315.486105602708
S 27919.818056804357
L 23294.036245575753


É possível ver que o Lancaster é o que mais reduz a quantidade de tokens. Além disso, vamos optar por utilizar o Lancaster, pois precisaria de uma quantidade menor de tokens para atingir os teóricos 100%.

## Comparar modelos lineares

In [5]:
# Pre-process every document once with the default (Lancaster) stemmer;
# the cells below reuse X.
X = preProcessDocs(Docs, stemmer='lancaster')

In [33]:
# TF-IDF features feeding a logistic regression that cross-validates its own C.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=3, token_pattern=r'\b\w{3,}\b', norm='l2', ngram_range=(1, 2))),
    ('clf', LogisticRegressionCV(max_iter=1000, solver='saga', tol=1e-5)),
])

# Search space sampled by RandomizedSearchCV; keys are '<step>__<parameter>'.
grid_param = {
    'tfidf__min_df': np.arange(3, 5, 1),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [None, 15000, 20000, 25000, 30000],
    # Every entry of this list is ONE candidate value for `Cs`. A bare int
    # would be read by LogisticRegressionCV as "number of C values to try",
    # so the actual C grid has to be passed as a single nested list.
    'clf__Cs': [[1, 3, 10, 30, 100]],
    'clf__solver': ['sag', 'saga'],
}

t0 = time.localtime()
print('Started at', time.strftime("%H:%M:%S", t0))
# pre_dispatch limits how many jobs are queued at once, which bounds memory
# use; raise it if the machine has more than ~8 GB of RAM.
grid_search = RandomizedSearchCV(
    pipeline,
    grid_param,
    cv=3,
    n_jobs=-1,
    verbose=3,
    pre_dispatch=8,
    n_iter=5,
).fit(X, y_boolean)

print('Done at', time.strftime("%H:%M:%S", time.localtime()))

# Persist the search space and the fitted search so later cells can reload
# them instead of repeating the search.
with open('dump.p', 'wb') as f:
    pickle.dump({'in': grid_param,
                 'out': grid_search}, f)

Started at 18:23:39
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 394 epochs took 176 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min finished


Done at 18:33:44


TypeError: dump() missing required argument 'file' (pos 2)

In [43]:
def pretty(d, indent=0):
    """Print a (possibly nested) dict, one tab per nesting level.

    Each key is printed on its own line, prefixed by ``indent`` tabs. A dict
    value is printed recursively one level deeper; any other value is
    printed through ``str`` on the next line with one extra tab.
    """
    key_prefix = '\t' * indent
    for name, entry in d.items():
        print(key_prefix + str(name))
        if not isinstance(entry, dict):
            print('\t' * (indent + 1) + str(entry))
            continue
        pretty(entry, indent + 1)

# Dump the per-candidate results, then the index of the best candidate and
# the pipeline refitted with it.
pretty(grid_search.cv_results_)
for attr in ('best_index_', 'best_estimator_'):
    print(getattr(grid_search, attr))

mean_fit_time
	[222.41053796  94.89635054  20.95279503  72.15967647  37.13484303]
std_fit_time
	[2.86961028 5.85621739 0.24833061 1.13697287 0.55088446]
mean_score_time
	[11.06104453 15.79741041  4.60351531 10.37696131  8.1480062 ]
std_score_time
	[0.16542562 1.49282379 0.08943857 0.17763033 0.86291273]
param_tfidf__ngram_range
	[(1, 2) (1, 3) (1, 1) (1, 2) (1, 2)]
param_tfidf__min_df
	[3 4 4 4 4]
param_tfidf__max_features
	[None 20000 20000 20000 30000]
param_clf__solver
	['saga' 'sag' 'saga' 'sag' 'sag']
param_clf__C
	[100 3.3 10 100 10]
params
	[{'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': None, 'clf__solver': 'saga', 'clf__C': 100}, {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 4, 'tfidf__max_features': 20000, 'clf__solver': 'sag', 'clf__C': 3.3}, {'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 4, 'tfidf__max_features': 20000, 'clf__solver': 'saga', 'clf__C': 10}, {'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 4, 'tfidf__max_features': 20000, 'clf__s

## Em vez de correr o código em cima, podemos importar o ficheiro dump

In [8]:
# Reload the search space and the fitted search saved by the search cell.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# open a 'dump.p' that this notebook produced itself.
with open('dump.p', 'rb') as f:
    temp = pickle.load(f)

# Module-level assignments are already global; no `global` statement needed.
grid_param = temp['in']
grid_search = temp['out']

In [9]:
print(grid_search.best_estimator_)

# Fit the shared vectorizer on the pre-processed text X, because that is the
# text text2vector() transforms later. Fitting on the raw Docs would build
# the vocabulary from unstemmed words that never occur in X.
tfidf = TfidfVectorizer(min_df = 3, token_pattern=r'\b\w{3,}\b', norm = 'l2' , ngram_range=(1,2)).fit(X)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(min_df=3, ngram_range=(1, 2),
                                 token_pattern='\\b\\w{3,}\\b')),
                ('clf',
                 LogisticRegression(C=100, max_iter=1000, solver='saga',
                                    tol=1e-05, verbose=2))])


In [12]:
# Fit the vectorizer on the same pre-processed text (X) that is vectorized on
# the next line; fitting on the raw Docs would give a vocabulary of unstemmed
# words that does not match the stemmed text being transformed.
tfidf = TfidfVectorizer(min_df = 3, token_pattern=r'\b\w{3,}\b', norm = 'l2' , ngram_range=(1,2)).fit(X)
vector = text2vector(X)
# Cs=3 is an int, so LogisticRegressionCV tries 3 values of C chosen on a
# log scale and picks one with 5-fold cross-validation.
dl = LogisticRegressionCV(max_iter = 1000, Cs=3, tol = 1e-3, cv=5).fit(vector,y_boolean)
# Accuracy on the training data itself.
print(dl.score(vector, y_boolean))
# Per-fold cross-validation scores recorded during the search over C.
print(dl.scores_)

TypeError: object of type 'float' has no len()

## Decomposição em componentes principais

Temos 4000 documentos, com 24000 tokens; logo, para reduzir a complexidade, usamos LSA (TruncatedSVD). A LSA remove os componentes com pouco relevo.

In [None]:
# X is already pre-processed, so it is vectorized as-is with the fitted tfidf.
vector = text2vector(X, preProcess=False)

In [None]:
# Despite the variable name this is a truncated SVD, not sklearn's PCA class.
# It keeps 100 components of the TF-IDF matrix.
pca = TruncatedSVD(n_components=100)
pca.fit(vector)

In [None]:
# Keep the components whose explained-variance ratio exceeds 1e-5, rounded to
# six decimals, then show how much variance they explain together.
top_PCA = list(
    np.round(
        pca.explained_variance_ratio_[pca.explained_variance_ratio_ > 0.00001],
        6,
    )
)
sum(top_PCA)

In [None]:
# Bare expression on purpose: as the last statement of a notebook cell it
# displays the explained-variance ratio of every fitted component.
pca.explained_variance_ratio_

# Bibliografia

### Geral
- Slides Professor

### Pré processamento do texto
- https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae
- https://medium.com/@wenxuan0923/feature-extraction-from-text-using-countvectorizer-tfidfvectorizer-9f74f38f86cc

### Escolher o classificador
- https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Grid Search
- https://scikit-learn.org/stable/modules/grid_search.html
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
- https://www.youtube.com/watch?v=Gol_qOgRqfA

### Pipeline
- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline

### Clustering
- https://medium.com/hanman/data-clustering-what-type-of-movies-are-in-the-imdb-top-250-7ef59372a93b

### Decomposição PCA/LSA
- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD
- https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf
- https://towardsdatascience.com/latent-semantic-analysis-intuition-math-implementation-a194aff870f8