# Trabalho Prático Aprendizagem Automática


In [1]:
import re, pickle, time, datetime
from tempfile import TemporaryDirectory

from sklearn.svm import LinearSVC,SVC

import numpy as np

from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.datasets import load_files
from sklearn.preprocessing import Normalizer

from sklearn.model_selection import cross_val_score , GridSearchCV , RandomizedSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA , TruncatedSVD
from sklearn.pipeline import Pipeline


from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
#Possiveis de serem usadas : numpy, scipy, matplotlib, sklearn, nltk, re e opencv 

Carregar os ficheiros

In [2]:
with open('imdbCriticas.p', 'rb') as f:
    global D, Docs, y
    D = pickle.load(f)
    Docs = D.data
    y = D.target

Usamos este metodo para préprocessar os dados de texto, e reduzir as palavras tendo em conta os erros de ortografia

O stemmer por defeito é o lancaster, porque teve os melhores resultados e se o argumento não corresponder a nenhum outro, este é utilizado

In [3]:
def preProcessDoc(Doc, stemmer = 'lancaster', decode = False):
    stem = {
        'porter'   : PorterStemmer(),
        'snowball' : SnowballStemmer('english'),
        'lancaster': LancasterStemmer()
    }
    stemFunc = stem.get(stemmer, LancasterStemmer())
    if(decode):
        Doc = Doc.decode('UTF-8')
    Doc = Doc.replace('<br />', ' ')
    Doc = re.sub(r'[^a-zA-Z\u00C0\u00FF]+', ' ', Doc)
    Doc = ' '.join([stemFunc.stem(w) for w in Doc.split()])
    return Doc

def preProcessDocs(Docs, stemmer='lancaster', decode = False):
    return [preProcessDoc(doc, stemmer, decode) for doc in Docs]

def text2vector(Docs, preProcess = False, stemmer='lancaster', decode=False):
    if(preProcess):
        Docs = preProcessDocs(Docs, stemmer=stemmer, decode=decode)
    global tfidf
    print(tfidf)
    if(tfidf is None):
        tfidf = pickle.load(open('tfidf_dump.p','rb'))
        #tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w{3,}\b', ngram_range=(1,2),\
        #                        norm = 'l2').fit(Docs)
    X = tfidf.transform(Docs)
    return X

## Classificação Binária

Converter da escala de 0 a 10, para negativos/positivos

In [4]:
y_boolean = [0 if val<5 else 1 for val in y]

### Steemers

O stemmer reduz uma palavra á sua raiz, ou seja, remove plurais, conjugacao de verbos, etc

#### Porter

In [7]:
X = preProcessDocs(Docs, stemmer='porter')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tfidf.get_feature_names()))
print(dl.score(vector, y_boolean))

Token len 26773
0.9455


#### Snowball

In [8]:
X = preProcessDocs(Docs, stemmer='snowball')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tfidf.get_feature_names()))
print(dl.score(vector, y_boolean))

Token len 26394
0.945375


#### Lancaster

In [9]:
X = preProcessDocs(Docs, stemmer='lancaster')
tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b[a-zA-Z]{3,}\b').fit(X)
vector = text2vector(X)
dl = LogisticRegression(max_iter = 1000, C=3.3, tol = 1e-3).fit(vector,y_boolean)
print('Token len' , len(tfidf.get_feature_names()))
print(dl.score(vector, y_boolean))

Token len 21883
0.9394


In [10]:
print('P' ,26773/0.945525)
print('S' ,26394/0.94535)
print('L' ,21883/0.939425)

P 28315.486105602708
S 27919.818056804357
L 23294.036245575753


É possivel ver que o lancaster é o que reduz ao maximo a quantidade de tokens, alem disso vamos optar por utilizar o Lancaster pois precisava de uma quantidade de tokens menores para atingir os teoricos 100%

## Comparar modelos lineares

In [5]:
X=preProcessDocs(Docs)
with open('tfidf_dump', 'wb') as f:
    tfidf = TfidfVectorizer(min_df=3, token_pattern=r'\b\w{3,}\b', \
                            ngram_range=(1,2), norm = 'l2').fit(X)
    print('done fitting')
    pickle.dump(tfidf, f)
    

done fitting


In [6]:
len(tfidf.get_feature_names())

289621

In [5]:
X = preProcessDocs(Docs)

In [None]:
with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    pipeline =Pipeline([
        ('tfidf' , TfidfVectorizer()),
        ('norm' , Normalizer()),
        ('clf' , LogisticRegressionCV(max_iter = 1000, tol=1e-3))
    ], memory=tmpdir)

    grid_param ={
        'tfidf__strip_accents' :[None, 'unicode'],
        #'tfidf__stop_words' : [None, 'english'],
        'tfidf__token_pattern' : [r'\b\w{3,}\b', r'\b[a-zA-Z]{3,}\b'],
        'tfidf__min_df' : np.arange(3, 5, 1),
        'tfidf__max_df' : [0.25, 0.5, 0.75],
        #'tfidf__min_df' : [3,4,5],
        #'tfidf__min_df' : [3],
        #'tfidf__ngram_range' : [(i,j) for i in range(1,5) for j in range(1,5)],
        'tfidf__ngram_range' : [(1,1), (1,2), (1,3) , (2,2)] ,
        'tfidf__norm' : ['l1', 'l2'],
        'tfidf__max_features' : [None, 15000,20000,25000,30000],

        'norm__norm' : ['l1', 'l2', 'max'],


        #'clf__C' : np.linspace(0.1,10,100),
        'clf__Cs' : [1,3,10,30,100], 
        #'clf__C' : [3.3],
        'clf__solver' : ['sag', 'saga'],
        #'clf__tol' : (1e-3, 1e-4, 1e-5)
    }
    t0 =time.localtime()
    print('Started at'  , time.strftime("%H:%M:%S", t0))
    ## Se tiveres mais memoria que eu(8Gb), aumenta o pre_dispatch para um valor maior
    grid_search = RandomizedSearchCV(pipeline, grid_param, cv = 3, n_jobs=-1, verbose=3,\
                                     pre_dispatch=8,n_iter = 15).fit(X, y_boolean)

    t1 = time.localtime()
    print('Done at' , time.strftime("%H:%M:%S", t1))
with open('dump.p', 'wb') as f:
    pickle.dump({'in' : grid_param ,
                 'out': grid_search}, f)

Started at 15:16:13
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 56.1min finished
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


In [26]:
def pretty(d, indent=0):
    for key, value in d.items():
        print('\t' * indent + str(key))
        if isinstance(value, dict):
            pretty(value, indent+1)
        else:
             print('\t' * (indent+1) + str(value))

pretty(grid_search.cv_results_, 0)
print(grid_search.best_index_)
print(grid_search.best_estimator_)

mean_fit_time
	[793.77378281 203.92118669 135.58186412  67.24587297 217.12415775]
std_fit_time
	[16.46313034  2.55856885  9.56936036  0.57878529  1.26411078]
mean_score_time
	[5.15522528 4.77492126 5.2886459  4.90518769 2.57232587]
std_score_time
	[0.13301035 0.09226935 0.22727117 0.4655577  0.81811887]
param_tfidf__token_pattern
	['\\b\\w{3,}\\b' '\\b\\w{3,}\\b' '\\b\\w{3,}\\b' '\\b\\w{3,}\\b'
 '\\b\\w{3,}\\b']
param_tfidf__strip_accents
	[None None None None None]
param_tfidf__norm
	['l2' 'l2' 'l2' 'l2' 'l1']
param_tfidf__ngram_range
	[(2, 2) (2, 2) (2, 2) (2, 2) (1, 1)]
param_tfidf__min_df
	[4 4 4 4 4]
param_tfidf__max_features
	[None 30000 None 20000 15000]
param_tfidf__max_df
	[0.25 0.25 0.25 0.75 0.25]
param_norm__norm
	['l2' 'l2' 'l1' 'l2' 'max']
param_clf__solver
	['saga' 'saga' 'sag' 'sag' 'sag']
param_clf__Cs
	[100 30 1 3 100]
params
	[{'tfidf__token_pattern': '\\b\\w{3,}\\b', 'tfidf__strip_accents': None, 'tfidf__norm': 'l2', 'tfidf__ngram_range': (2, 2), 'tfidf__min_df': 4,

## Em vez de correr o codigo em cima, podemos fazer import do ficheiro dump

In [None]:
with open('dump.p' ,'rb') as f:
    global grid_param, grid_search
    temp = pickle.load(f)
    grid_param = temp['in']
    grid_search = temp['out']
print(grid_search.best_estimator_)

In [15]:
X = preProcessDocs(Docs)
tfidf = TfidfVectorizer(min_df = 3, token_pattern=r'\b\w{3,}\b', norm = 'l2' , ngram_range=(1,2)).fit(X)

In [17]:
len(tfidf.get_feature_names())

21883

### Classificacao booleana

In [20]:
##tfidf = TfidfVectorizer(min_df = 3, token_pattern=r'\b\w{3,}\b', norm = 'l2' , ngram_range=(1,2)).fit(Docs)
vector = text2vector(X)
dl = LogisticRegressionCV(max_iter = 1000, Cs=3, tol = 1e-3, cv=5).fit(vector,y_boolean)
print(dl.score(vector, y_boolean))
print(dl.scores_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.91805
{1: array([[0.50525 , 0.880375, 0.839375],
       [0.50525 , 0.888375, 0.850625],
       [0.50525 , 0.886625, 0.85    ],
       [0.50525 , 0.888375, 0.84275 ],
       [0.50525 , 0.88625 , 0.854   ]])}


In [31]:
dl = LogisticRegressionCV(max_iter = 1000, Cs=3, tol = 1e-3, cv=5).fit(vector,y_boolean)
print(dl.score(vector, y_boolean))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9181


### Classificacao multi-classe

In [28]:
dl = LogisticRegressionCV(max_iter = 1000, Cs=3, tol = 1e-3, cv=5).fit(vector, y)
print(dl.score(vector, y))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.623375


In [22]:
X1,X2,y1,y2=train_test_split(vector,y,test_size=1/5,shuffle=False)

In [23]:
svm=LinearSVC(dual=False,C=1).fit(X1,y1)

In [24]:
ye=svm.predict(X2)

In [25]:
print("tamanho y: ",y.shape)
print("tamanho ye: ",ye.shape)
print('N erros(teste):',np.sum(y2!=ye))

tamanho y:  (40000,)
tamanho ye:  (8000,)
N erros(teste): 4813


In [26]:
print("Linear SVC Score train: ", svm.score(X1,y1))

Linear SVC Score train:  0.85334375


In [27]:
print("Linear SVC Score test: ", svm.score(X2,y2))

Linear SVC Score test:  0.398375


Estas condições de teste não são as mais adecuadas para o conjunto de dados pois como podemos visualizar no teste acima, os valores de score para o conjunto de train e teste são bastante díspares,sendo o de train bastante bom ao contrário do de test que não é muito bom.

## Decomposição em componentes principais

Temos 4000 documentos, com 24000 tokes, logo para reduzir a complixidade usamos LDA. A LDA remove os componentes com pouco relevo.

In [66]:
vector.shape

(40000, 321899)

In [16]:
pca = TruncatedSVD(n_components=100).fit(vector)

MemoryError: Unable to allocate 270. MiB for an array with shape (321899, 110) and data type float64

In [None]:
top_PCA=[np.round(a, 6) for a in pca.explained_variance_ratio_ if a >0.00001]
sum(top_PCA)

In [None]:
pca.explained_variance_ratio_

# Bibliografia

### Geral
- Slides Professor

### Pré processamento do texto
- https://blog.ekbana.com/pre-processing-text-in-python-ad13ea544dae
- https://medium.com/@wenxuan0923/feature-extraction-from-text-using-countvectorizer-tfidfvectorizer-9f74f38f86cc

### Escolher o classificador
- https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Grid Search
- https://scikit-learn.org/stable/modules/grid_search.html
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
- https://www.youtube.com/watch?v=Gol_qOgRqfA

### Pipeline
- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline

### Clustering
- https://medium.com/hanman/data-clustering-what-type-of-movies-are-in-the-imdb-top-250-7ef59372a93b

### Decomposição PCA/LSA
- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD
- https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf
- https://towardsdatascience.com/latent-semantic-analysis-intuition-math-implementation-a194aff870f8