In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import os
import string
import nltk
import re
from nltk.stem.porter import PorterStemmer
import pandas as pd
import math
import scipy as sc
import numpy as np

In [2]:
files = [open("cbr-ilp-ir/"+f, 'r', encoding='ISO-8859-14').read() for f in sorted(os.listdir('cbr-ilp-ir/'))]
data = pd.DataFrame(columns=['Text', 'Class'])
data['Text'] = files
data['Class'] = [s.split("-")[0] for s in sorted(os.listdir('cbr-ilp-ir/'))]

In [3]:
def split(data, test_p):
    train = data.copy()
    test = train.sample(frac=test_p, replace=False)
    train = train.drop(test.index)
    return train, test

train, test = split(data, 0.2)

In [4]:
def token_stem(text):
    # Criando tokens
    tokens = [w for w in nltk.wordpunct_tokenize(text)]
    filtered_tokens = []
    # Lematizador utilizado
    stemmer = PorterStemmer()
    # Deixando somente termos que contém letras
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token) > 1:
            filtered_tokens.append(token)
    # Lematização dos tokens selecionados
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return set(stems)

In [6]:
v = TfidfVectorizer(lowercase=True,min_df=0.2,max_df=0.9, tokenizer=token_stem)
v.encoding = 'ISO-8859-14'
m = v.fit_transform(train['Text'].values)
train_df = pd.DataFrame(data=m.toarray(), columns=v.get_feature_names())
train_df = pd.concat([train_df, train['Class'].reset_index(drop=True)], axis=1)

In [None]:
v.get_feature_names()

In [7]:
test_df = pd.DataFrame(columns=v.get_feature_names(), data=v.transform(test['Text'].values).toarray())

In [8]:
# Métricas de avaliação

def confusion_matrix(y_true, y_pred):
    cm = pd.DataFrame(index=np.unique(y_true), columns=np.unique(y_pred))
    cm = cm.replace(np.nan, 0)
    for t,p in zip(y_true, y_pred):
        cm.loc[t,p] += 1
    return cm

def Accuracy(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return np.trace(cm.values)/len(y_true)

In [13]:
class NaiveBayesCont:
    def train(self, data):
        '''
        Função: Geração das probabilidades utilizadas no classificador naive bayes
        Parâmetros:
                    - data: Pandas.DataFrame - dados utilizados no treinamento. A última coluna deve conter as classes.
        '''
        # Classes do problema
        self.classes = {c: data[data.iloc[:,-1] == c].shape[0]/data.shape[0] for c in data.iloc[:,-1].unique()}
        
        # Criação dos dataframes de média e variância
        self.means = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        self.vars = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        
        # Cálculo das médias e variâncias
        for c in list(self.classes.keys()):
            self.means.loc[c, :] = data[data.iloc[:,-1] == c].mean()
            self.vars.loc[c, :] = data[data.iloc[:,-1] == c].var()
            
        for c in self.vars.columns:
            if self.vars[c].min() == 0:
                self.vars.drop(c, axis=1, inplace=True)
                self.means.drop(c, axis=1, inplace=True)
           
    def predict(self, x):
        '''
        Função: Predição da classe do objeto x
        Parâmetros:
                    - x: Objeto que queremos predizer a classe;
        '''
        probs = {}
        for s in list(self.classes.keys()):
            p = math.log(self.classes[s])
            for c in self.means.columns:
                print(x[c], self.means.loc[s,c], math.sqrt(self.vars.loc[s,c]))
                print(sc.stats.norm.pdf((x[c]-self.means.loc[s, c])/math.sqrt(self.vars.loc[s,c])))
                p += math.log(sc.stats.norm.pdf((x[c]-self.means.loc[s, c])/math.sqrt(self.vars.loc[s,c])))
            probs[s] = p
            input()
            
        m = max(list(probs.values()))
        c = [cl for cl in list(probs.keys()) if probs[cl] == m]
        print(c[0])
            
        return probs, c[0]

In [14]:
n = NaiveBayesCont()
n.train(train_df)

In [15]:
r = []
for t in test_df.index:
    r.append(n.predict(test_df.iloc[t])[1])
r = np.array(r)

0.0 0.060366704393737224 0.06013749985277253
0.24104849594819455
0.0 0.017980887711272073 0.04982690204306867
0.37379376985624435
0.0 0.03622195497450426 0.05803874820907412
0.32834557613513826
0.0 0.03156248886314123 0.05186338857807764
0.33150343846555325
0.0 0.043957971626213795 0.06750624710587103
0.32272683363684235
0.0 0.06872762858142999 0.0636103365981146
0.22254589618732903
0.0 0.03647827536746144 0.057781189932434175
0.326861281038645
0.14212815203044177 0.05455996168801273 0.06686742517826597
0.16924000955316512
0.0 0.04679490828125669 0.06494156270820209
0.3077250088301194
0.0 0.04113278181433485 0.04945927713020319
0.2823073361234397
0.1457018980337202 0.02860280877248394 0.05638605709633817
0.04617301709217089
0.07276885731829309 0.05989615225713725 0.0265921142391538
0.3548340177539033
0.0 0.046829790632595895 0.054735719586762974
0.2766695271532613
0.0 0.027891184939454534 0.055126998470166236
0.35101430613533724
0.0 0.029198069058445377 0.06006950672269363
0.3544913851

 


0.0 0.035906342373296776 0.05762682290199053
0.32855360541326906
0.0 0.07525557164137762 0.08413720157627878
0.2674162480176769
0.0 0.045108121054301814 0.06604955263116973
0.3159584592117104
0.0 0.024261470195404294 0.048016015660993935
0.35113243538555744
0.0 0.008729918752194166 0.035383571765178506
0.38698300656445445
0.0 0.00864685630783273 0.032206383728446
0.38481983752649207
0.0 0.040535870420112606 0.06209072803071591
0.3223735060039934
0.14212815203044177 0.022291405553595753 0.052579607415511646
0.029712185123422455
0.0 0.019347822621987594 0.04901350896131759
0.3690400072348372
0.0 0.07845896278216251 0.04860827155044882
0.10843443048913856
0.1457018980337202 0.04253704736021882 0.06671122842956188
0.12067244789474309
0.07276885731829309 0.062264637014272664 0.03132295467530483
0.37712858831362106
0.0 0.03350106047341455 0.05282677132152722
0.3262722217336695
0.0 0.010768940162041482 0.037471768239527185
0.382803111057788
0.0 0.04398359976794097 0.07096011053792585
0.329217

KeyboardInterrupt: 

In [12]:
r

array(['RI', 'ILP', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'RI',
       'ILP', 'ILP', 'RI', 'ILP', 'CBR', 'RI', 'ILP', 'RI', 'ILP', 'RI',
       'ILP', 'RI', 'RI', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'CBR', 'RI',
       'ILP', 'RI', 'ILP', 'ILP', 'RI', 'ILP', 'CBR', 'RI', 'CBR', 'CBR',
       'CBR', 'RI', 'ILP', 'RI', 'RI', 'CBR', 'CBR', 'CBR', 'CBR', 'CBR',
       'RI', 'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'RI', 'CBR',
       'RI', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'RI', 'CBR', 'RI',
       'RI', 'CBR', 'RI', 'RI', 'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'CBR',
       'RI', 'CBR', 'RI', 'ILP', 'RI', 'RI', 'ILP', 'CBR', 'CBR', 'CBR',
       'CBR', 'RI', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'CBR',
       'ILP', 'CBR', 'CBR', 'ILP', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP',
       'CBR', 'ILP', 'ILP', 'CBR', 'RI', 'RI'], dtype='<U3')

In [13]:
test['Class'].values

array(['RI', 'ILP', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'RI',
       'ILP', 'ILP', 'RI', 'ILP', 'CBR', 'RI', 'ILP', 'RI', 'ILP', 'RI',
       'ILP', 'RI', 'RI', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'CBR', 'RI',
       'ILP', 'RI', 'ILP', 'ILP', 'RI', 'ILP', 'CBR', 'RI', 'CBR', 'CBR',
       'CBR', 'RI', 'ILP', 'ILP', 'RI', 'CBR', 'CBR', 'CBR', 'CBR', 'CBR',
       'RI', 'CBR', 'ILP', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'RI', 'CBR',
       'RI', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'RI', 'CBR', 'RI',
       'RI', 'CBR', 'RI', 'RI', 'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'CBR',
       'RI', 'CBR', 'RI', 'ILP', 'RI', 'RI', 'ILP', 'CBR', 'CBR', 'CBR',
       'CBR', 'RI', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'CBR',
       'ILP', 'CBR', 'CBR', 'RI', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP',
       'CBR', 'ILP', 'ILP', 'CBR', 'RI', 'RI'], dtype=object)

In [14]:
confusion_matrix(y_pred=r, y_true=test['Class'].values)

Unnamed: 0,CBR,ILP,RI
CBR,51,0,0
ILP,0,24,2
RI,0,1,37


In [15]:
Accuracy(y_pred=r, y_true=test['Class'].values)

0.9739130434782609

In [18]:
from sklearn.naive_bayes import GaussianNB

g = GaussianNB()

In [19]:
g.fit(train_df.iloc[:, :-1].values, train_df.iloc[:,-1].values)

GaussianNB(priors=None)

In [20]:
g.predict(test_df.values)

array(['RI', 'RI', 'CBR', 'CBR', 'ILP', 'ILP', 'RI', 'CBR', 'ILP', 'RI',
       'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'RI', 'RI', 'CBR', 'CBR',
       'RI', 'ILP', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'CBR', 'ILP',
       'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'ILP', 'RI',
       'CBR', 'CBR', 'RI', 'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'RI', 'CBR',
       'RI', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'ILP',
       'ILP', 'CBR', 'CBR', 'ILP', 'ILP', 'ILP', 'ILP', 'RI', 'CBR',
       'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'ILP',
       'CBR', 'ILP', 'RI', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI', 'RI',
       'ILP', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'RI'], dtype='<U3')

In [21]:
confusion_matrix(y_pred=g.predict(test_df.values), y_true=test['Class'].values)

Unnamed: 0,CBR,ILP,RI
CBR,48,1,2
ILP,1,29,0
RI,1,1,32


In [22]:
Accuracy(y_pred=g.predict(test_df.values), y_true=test['Class'].values)

0.9478260869565217