In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import os
import string
import nltk
import re
from nltk.stem.porter import PorterStemmer
import pandas as pd
import math
import scipy as sc
import numpy as np

In [3]:
files = [open("cbr-ilp-ir/"+f, 'r', encoding='ISO-8859-14').read() for f in sorted(os.listdir('cbr-ilp-ir/'))]
data = pd.DataFrame(columns=['Text', 'Class'])
data['Text'] = files
data['Class'] = [s.split("-")[0] for s in sorted(os.listdir('cbr-ilp-ir/'))]
data

Unnamed: 0,Text,Class
0,KBS Maintenance as Learning Two-Tiered\nDomain...,CBR
1,Integration of Case Based Retrieval with a Rel...,CBR
2,A Case-Based Approach for Developing Writing T...,CBR
3,Reasoning with Reasons in\nCase-Based Comparis...,CBR
4,INRECA: A Seamlessly Integrated System Based o...,CBR
5,DOM-ArC: An Active Decision Support\nSystem fo...,CBR
6,"Towards the Integration of Case-Based,\nSchema...",CBR
7,A Case-Based Reasoner Adaptive to Different\nC...,CBR
8,Cost Estimation of Software Projects through\n...,CBR
9,Separating the Cases from the Data: Towards\nM...,CBR


In [4]:
def split(data, test_p):
    train = data.copy()
    test = train.sample(frac=test_p, replace=False)
    train = train.drop(test.index)
    return train, test

train, test = split(data, 0.2)

In [5]:
train

Unnamed: 0,Text,Class
0,KBS Maintenance as Learning Two-Tiered\nDomain...,CBR
1,Integration of Case Based Retrieval with a Rel...,CBR
3,Reasoning with Reasons in\nCase-Based Comparis...,CBR
4,INRECA: A Seamlessly Integrated System Based o...,CBR
5,DOM-ArC: An Active Decision Support\nSystem fo...,CBR
6,"Towards the Integration of Case-Based,\nSchema...",CBR
7,A Case-Based Reasoner Adaptive to Different\nC...,CBR
8,Cost Estimation of Software Projects through\n...,CBR
9,Separating the Cases from the Data: Towards\nM...,CBR
11,Case-based Diagnosis of Multiple Faults\n\nRal...,CBR


In [6]:
test

Unnamed: 0,Text,Class
456,On the Power of Nonlinear Secret­Sharing\n\nAm...,RI
408,Specification and Simulation of Statistical Qu...,RI
105,Merge Strategies for Multiple Case Plan Replay...,CBR
183,Evaluating a Multi-modal Reasoning System in\n...,CBR
303,Which Hypotheses Can Be Found with\nInverse En...,ILP
393,New Conditions for the Existence of Least\nGen...,ILP
150,Surfing the Digital Wave\n\nGenerating Persona...,CBR
263,A First Study on Case-Based Planning in\nOrgan...,CBR
356,Refining Complete Hypotheses in ILP\n\nIvan Br...,ILP
534,Evaluation Corpora for Sense Disambiguation in...,RI


In [7]:
def token_stem(text):
    # Criando tokens
    tokens = [w for w in nltk.wordpunct_tokenize(text)]
    filtered_tokens = []
    # Lematizador utilizado
    stemmer = PorterStemmer()
    # Deixando somente termos que contém letras
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token) > 1:
            filtered_tokens.append(token)
    # Lematização dos tokens selecionados
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return set(stems)

In [8]:
v = TfidfVectorizer(lowercase=True,min_df=0.2,max_df=0.9, tokenizer=token_stem)
v.encoding = 'ISO-8859-14'
m = v.fit_transform(train['Text'].values)
train_df = pd.DataFrame(data=m.toarray(), columns=v.get_feature_names())
train_df = pd.concat([train_df, train['Class'].reset_index(drop=True)], axis=1)
train_df

Unnamed: 0,aaai,ac,academ,acm,acquisit,adapt,advanc,ai,al,algorithm,...,vol,volum,we,which,with,work,workshop,world,york,Class
0,0.000000,0.000000,0.131215,0.115818,0.000000,0.000000,0.000000,0.000000,0.000000,0.100952,...,0.000000,0.000000,0.000000,0.108809,0.071621,0.000000,0.081853,0.149697,0.000000,CBR
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.165655,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.085219,0.000000,0.000000,0.000000,0.000000,CBR
2,0.114544,0.000000,0.127496,0.000000,0.143721,0.000000,0.000000,0.128794,0.000000,0.000000,...,0.000000,0.000000,0.076836,0.000000,0.069591,0.128794,0.000000,0.000000,0.129676,CBR
3,0.110833,0.000000,0.000000,0.108890,0.139065,0.000000,0.000000,0.000000,0.130894,0.094913,...,0.000000,0.138518,0.074347,0.000000,0.067337,0.000000,0.076957,0.000000,0.000000,CBR
4,0.000000,0.000000,0.140813,0.000000,0.000000,0.000000,0.138951,0.142247,0.000000,0.000000,...,0.135385,0.000000,0.000000,0.116768,0.076860,0.000000,0.087841,0.160647,0.000000,CBR
5,0.000000,0.000000,0.124227,0.109650,0.140036,0.118678,0.000000,0.125492,0.131808,0.095576,...,0.000000,0.000000,0.000000,0.103014,0.067807,0.000000,0.077494,0.141725,0.126351,CBR
6,0.115489,0.000000,0.128548,0.000000,0.000000,0.122807,0.000000,0.129857,0.000000,0.000000,...,0.123592,0.144336,0.000000,0.000000,0.000000,0.129857,0.080190,0.000000,0.000000,CBR
7,0.000000,0.000000,0.000000,0.149227,0.000000,0.000000,0.000000,0.170787,0.179383,0.000000,...,0.162548,0.000000,0.000000,0.000000,0.092281,0.000000,0.000000,0.000000,0.000000,CBR
8,0.103479,0.132475,0.000000,0.101665,0.000000,0.110036,0.000000,0.116353,0.000000,0.088616,...,0.000000,0.000000,0.000000,0.000000,0.062869,0.116353,0.071851,0.000000,0.000000,CBR
9,0.134839,0.000000,0.000000,0.000000,0.000000,0.143382,0.000000,0.000000,0.000000,0.115471,...,0.000000,0.000000,0.000000,0.124457,0.081921,0.000000,0.093625,0.000000,0.000000,CBR


In [None]:
v.get_feature_names()

In [9]:
test_df = pd.DataFrame(columns=v.get_feature_names(), data=v.transform(test['Text'].values).toarray())
test_df

Unnamed: 0,aaai,ac,academ,acm,acquisit,adapt,advanc,ai,al,algorithm,...,verlag,vol,volum,we,which,with,work,workshop,world,york
0,0.000000,0.135352,0.000000,0.103873,0.000000,0.000000,0.116126,0.000000,0.124864,0.090540,...,0.092459,0.000000,0.132136,0.070922,0.097587,0.064235,0.118880,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.122307,0.107956,0.000000,0.000000,0.000000,0.123552,0.000000,0.094099,...,0.096093,0.000000,0.000000,0.073709,0.101422,0.066759,0.000000,0.076297,0.000000,0.000000
2,0.109579,0.000000,0.000000,0.000000,0.137492,0.116522,0.000000,0.123212,0.129414,0.093839,...,0.095828,0.000000,0.136950,0.073506,0.000000,0.066575,0.000000,0.076086,0.000000,0.000000
3,0.124986,0.000000,0.000000,0.000000,0.000000,0.132906,0.000000,0.000000,0.147609,0.000000,...,0.109302,0.000000,0.000000,0.083841,0.000000,0.075936,0.000000,0.086784,0.000000,0.141498
4,0.000000,0.166489,0.144754,0.000000,0.000000,0.000000,0.000000,0.000000,0.153588,0.111369,...,0.113729,0.000000,0.000000,0.087237,0.120036,0.079011,0.000000,0.090299,0.000000,0.000000
5,0.000000,0.164730,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.151965,0.110192,...,0.000000,0.137703,0.000000,0.086315,0.118768,0.078177,0.000000,0.089345,0.163398,0.000000
6,0.136335,0.000000,0.000000,0.133944,0.000000,0.144973,0.000000,0.000000,0.000000,0.116752,...,0.000000,0.000000,0.000000,0.091454,0.000000,0.082830,0.000000,0.094664,0.000000,0.154346
7,0.000000,0.000000,0.127629,0.000000,0.143872,0.000000,0.000000,0.000000,0.000000,0.098194,...,0.100274,0.000000,0.143305,0.076917,0.105835,0.069664,0.000000,0.079617,0.000000,0.129812
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.149595,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.106131,0.000000,0.121293,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.142112,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.097031,0.000000,0.000000,0.162644,0.100436,0.000000,0.000000


In [10]:
# Métricas de avaliação

def confusion_matrix(y_true, y_pred):
    cm = pd.DataFrame(index=np.unique(y_true), columns=np.unique(y_pred))
    cm = cm.replace(np.nan, 0)
    for t,p in zip(y_true, y_pred):
        cm.loc[t,p] += 1
    return cm

def Accuracy(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return np.trace(cm.values)/len(y_true)

In [11]:
class NaiveBayesCont:
    def train(self, data):
        '''
        Função: Geração das probabilidades utilizadas no classificador naive bayes
        Parâmetros:
                    - data: Pandas.DataFrame - dados utilizados no treinamento. A última coluna deve conter as classes.
        '''
        # Classes do problema
        self.classes = {c: data[data.iloc[:,-1] == c].shape[0]/data.shape[0] for c in data.iloc[:,-1].unique()}
        
        # Criação dos dataframes de média e variância
        self.means = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        self.vars = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        
        # Cálculo das médias e variâncias
        for c in list(self.classes.keys()):
            self.means.loc[c, :] = data[data.iloc[:,-1] == c].mean()
            self.vars.loc[c, :] = data[data.iloc[:,-1] == c].var()
            
        for c in self.vars.columns:
            if self.vars[c].min() == 0:
                self.vars.drop(c, axis=1, inplace=True)
                self.means.drop(c, axis=1, inplace=True)
           
    def predict(self, x):
        '''
        Função: Predição da classe do objeto x
        Parâmetros:
                    - x: Objeto que queremos predizer a classe;
        '''
        probs = {}
        for s in list(self.classes.keys()):
            p = math.log(self.classes[s])
            for c in self.means.columns:
                p += math.log(sc.stats.norm.pdf((x[c]-self.means.loc[s, c])/math.sqrt(self.vars.loc[s,c])))
            probs[s] = p
            
        m = max(list(probs.values()))
        c = [cl for cl in list(probs.keys()) if probs[cl] == m]
            
        return probs, c[0]

In [12]:
n = NaiveBayesCont()
n.train(train_df)

In [13]:
r = []
for t in test_df.index:
    r.append(n.predict(test_df.iloc[t])[1])
r = np.array(r)

In [14]:
r

array(['RI', 'RI', 'CBR', 'CBR', 'ILP', 'ILP', 'CBR', 'CBR', 'ILP', 'RI',
       'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'RI', 'RI', 'CBR', 'CBR',
       'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'CBR', 'ILP',
       'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'RI', 'RI',
       'CBR', 'CBR', 'RI', 'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'RI', 'CBR',
       'RI', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'CBR', 'RI', 'RI', 'ILP',
       'ILP', 'CBR', 'CBR', 'ILP', 'CBR', 'ILP', 'ILP', 'RI', 'CBR',
       'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'ILP',
       'CBR', 'ILP', 'RI', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI', 'RI',
       'RI', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'RI'], dtype='<U3')

In [15]:
test['Class'].values

array(['RI', 'RI', 'CBR', 'CBR', 'ILP', 'ILP', 'CBR', 'CBR', 'ILP', 'RI',
       'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'RI', 'RI', 'CBR', 'CBR',
       'RI', 'ILP', 'CBR', 'ILP', 'CBR', 'RI', 'ILP', 'CBR', 'ILP', 'ILP',
       'CBR', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'RI', 'RI', 'CBR',
       'RI', 'CBR', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'ILP', 'RI', 'CBR',
       'CBR', 'RI', 'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'RI', 'CBR', 'RI',
       'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'ILP',
       'ILP', 'CBR', 'CBR', 'ILP', 'CBR', 'ILP', 'ILP', 'RI', 'CBR',
       'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'ILP',
       'CBR', 'ILP', 'CBR', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI', 'RI',
       'RI', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'RI'], dtype=object)

In [16]:
confusion_matrix(y_pred=r, y_true=test['Class'].values)

Unnamed: 0,CBR,ILP,RI
CBR,50,0,1
ILP,2,26,2
RI,1,0,33


In [17]:
Accuracy(y_pred=r, y_true=test['Class'].values)

0.9478260869565217

In [18]:
from sklearn.naive_bayes import GaussianNB

g = GaussianNB()

In [19]:
g.fit(train_df.iloc[:, :-1].values, train_df.iloc[:,-1].values)

GaussianNB(priors=None)

In [20]:
g.predict(test_df.values)

array(['RI', 'RI', 'CBR', 'CBR', 'ILP', 'ILP', 'RI', 'CBR', 'ILP', 'RI',
       'RI', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'ILP', 'CBR', 'ILP', 'RI', 'RI', 'RI', 'CBR', 'CBR',
       'RI', 'ILP', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'CBR', 'ILP',
       'ILP', 'CBR', 'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'ILP', 'RI', 'RI',
       'CBR', 'RI', 'CBR', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'ILP', 'RI',
       'CBR', 'CBR', 'RI', 'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'RI', 'CBR',
       'RI', 'CBR', 'ILP', 'CBR', 'CBR', 'CBR', 'CBR', 'ILP', 'RI', 'ILP',
       'ILP', 'CBR', 'CBR', 'ILP', 'ILP', 'ILP', 'ILP', 'RI', 'CBR',
       'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'ILP', 'CBR', 'ILP',
       'CBR', 'ILP', 'RI', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI', 'RI',
       'ILP', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'RI'], dtype='<U3')

In [21]:
confusion_matrix(y_pred=g.predict(test_df.values), y_true=test['Class'].values)

Unnamed: 0,CBR,ILP,RI
CBR,48,1,2
ILP,1,29,0
RI,1,1,32


In [22]:
Accuracy(y_pred=g.predict(test_df.values), y_true=test['Class'].values)

0.9478260869565217