In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import os
import string
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import math

In [2]:
files = [open("cbr-ilp-ir/"+f, 'r', encoding='ISO-8859-14').read() for f in sorted(os.listdir('cbr-ilp-ir/'))]
data = pd.DataFrame(columns=['Text', 'Class'])
data['Text'] = files
data['Class'] = [s.split("-")[0] for s in sorted(os.listdir('cbr-ilp-ir/'))]
data

Unnamed: 0,Text,Class
0,KBS Maintenance as Learning Two-Tiered\nDomain...,CBR
1,Integration of Case Based Retrieval with a Rel...,CBR
2,A Case-Based Approach for Developing Writing T...,CBR
3,Reasoning with Reasons in\nCase-Based Comparis...,CBR
4,INRECA: A Seamlessly Integrated System Based o...,CBR
5,DOM-ArC: An Active Decision Support\nSystem fo...,CBR
6,"Towards the Integration of Case-Based,\nSchema...",CBR
7,A Case-Based Reasoner Adaptive to Different\nC...,CBR
8,Cost Estimation of Software Projects through\n...,CBR
9,Separating the Cases from the Data: Towards\nM...,CBR


In [3]:
def split(data, test_p):
    train = data.copy()
    test = train.sample(frac=test_p, replace=False)
    train = train.drop(test.index)
    return train, test

train, test = split(data, 0.2)

In [4]:
train

Unnamed: 0,Text,Class
0,KBS Maintenance as Learning Two-Tiered\nDomain...,CBR
5,DOM-ArC: An Active Decision Support\nSystem fo...,CBR
6,"Towards the Integration of Case-Based,\nSchema...",CBR
8,Cost Estimation of Software Projects through\n...,CBR
10,On the use of CBR in optimisation problems suc...,CBR
11,Case-based Diagnosis of Multiple Faults\n\nRal...,CBR
12,On the Automatic Generation of Case Libraries\...,CBR
13,Learning to Refine Indexing by\nIntrospective ...,CBR
14,Operator Decision Aiding by Adaptation of\nSup...,CBR
15,Problem Solving with The Incredible Machine*...,CBR


In [5]:
test

Unnamed: 0,Text,Class
557,Little Words Can Make a Big Difference for Tex...,RI
313,Integrity Constraints in ILP Using a Monte Car...,ILP
282,Maximum Entropy Modeling\nwith Clausal Constra...,ILP
130,Virtual Function Generators: Representing and\...,CBR
89,Theoretical Analysis of Case Retrieval Method\...,CBR
115,Active Exploration in Instance-Based Preferenc...,CBR
555,Applying a Hybrid Query Translation Method to\...,RI
37,Adaptation Using Constraint Satisfaction\nTech...,CBR
34,Case-Based Reasoning for Expertise Relocation\...,CBR
440,Automating the Construction of Authority\nFile...,RI


In [6]:
def token_stem(text):
    # Criando tokens
    tokens = [w for w in nltk.wordpunct_tokenize(text)]
    filtered_tokens = []
    # Lematizador utilizado
    stemmer = SnowballStemmer("english")
    # Deixando somente termos que contém letras
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    # Lematização dos tokens selecionados
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return set(stems)

In [7]:
v = TfidfVectorizer(lowercase=True,min_df=0.1,max_df=0.9, tokenizer=token_stem)
v.encoding = 'ISO-8859-14'
m = v.fit_transform(train['Text'].values)
train_df = pd.DataFrame(data=m.toarray(), columns=v.get_feature_names())
train_df = pd.concat([train_df, train['Class'].reset_index(drop=True)], axis=1)
train_df

Unnamed: 0,2nd,3rd,4th,5th,aaai,aamodt,about,ac,academ,access,...,within,word,work,workshop,world,www,y,york,z,Class
0,0.000000,0.000000,0.118732,0.000000,0.000000,0.114327,0.000000,0.000000,0.086644,0.000000,...,0.000000,0.000000,0.000000,0.052697,0.097670,0.000000,0.085781,0.000000,0.123705,CBR
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.126253,0.120143,0.000000,0.095682,0.000000,...,0.132617,0.000000,0.000000,0.058194,0.107859,0.000000,0.000000,0.000000,0.000000,CBR
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.106936,0.101761,0.000000,0.081043,0.000000,...,0.000000,0.000000,0.000000,0.049290,0.091357,0.000000,0.000000,0.083286,0.000000,CBR
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,CBR
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.120092,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.095323,0.058170,0.000000,0.000000,0.000000,0.000000,0.000000,CBR
5,0.000000,0.000000,0.000000,0.000000,0.089243,0.000000,0.123244,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.059696,0.000000,0.000000,0.000000,0.000000,0.000000,CBR
6,0.000000,0.000000,0.000000,0.000000,0.094712,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.063355,0.000000,0.000000,0.000000,0.000000,0.000000,CBR
7,0.000000,0.000000,0.000000,0.000000,0.070715,0.000000,0.000000,0.000000,0.077774,0.000000,...,0.000000,0.000000,0.000000,0.047302,0.087672,0.000000,0.000000,0.000000,0.111042,CBR
8,0.000000,0.000000,0.000000,0.000000,0.089622,0.000000,0.123767,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.059950,0.111113,0.000000,0.000000,0.000000,0.140731,CBR
9,0.000000,0.212378,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.088788,0.000000,0.000000,0.000000,0.000000,0.000000,CBR


In [8]:
test_df = pd.DataFrame(columns=v.get_feature_names(), data=v.transform(test['Text'].values).toarray())
test_df

Unnamed: 0,2nd,3rd,4th,5th,aaai,aamodt,about,ac,academ,access,...,with,within,word,work,workshop,world,www,y,york,z
0,0.000000,0.000000,0.000000,0.000000,0.086435,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.052004,0.000000,0.138298,0.000000,0.000000,0.107161,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.087606,0.119382,...,0.047925,0.000000,0.000000,0.000000,0.000000,0.000000,0.103753,0.086734,0.000000,0.000000
2,0.000000,0.117922,0.000000,0.114338,0.073700,0.000000,0.000000,0.094784,0.081057,0.000000,...,0.044342,0.112346,0.000000,0.000000,0.049299,0.000000,0.095997,0.000000,0.000000,0.000000
3,0.122573,0.130856,0.000000,0.000000,0.000000,0.000000,0.000000,0.105180,0.000000,0.000000,...,0.049206,0.000000,0.000000,0.000000,0.054706,0.000000,0.000000,0.089052,0.092438,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.092223,0.133837,0.000000,0.000000,0.000000,0.000000,...,0.055487,0.140583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.084003,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.050541,0.000000,0.000000,0.092080,0.056191,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.127082,0.000000,0.000000,0.000000,0.000000,0.000000,0.102147,0.087354,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.053129,0.000000,0.000000,0.086484,0.000000,0.124719
7,0.000000,0.000000,0.000000,0.000000,0.077946,0.000000,0.107643,0.000000,0.000000,0.000000,...,0.046897,0.000000,0.000000,0.000000,0.052140,0.096638,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.105848,0.090519,0.000000,...,0.049518,0.000000,0.000000,0.090216,0.055054,0.102038,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.127043,...,0.000000,0.000000,0.135628,0.092916,0.000000,0.000000,0.000000,0.092300,0.000000,0.000000


In [24]:
# Métricas de avaliação

def confusion_matrix(y_true, y_pred):
    cm = pd.DataFrame(index=np.unique(y_true), columns=np.unique(y_pred))
    cm = cm.replace(np.nan, 0)
    for t,p in zip(y_true, y_pred):
        cm.loc[t,p] += 1
    return cm

def Accuracy(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return np.trace(cm.values)/len(y_true)

In [42]:
class NaiveBayesCont:
    def train(self, data):
        '''
        Função: Geração das probabilidades utilizadas no classificador naive bayes
        Parâmetros:
                    - data: Pandas.DataFrame - dados utilizados no treinamento. A última coluna deve conter as classes.
        '''
        # Classes do problema
        self.classes = {c: data[data.iloc[:,-1] == c].shape[0]/data.shape[0] for c in data.iloc[:,-1].unique()}
        
        # Criação dos dataframes de média e variância
        self.means = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        self.vars = pd.DataFrame(index=list(self.classes.keys()), columns=data.columns[:-1])
        
        # Cálculo das médias e variâncias
        for c in list(self.classes.keys()):
            self.means.loc[c, :] = data[data.iloc[:,-1] == c].mean()
            self.vars.loc[c, :] = data[data.iloc[:,-1] == c].var()
            
        for c in self.vars.columns:
            if self.vars[c].min() == 0:
                self.vars.drop(c, axis=1, inplace=True)
                self.means.drop(c, axis=1, inplace=True)
        
    def __normal(self, x, mean, var):
        '''
        Função: Cálculo do valor da distribuição normal com média 'mean' e variância 'var' no ponto x
        Parâmetros:
                    - x: Valor onde deve ser calculado o valor da normal
                    - mean: média da distribuição normal
                    - var: variância da distribuição normal
        '''
        #print(x, mean, var, -((x-mean)**2)/(2*var), math.exp(-((x-mean)**2)/(2*var)), math.sqrt(2*math.pi*var), (math.exp(-((x-mean)**2)/(2*var))/(math.sqrt(2*math.pi*var))))
        return (math.exp(-((x-mean)**2)/(2*var))/(math.sqrt(2*math.pi*var)))
    
    def predict(self, x):
        '''
        Função: Predição da classe do objeto x
        Parâmetros:
                    - x: Objeto que queremos predizer a classe;
        '''
        probs = {}
        for s in list(self.classes.keys()):
            p = self.classes[s]
            for c in self.means.columns:
                p *= self.__normal((x[c]-self.means.loc[s,c])/math.sqrt(self.vars.loc[s, c]), 0, 1)
                #p *= self.__normal(x[c], self.means.loc[s, c], self.vars.loc[s,c])
            probs[s] = p
            
        return probs

In [43]:
n = NaiveBayesCont()
n.train(train_df)

In [43]:
n.vars

Unnamed: 0,2nd,3rd,4th,5th,aaai,about,ac,academ,access,achiev,...,with,within,word,work,workshop,world,www,y,york,z
CBR,0.00175014,0.00151881,0.00197253,0.0015353,0.00165583,0.00222245,0.00125988,0.00158709,0.000995669,0.00135188,...,0.000483751,0.00129823,0.000417229,0.00168122,0.000564443,0.00184029,0.00117779,0.00134119,0.00186192,0.00106826
ILP,0.00214588,0.00208757,0.00196197,0.00313426,0.00145407,0.000835706,0.003254,0.00194267,0.0001614,0.00129578,...,0.00020458,0.0023017,0.00131858,0.00143774,0.000518061,0.000945178,0.00206189,0.0016475,0.00199748,0.000899141
RI,0.00127206,0.00130672,0.000979437,0.000714212,0.000804219,0.00161532,0.00156227,0.00144603,0.00314103,0.00201385,...,0.000543294,0.00162171,0.00299942,0.00203861,0.000806694,0.00174003,0.00249016,0.00217244,0.00141921,0.00282145


In [44]:
n.predict(test_df.iloc[0])

-0.3866582277523446 0 1 -0.074752292544292 0.92797332377215 2.5066282746310002 0.3702077939373586
-0.3023121173451022 0 1 -0.04569630814683943 0.9553320446996763 2.5066282746310002 0.3811223444530523
-0.4334321176533806 0 1 -0.09393170030674697 0.9103449363974152 2.5066282746310002 0.36317508487828204
-0.34686586979590284 0 1 -0.06015796581463411 0.9416157787319076 2.5066282746310002 0.3756503460292781
1.213458384277046 0 1 -0.7362406251861296 0.4789109412886651 2.5066282746310002 0.19105782302689672
-0.5075191632760412 0 1 -0.12878785054620648 0.8791604591738336 2.5066282746310002 0.35073427842157984
-0.38581348172907054 0 1 -0.07442602134195392 0.9282761441423316 2.5066282746310002 0.3703286017863908
-0.6431718021401627 0 1 -0.20683498353411228 0.8131538226869571 2.5066282746310002 0.32440144033987695
-0.32998684262803496 0 1 -0.05444565815380976 0.9470099697296886 2.5066282746310002 0.3778023168868538
1.883062483656629 0 1 -1.7729621586775361 0.16982918203070674 2.5066282746310002 0

{'CBR': 0.0, 'ILP': 0.0, 'RI': 3.080984773698981e-281}

In [12]:
from sklearn.naive_bayes import GaussianNB

g = GaussianNB()

In [13]:
g.fit(train_df.iloc[:, :-1].values, train_df.iloc[:,-1].values)

GaussianNB(priors=None)

In [15]:
g.predict(test_df.values)

array(['RI', 'ILP', 'ILP', 'CBR', 'CBR', 'ILP', 'RI', 'CBR', 'CBR', 'RI',
       'CBR', 'CBR', 'CBR', 'CBR', 'RI', 'ILP', 'CBR', 'CBR', 'CBR',
       'CBR', 'ILP', 'ILP', 'RI', 'ILP', 'RI', 'CBR', 'CBR', 'CBR', 'RI',
       'CBR', 'RI', 'ILP', 'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'CBR', 'RI',
       'ILP', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI',
       'CBR', 'CBR', 'ILP', 'CBR', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'CBR',
       'RI', 'CBR', 'CBR', 'RI', 'CBR', 'RI', 'CBR', 'CBR', 'RI', 'ILP',
       'ILP', 'CBR', 'ILP', 'RI', 'CBR', 'CBR', 'RI', 'ILP', 'CBR', 'CBR',
       'RI', 'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'CBR', 'RI', 'RI', 'ILP',
       'CBR', 'CBR', 'CBR', 'RI', 'RI', 'CBR', 'ILP', 'CBR', 'RI', 'RI',
       'RI', 'CBR', 'RI', 'RI', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'ILP',
       'RI', 'CBR', 'RI', 'RI', 'CBR', 'RI'], dtype='<U3')

In [17]:
test['Class'].values

array(['RI', 'ILP', 'ILP', 'CBR', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'RI',
       'CBR', 'ILP', 'CBR', 'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'CBR', 'RI',
       'ILP', 'ILP', 'RI', 'ILP', 'RI', 'CBR', 'CBR', 'CBR', 'RI', 'CBR',
       'RI', 'ILP', 'CBR', 'RI', 'CBR', 'CBR', 'CBR', 'CBR', 'RI', 'ILP',
       'RI', 'CBR', 'CBR', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'RI', 'CBR',
       'CBR', 'ILP', 'CBR', 'CBR', 'RI', 'ILP', 'RI', 'CBR', 'CBR', 'RI',
       'CBR', 'CBR', 'RI', 'CBR', 'RI', 'CBR', 'CBR', 'RI', 'ILP', 'ILP',
       'CBR', 'ILP', 'RI', 'CBR', 'CBR', 'RI', 'ILP', 'CBR', 'CBR', 'CBR',
       'ILP', 'ILP', 'CBR', 'RI', 'CBR', 'CBR', 'RI', 'CBR', 'ILP', 'CBR',
       'CBR', 'CBR', 'RI', 'RI', 'CBR', 'ILP', 'CBR', 'RI', 'RI', 'RI',
       'CBR', 'RI', 'RI', 'CBR', 'ILP', 'RI', 'RI', 'CBR', 'ILP', 'RI',
       'CBR', 'CBR', 'RI', 'CBR', 'RI'], dtype=object)

In [29]:
import numpy as np
confusion_matrix(y_pred=g.predict(test_df.values), y_true=test['Class'].values)

Unnamed: 0,CBR,ILP,RI
CBR,52,2,3
ILP,1,20,0
RI,1,0,36
