# Part II: Naïve-Bayes and comparative algorithms

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron

## Functions

## Conditional Probabilities

In [2]:
def conditional_probabilities(data, total_rows, total_cols, classname):
    """Compute add-one smoothed conditional probabilities P(word | class).

    Parameters
    ----------
    data : pandas.DataFrame
        Bag-of-words rows belonging to a single class. The first column is
        skipped as the label column; every other column holds the
        per-document count of one word.
    total_rows : int
        Not used by the computation; kept so existing callers keep working.
    total_cols : int
        Number of columns of the full BoW *including* the label column, so
        the vocabulary size used for smoothing is ``total_cols - 1``.
    classname : str
        Label written to the 'Class' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) whose first column is 'Class' and whose
        remaining columns hold
        ``(count(word) + 1) / (total count + total_cols - 1)``
        for each word column of ``data``.
    """
    # Per-word totals over every document of the class (label column skipped).
    word_totals = data.iloc[:, 1:].sum(axis=0)
    # Sum of all occurrences of all words in this class.
    n = word_totals.sum()

    # It's total_cols - 1 because total_cols counts the label column too.
    smoothed = (word_totals + 1) / (n + total_cols - 1)

    # Build the 1xN row directly instead of enlarging an empty float frame
    # cell by cell, which mixes a string label into float columns.
    cond_prob = pd.DataFrame([smoothed.to_numpy(dtype=float)],
                             columns=list(word_totals.index))
    cond_prob.insert(0, 'Class', classname)

    return cond_prob

## Naïve Bayes classification algorithm

In [3]:
def naive_bayes(data, p_class):
    """Train and test the Naive Bayes models with 10-fold cross validation.

    Parameters
    ----------
    data : pandas.DataFrame
        The corpus' BoW, with the label in a 'Class' column whose values are
        'CBR', 'ILP' or 'RI'.
    p_class : dict
        Class probabilities keyed by class label.

    Returns
    -------
    train_rounds : pandas.DataFrame
        Conditional probabilities of every round (three rows per round, in
        the order CBR, ILP, RI).
    pred_rounds : pandas.DataFrame
        Predicted classes, one row per round.
    true_rounds : pandas.DataFrame
        True classes, one row per round.
    """
    # Initialize the KFold class with 10 folds
    kf = KFold(10)
    class_labels = ('CBR', 'ILP', 'RI')

    # Per-round results are collected in lists and concatenated once at the
    # end; DataFrame.append no longer exists in pandas >= 2.0.
    train_frames = []
    pred_frames = []
    true_frames = []

    for i, (train, test) in enumerate(kf.split(data)):
        print('Round:', i)

        # Separating training and testing data
        data_train = data.iloc[train, :]
        data_test = data.iloc[test, :]

        ## Training ##
        # Calculating the conditional probabilities P(word|Class), one row
        # per class, in the fixed order expected by test_pred.
        per_class = []
        for label in class_labels:
            print("Calculating " + label + " probabilities")
            train_label = data_train.loc[data_train['Class'] == label]
            per_class.append(
                conditional_probabilities(train_label, data.shape[0],
                                          data.shape[1], label))
        probs = pd.concat(per_class, ignore_index=True)

        ## Testing ##
        true, pred = test_pred(data_test, probs, p_class, i)

        ## Results ##
        train_frames.append(probs)
        pred_frames.append(pred)
        true_frames.append(true)

    train_rounds = pd.concat(train_frames, ignore_index=True)
    pred_rounds = pd.concat(pred_frames, ignore_index=True)
    true_rounds = pd.concat(true_frames, ignore_index=True)

    return train_rounds, pred_rounds, true_rounds

## Testing the predictor

In [4]:
def test_pred(data, probs, p_class, n_round):
    """Predict the class of every document of a test fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Test fold BoW, with the true label in the 'Class' column.
    probs : pandas.DataFrame
        Conditional probabilities calculated on the training fold. Rows
        0, 1 and 2 must hold the probabilities of 'CBR', 'ILP' and 'RI'
        respectively, with one column per word column of ``data``.
    p_class : dict
        Class probabilities keyed by 'CBR', 'ILP' and 'RI'.
    n_round : int
        Round number, used as the index label of the returned frames.

    Returns
    -------
    true : pandas.DataFrame
        One row (index ``n_round``) with the true classes.
    pred : pandas.DataFrame
        One row (index ``n_round``) with the predicted classes.
    """
    labels = ('CBR', 'ILP', 'RI')
    # The Class column is not a feature
    word_cols = [col for col in data.columns if col != 'Class']

    # Work in log space: multiplying many small probabilities underflows to
    # 0.0 for long documents, which makes every class tie. A zero probability
    # maps to -inf, so the divide warning is silenced on purpose.
    with np.errstate(divide='ignore'):
        log_prior = np.log(
            np.array([p_class[label] for label in labels], dtype=float))
        log_likelihood = np.log(
            probs.loc[[0, 1, 2], word_cols].to_numpy(dtype=float))

    # Saving true values
    true = list(data['Class'])

    # Only words that occur in the document (freq > 0) contribute, once each
    occurs = data[word_cols].to_numpy() > 0

    # List of predicted values
    pred = []
    for mask in occurs:
        scores = log_prior + log_likelihood[:, mask].sum(axis=1)
        # argmax keeps the first maximum, i.e. ties resolve in label order
        pred.append(labels[int(np.argmax(scores))])

    # Transforms the list into a DataFrame
    # We transpose so the rounds will be indexes, not columns
    true = pd.DataFrame({n_round: true}).T
    pred = pd.DataFrame({n_round: pred}).T

    return true, pred


# Despite its name this is not a pytest test; keep test collectors from
# picking it up and treating its parameters as fixtures.
test_pred.__test__ = False

## Testing other algorithms

In [5]:
def classify(data):
    """Classify the data with Perceptron, Decision Tree and KNN (k = 3, 5, 7).

    Every classifier is evaluated with 10-fold cross validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to be classified; column 0 is the label and the remaining
        columns are the features.

    Returns
    -------
    list
        One ``[pred, true]`` pair per classifier, in the order Perceptron,
        Decision Tree, 3NN, 5NN, 7NN. ``pred`` and ``true`` are DataFrames
        with one row per round holding the predicted / true labels.
    """
    # Initializing classifiers.
    # min_samples_split is 10% of the data, clamped to 2 because
    # scikit-learn rejects smaller integer values.
    classifiers = [
        Perceptron(),
        DecisionTreeClassifier(criterion='gini', splitter='best',
                               min_samples_split=max(2, int(len(data) * 0.1))),
        KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto'),
        KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto'),
        KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='auto'),
    ]

    # Initializing KFold CV
    kf = KFold(10)

    results = []
    for i, clf in enumerate(classifiers):
        # Per-round frames are collected and concatenated once;
        # DataFrame.append no longer exists in pandas >= 2.0.
        pred_frames = []
        true_frames = []

        for train, test in kf.split(data):
            # Dividing training and test partitions
            x_train, y_train = data.iloc[train, 1:], data.iloc[train, 0]
            x_test, y_test = data.iloc[test, 1:], data.iloc[test, 0]

            # Training (fit() returns the fitted estimator itself)
            clf.fit(X=x_train, y=y_train)
            # Testing
            pred = list(clf.predict(x_test))

            # One row per round; the row label is discarded by ignore_index
            pred_frames.append(pd.DataFrame({i: pred}).T)
            true_frames.append(pd.DataFrame({i: list(y_test)}).T)

        results.append([pd.concat(pred_frames, ignore_index=True),
                        pd.concat(true_frames, ignore_index=True)])

    return results

## Loading the BoW without TF-IDF

In [6]:
# Load the bag-of-words matrix (no TF-IDF weighting); the first CSV column is used as the index.
bow = pd.read_csv('bow.csv', index_col=0)
# Preview the first 10 rows.
bow.head(10)

Unnamed: 0,Class,aaai,abstract,acm,adapt,al,algorithm,analysi,annual,applic,...,th,theori,thi,univers,use,user,verlag,vol,workshop,www
0,CBR,0,1,1,0,0,1,1,0,0,...,1,2,1,1,2,0,1,0,4,0
1,CBR,0,1,0,0,1,0,0,0,1,...,0,0,1,1,3,0,0,0,0,0
2,CBR,1,2,0,0,0,0,3,0,2,...,0,0,0,5,2,5,0,3,0,0
3,CBR,2,3,0,0,0,0,0,0,0,...,0,1,1,4,3,0,0,0,0,0
4,CBR,1,2,1,0,3,3,0,0,3,...,1,1,1,3,5,0,1,0,2,0
5,CBR,0,1,0,0,0,0,0,0,1,...,0,0,0,1,1,0,0,1,2,0
6,CBR,0,2,1,2,1,1,1,0,0,...,0,0,1,2,3,0,0,0,1,0
7,CBR,1,1,0,7,0,0,1,0,1,...,2,1,1,2,3,0,0,1,1,0
8,CBR,0,1,1,0,1,0,0,0,1,...,0,0,4,0,1,0,0,1,0,0
9,CBR,1,1,1,2,0,1,0,0,7,...,0,0,2,2,5,0,1,0,5,0


## Loading the BoW with TF-IDF

In [7]:
# Load the TF-IDF weighted bag-of-words matrix; the first CSV column is used as the index.
bow_tfidf = pd.read_csv('bow_tfidf.csv', index_col=0)
# Preview the first 10 rows.
bow_tfidf.head(10)

Unnamed: 0,Class,aaai,abstract,acm,adapt,al,algorithm,analysi,annual,applic,...,th,theori,thi,univers,use,user,verlag,vol,workshop,www
0,CBR,0.0,0.020676,0.03982,0.0,0.0,0.034163,0.036633,0.0,0.0,...,0.029141,0.067765,0.023532,0.024078,0.045838,0.0,0.034734,0.0,0.111672,0.0
1,CBR,0.0,0.037334,0.0,0.0,0.085438,0.0,0.0,0.0,0.054762,...,0.0,0.0,0.042491,0.043478,0.124154,0.0,0.0,0.0,0.0,0.0
2,CBR,0.047975,0.048885,0.0,0.0,0.0,0.0,0.12992,0.0,0.071705,...,0.0,0.0,0.0,0.142324,0.054189,0.295068,0.0,0.153873,0.0,0.0
3,CBR,0.099534,0.076067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.041552,0.028858,0.118113,0.08432,0.0,0.0,0.0,0.0,0.0
4,CBR,0.033174,0.033804,0.032552,0.0,0.11604,0.083782,0.0,0.0,0.074376,...,0.023822,0.027698,0.019237,0.059051,0.093679,0.0,0.028394,0.0,0.045645,0.0
5,CBR,0.0,0.027591,0.0,0.0,0.0,0.0,0.0,0.0,0.040471,...,0.0,0.0,0.0,0.032132,0.030585,0.0,0.0,0.057899,0.074511,0.0
6,CBR,0.0,0.038497,0.037072,0.079558,0.04405,0.031804,0.034104,0.0,0.0,...,0.0,0.0,0.021907,0.044832,0.064011,0.0,0.0,0.0,0.025991,0.0
7,CBR,0.047006,0.023949,0.0,0.346452,0.0,0.0,0.042432,0.0,0.035129,...,0.067509,0.039247,0.027257,0.05578,0.079642,0.0,0.0,0.050256,0.032338,0.0
8,CBR,0.0,0.053205,0.102468,0.0,0.121757,0.0,0.0,0.0,0.078041,...,0.0,0.0,0.242214,0.0,0.058977,0.0,0.0,0.111646,0.0,0.0
9,CBR,0.039056,0.019899,0.038323,0.082245,0.0,0.032878,0.0,0.0,0.204312,...,0.0,0.0,0.045294,0.046346,0.110288,0.0,0.033428,0.0,0.134342,0.0


## P(Class) calculation

In [8]:
# Document counts: whole corpus and each class
total_docs = bow.shape[0]
n_cbr = len(bow.loc[bow['Class'] == 'CBR'])
n_ilp = len(bow.loc[bow['Class'] == 'ILP'])
n_ri = len(bow.loc[bow['Class'] == 'RI'])

# P(Class) is the fraction of the corpus' documents that belong to the class
p_class = {
    'CBR': n_cbr / total_docs,
    'ILP': n_ilp / total_docs,
    'RI': n_ri / total_docs,
}

p_class

{'CBR': 0.4808362369337979,
 'ILP': 0.2073170731707317,
 'RI': 0.3118466898954704}

## Learning Algorithm

In [9]:
# Shuffle the DataFrame before cross validation: KFold.split() returns consecutive indexes, like
# range(0, 50), so if the DataFrame is ordered by class a fold might contain only one of the classes.
# NOTE(review): sample() is called without random_state, so every run produces a different split.
rd = bow.sample(frac=1)
# Run the 10-fold Naive Bayes on the shuffled raw-count BoW.
train_rounds, pred_rounds, true_rounds = naive_bayes(rd, p_class)
print("Train probs:", train_rounds)
print("Predicted labels:", pred_rounds)
print("True labels:", true_rounds)

Round: 0
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 1
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 2
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 3
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 4
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 5
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 6
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 7
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 8
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Round: 9
Calculating CBR probabilities
Calculating ILP probabilities
Calculating RI probabilities
Train probs:    Clas

In [10]:
# Save the Naive Bayes predicted and true labels (one row per round) as CSV files.
pred_rounds.to_csv('pred_rounds_bayes.csv')
true_rounds.to_csv('true_rounds_bayes.csv')

## Running Decision Tree, KNN and Perceptron

In [11]:
# Shuffle the TF-IDF BoW and evaluate every comparative classifier on it
rd = bow_tfidf.sample(frac=1)
results = classify(rd)

# Output files, in the same order as the classifiers returned by classify():
# Perceptron, Decision tree, 3NN, 5NN, 7NN
output_files = [
    ('pred_rounds_perceptron.csv', 'true_rounds_perceptron.csv'),
    ('pred_rounds_dt.csv', 'true_rounds_dt.csv'),
    ('pred_rounds_3nn.csv', 'true_rounds_3nn.csv'),
    ('pred_rounds_5nn.csv', 'true_rounds_5nn.csv'),
    ('pred_rounds_7nn.csv', 'true_rounds_7nn.csv'),
]

# Writing the results in CSV: predicted labels first, then true labels
for (pred_path, true_path), (pred_df, true_df) in zip(output_files, results):
    pred_df.to_csv(pred_path)
    true_df.to_csv(true_path)

