# Creating the *corpus* *bag-of-words*

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from os import listdir
from os.path import join
from sklearn.feature_extraction.text import CountVectorizer

## Variables

In [2]:
# Stop words that will be removed
# Source: https://towardsdatascience.com/multinomial-naive-bayes-classifier-for-text-analysis-python-8dd6825ece67
stop_words = [
"a", "about", "above", "across", "after", "afterwards", 
"again", "all", "almost", "alone", "along", "already", "also",    
"although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "as", "at", "be", "became", "because", "become","becomes", "becoming", "been", "before", "behind", "being", "beside", "besides", "between", "beyond", "both", "but", "by","can", "cannot", "cant", "could", "couldnt", "de", "describe", "do", "done", "each", "eg", "either", "else", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "find","for","found", "four", "from", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "indeed", "is", "it", "its", "itself", "keep", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next","no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part","perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "she", "should","since", "sincere","so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take","than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
"this", "those", "though", "through", "throughout",
"thru", "thus", "to", "together", "too", "toward", "towards",
"under", "until", "up", "upon", "us",
"very", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", 
"who", "whoever", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
]

# Corpus' directory
corpus = "corpus"

# Files that compose the corpus
files = ''.join(str(e) + ' ' for e in listdir(corpus))

# DataFrame with a BoW of all documents from corpus
bow = pd.DataFrame(columns=['Class'])

## Functions

In [30]:
# Creates the Bag of Words (BoW).
# file: path to file
def create_bow(file):
    c = CountVectorizer(lowercase=True)
    text = open(file, encoding='latin-1').read()
    c.fit_transform([text])
    
    return c

# Searches for columns that are not useful, like numbers.
# bow: pandas DataFrame with each document BoW.
# Returns a list with bad columns.
def search_bad_cols(bow):
    cols = list()
        
    for col in bow.columns:
        # Matching numbers
        if re.match(r'[0-9]+', col): cols.append(col)
        # Matching with stop words
        if col in stop_words: cols.append(col)

    return cols

# Create a conditional probabilities dataframe
# dt: pandas DataFrame with content of specific text class
# Returns two pandas DataFrame, one with the occurrence number 
# of each word and other with conditional probabilities of each word
def conditional_probabilities(dt):
    # Sum of all occurrences (all cell values in dataframe)
    n_wordsDT = dt.values[:,1:].sum()

    # Get all words (columns headers without the first one - 'Class')
    col_names = list(dt.columns.values[1:])
    
    # Create dataframe for the sum of columns
    zeros = np.zeros(shape=(1,total_cols-1))
    sumcol_DT = pd.DataFrame(data=zeros, columns=col_names)

    # Take the sum of each column
    for i in col_names:
        sumcol_DT.iloc[0][i] = dt.loc[:,i].sum()
        
    # Conditional probabilities to DT class

    # Creating dataframe that contains conditional probabilities of CBR
    # This dataframe has 1xN dimensions, where N is the number of words

    # Create dataframe for conditional probabilities
    zeros = np.zeros(shape=(1,total_cols-1))
    cond_prob = pd.DataFrame(data=zeros, columns=col_names)

    for i in col_names:
        cond_prob.iloc[0][i] = (sumcol_DT.iloc[0][i] + 1)/(n_wordsDT + (total_rows-1))
    

    return [sumcol_DT,cond_prob]

## Creating the BoW

In [4]:
# Count the number of each document type
N_CBR = 0
N_ILP = 0
N_RI = 0

for f in files.split():
    print("File: " + f)
    c = create_bow(corpus + '/' + f)

    if re.match(r'CBR', f):
        tmp = 'CBR'
        N_CBR = N_CBR + 1
    elif re.match(r'ILP', f):
        tmp = 'ILP'
        N_ILP = N_ILP + 1
    else:
        tmp = 'RI'
        N_RI = N_RI + 1

    d = dict(c.vocabulary_)
    d['Class'] = tmp

    bow = bow.append(d, ignore_index=True)

File: CBR-1010Lea229-240.txt
File: ILP-1314Seb105-126.txt
File: RI-sdjt02.ps.txt
File: CBR-1650Fic438-452.txt
File: RI-SIGIR99.ps.txt
File: ILP-1634Miy222-233.txt
File: CBR-2416Wan642-654.txt
File: RI-jnle-qa-2001.ps.txt
File: RI-Aslam00.ps.txt
File: RI-TREC8-Nie.ps.txt
File: ILP-1446Red23-37.txt
File: CBR-1898Fer74-85.txt
File: CBR-1898Per479-490.txt
File: CBR-2416Mcs219-233.txt
File: ILP-1297Web288-295.txt
File: RI-paper_anlp.ps.txt
File: CBR-1266Kra63-73.txt
File: CBR-1010Por277-288.txt
File: CBR-1266Fal611-622.txt
File: CBR-1266Tau156-165.txt
File: RI-ssgrr.ps.txt
File: RI-IPL.ps.txt
File: CBR-837Cha221-233.txt
File: RI-WWW2000.ps.txt
File: RI-acl2000.ps.txt
File: CBR-1650Mcl248-262.txt
File: CBR-1010Ric301-312.txt
File: RI-AslamMo00.ps.txt
File: CBR-1010Opi77-87.txt
File: CBR-1650Smy329-342.txt
File: RI-sigir2001ws.ps.txt
File: CBR-1010Net67-76.txt
File: ILP-1314Rie245-264.txt
File: RI-chum2001.ps.txt
File: RI-chord_sigcomm.ps.txt
File: ILP-1634Ram268-278.txt
File: RI-single-thesi

File: CBR-1898Doy49-60.txt
File: RI-tsap02gtzan.ps.txt
File: CBR-1266Lee74-83.txt
File: CBR-1010Bar145-156.txt
File: ILP-1446Jac145-154.txt
File: RI-ICCIP98.ps.txt
File: CBR-1650Mun288-302.txt
File: RI-AslamPeRu99.ps.txt
File: ILP-1314Mar377-396.txt
File: RI-sigir96.ps.txt
File: RI-ComputerJ96.ps.txt
File: CBR-1650Abi358-371.txt
File: CBR-1010Wet347-358.txt
File: CBR-1010Khe501-507.txt
File: ILP-1314Nie285-298.txt
File: ILP-1446Kir261-270.txt
File: CBR-2416Bar490-504.txt
File: RI-MontagueAs01b.ps.txt
File: CBR-1898Sch504-516.txt
File: CBR-2416Mou249-263.txt
File: RI-lrec2002.eval.ps.txt
File: CBR-1650Aur372-385.txt
File: RI-AslamDe94.ps.txt
File: RI-DL98.ps.txt
File: RI-nips.ps.txt
File: RI-sigir95.ps.txt
File: ILP-1866Mug130-144.txt
File: RI-CLEF2001.ps.txt
File: RI-integrate.ps.txt
File: RI-acl2001.ps.txt
File: CBR-1650Mel510-524.txt
File: RI-ijcai99-textmining-wkshp.ps.txt
File: RI-BMM.ps.txt
File: RI-dp2.wolfe.ps.txt
File: ILP-1297Ant45-59.txt
File: ILP-1866Rou191-208.txt
File: CBR

## Removing columns and filling NaN values

In [5]:
clean = bow.copy()

cols = search_bad_cols(clean)
clean = clean.drop(columns=cols)

# Changing NaN to 0
clean.fillna(0, inplace=True)

# Checking if all classes are in 'clean'
print(clean.iloc[:, 0].unique())

# Sorting dataframe values by column "Class"
clean = clean.sort_values(by='Class')
# Redefine dataframe index
clean = clean.reset_index(drop=True)
clean

['CBR' 'ILP' 'RI']


Unnamed: 0,Class,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
0,CBR,66.0,67.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CBR,0.0,0.0,60.0,0.0,0.0,0.0,0.0,64.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CBR,10.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CBR,0.0,0.0,73.0,0.0,74.0,76.0,0.0,77.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CBR,34.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CBR,0.0,0.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CBR,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CBR,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,CBR,0.0,0.0,47.0,49.0,0.0,54.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CBR,27.0,0.0,29.0,0.0,0.0,30.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Learning Algorithm

In [6]:
# Extract total number of documents
total_rows = clean.shape[0]
total_cols = clean.shape[1]
print("Total rows: ",total_rows)
print("Total cols: ",total_cols)
print("Number of CBR docs: ",N_CBR)
print("Number of ILP docs: ",N_ILP)
print("Number of RI docs: ",N_RI)

Total rows:  574
Total cols:  21055
Number of CBR docs:  276
Number of ILP docs:  119
Number of RI docs:  179


### 1 - Calculating P(Words | CBR)

In [22]:
# Calculating P(CBR)
prob_cbr = N_CBR/total_rows

print("P(CBR): ", prob_cbr)

P(CBR):  0.4808362369337979


In [23]:
# Extracting CBR dataframe:

# 1 - Dataframe for CBR class
dt_CBR = clean.iloc[0:N_CBR, :]
dt_CBR

Unnamed: 0,Class,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
0,CBR,66.0,67.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CBR,0.0,0.0,60.0,0.0,0.0,0.0,0.0,64.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CBR,10.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CBR,0.0,0.0,73.0,0.0,74.0,76.0,0.0,77.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CBR,34.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CBR,0.0,0.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CBR,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CBR,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,CBR,0.0,0.0,47.0,49.0,0.0,54.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CBR,27.0,0.0,29.0,0.0,0.0,30.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Calculate the conditional probability of each word of the BoW
[n_wordsCBR , cond_probCBR] = conditional_probabilities(dt_CBR)
cond_probCBR

Unnamed: 0,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,advisory,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
0,0.000542,3.8e-05,0.001055,0.000333,6.8e-05,0.000442,5.8e-05,0.000236,5.1e-05,4.1e-05,...,1.4e-05,1.4e-05,1.4e-05,1.5e-05,1.5e-05,1.7e-05,1.8e-05,1.9e-05,1.9e-05,1.9e-05


### 2 - Calculating P(Words | ILP)

In [32]:
# Calculating P(ILP)
prob_ilp = N_ILP/total_rows

print("P(ILP): ", prob_ilp)

P(ILP):  0.2073170731707317


In [33]:
# 2 - Dataframe for ILP class
dt_ILP = clean.iloc[N_CBR:(N_CBR + N_ILP), :]
dt_ILP

Unnamed: 0,Class,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
276,ILP,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277,ILP,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,ILP,0.0,0.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,ILP,0.0,0.0,58.0,60.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280,ILP,64.0,0.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
281,ILP,0.0,0.0,53.0,55.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282,ILP,0.0,0.0,27.0,29.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,ILP,0.0,0.0,53.0,54.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284,ILP,58.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
285,ILP,0.0,23.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Calculate the conditional probability of each word of the BoW
[n_wordsILP , cond_probILP] = conditional_probabilities(dt_ILP)
cond_probILP

Unnamed: 0,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,advisory,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
0,0.000406,3.1e-05,0.001287,0.000549,5.4e-05,2.341848e-07,6e-06,8e-06,2.7e-05,2.341848e-07,...,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07,2.341848e-07


### 3 - Calculating P(Word | RI)

In [38]:
# Calculating P(RI)
prob_ri = N_RI/total_rows

print("P(RI): ", prob_ri)

P(RI):  0.3118466898954704


In [39]:
# 3 - Dataframe for RI class
dt_RI = clean.iloc[(N_CBR + N_ILP):,:]
dt_RI

Unnamed: 0,Class,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
395,RI,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
396,RI,54.0,0.0,56.0,57.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,RI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
398,RI,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399,RI,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,RI,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,RI,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402,RI,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
403,RI,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,RI,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Calculate the conditional probability of each word of the BoW
[n_wordsRI , cond_probRI] = conditional_probabilities(dt_RI)
cond_probRI

Unnamed: 0,aaai,ability,abstract,academic,acquiring,adaptation,adapting,adaptive,address,advisory,...,mile,millot,milot,niel,opérateur,procédés,sadt,seminaire,sferca,similarite
0,0.000142,3.3e-05,0.000793,0.000285,6.5e-05,2.4e-05,7e-06,8.4e-05,4e-05,9.037874e-08,...,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08,9.037874e-08
