# Part I: BoW creation and Pre Processing

## Imports

In [28]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from os import listdir
from os.path import join
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import download
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eduardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Variables

In [32]:
# Stop words that will be removed
# Source: https://towardsdatascience.com/multinomial-naive-bayes-classifier-for-text-analysis-python-8dd6825ece67
stop_words = [
"a", "about", "above", "across", "after", "afterwards", 
"again", "all", "almost", "alone", "along", "already", "also",    
"although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "as", "at", "be", "became", "because", "become","becomes", "becoming", "been", "before", "behind", "being", "beside", "besides", "between", "beyond", "both", "but", "by","can", "cannot", "cant", "could", "couldnt", "de", "describe", "do", "done", "each", "eg", "either", "else", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "find","for","found", "four", "from", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "indeed", "is", "it", "its", "itself", "keep", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next","no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part","perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "she", "should","since", "sincere","so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take","than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
"this", "those", "though", "through", "throughout",
"thru", "thus", "to", "together", "too", "toward", "towards",
"under", "until", "up", "upon", "us",
"very", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", 
"who", "whoever", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
]

# Corpus' directory
corpus = "corpus"

# Files that compose the corpus
files = ''.join(str(e) + ' ' for e in listdir(corpus))

# Stemmer
ps = PorterStemmer()

## Functions

In [30]:
# Creates the Bag of Words (BoW).
# file: path to file
def create_bow(file):
    c = CountVectorizer(lowercase=True)
    text = open(file, encoding='latin-1').read()
    c.fit_transform([text])
    
    return c

# Searches for columns that are not useful, like numbers.
# bow: pandas DataFrame with each document BoW.
# Returns a list with bad columns.
def search_bad_cols(bow):
    cols = list()
        
    for col in bow.columns:
        # Matching numbers
        if re.match(r'[0-9]+', col): cols.append(col)
        # Matching with stop words
        if col in stop_words: cols.append(col)

    return cols

## Creating the BoW

In [31]:
# DataFrame with a BoW of all documents from corpus
bow = pd.DataFrame(columns=['Class'])

# Count the number of each document type
for f in files.split():
    print("File: " + f)
    c = create_bow(corpus + '/' + f)

    if re.match(r'CBR', f):
        tmp = 'CBR'

    elif re.match(r'ILP', f):
        tmp = 'ILP'

    else:
        tmp = 'RI'

    d = dict(c.vocabulary_)
    d['Class'] = tmp

    bow = bow.append(d, ignore_index=True)

File: CBR-1010Agr109-120.txt
File: CBR-1010All1-10.txt
File: CBR-1010Alu121-132.txt
File: CBR-1010Ash133-144.txt
File: CBR-1010Aur371-380.txt
File: CBR-1010Bak381-390.txt
File: CBR-1010Bar145-156.txt
File: CBR-1010Bic391-400.txt
File: CBR-1010Bis11-22.txt
File: CBR-1010Bro157-168.txt
File: CBR-1010Cun401-410.txt
File: CBR-1010Det411-420.txt
File: CBR-1010Fli421-430.txt
File: CBR-1010Fox431-440.txt
File: CBR-1010Fuc23-32.txt
File: CBR-1010Gok441-450.txt
File: CBR-1010Gro451-460.txt
File: CBR-1010Hai169-180.txt
File: CBR-1010Han461-470.txt
File: CBR-1010Has181-192.txt
File: CBR-1010Hin471-480.txt
File: CBR-1010Hua481-490.txt
File: CBR-1010Hun205-216.txt
File: CBR-1010Hur193-204.txt
File: CBR-1010Ind217-228.txt
File: CBR-1010Kem491-500.txt
File: CBR-1010Khe501-507.txt
File: CBR-1010Lea229-240.txt
File: CBR-1010Mac43-54.txt
File: CBR-1010Mal520-527.txt
File: CBR-1010Mar55-66.txt
File: CBR-1010Mun241-252.txt
File: CBR-1010Net67-76.txt
File: CBR-1010Oka253-264.txt
File: CBR-1010Opi77-87.txt


File: ILP-1297Fur165-171.txt
File: ILP-1297Gei173-180.txt
File: ILP-1297Got17-32.txt
File: ILP-1297Kak181-188.txt
File: ILP-1297Kak189-204.txt
File: ILP-1297Moy205-211.txt
File: ILP-1297Oza227-234.txt
File: ILP-1297Pom235-242.txt
File: ILP-1297Rae133-140.txt
File: ILP-1297Red243-255.txt
File: ILP-1297Sad256-263.txt
File: ILP-1297Seb264-272.txt
File: ILP-1297Sha213-225.txt
File: ILP-1297Sri273-286.txt
File: ILP-1297Web288-295.txt
File: ILP-1297Yam296-308.txt
File: ILP-1314Bae55-71.txt
File: ILP-1314Blo199-211.txt
File: ILP-1314Dze41-54.txt
File: ILP-1314Fle175-196.txt
File: ILP-1314Fuj163-174.txt
File: ILP-1314Gam72-88.txt
File: ILP-1314Gon337-357.txt
File: ILP-1314Hor315-334.txt
File: ILP-1314Inu265-282.txt
File: ILP-1314Jor229-244.txt
File: ILP-1314Kok127-141.txt
File: ILP-1314Mar377-396.txt
File: ILP-1314Miz146-158.txt
File: ILP-1314Moo3-22.txt
File: ILP-1314Mug25-40.txt
File: ILP-1314Mug358-372.txt
File: ILP-1314Nie285-298.txt
File: ILP-1314Pom299-314.txt
File: ILP-1314Rie245-264.tx

## Removing columns and filling NaN values

In [33]:
clean = bow.copy()

cols = search_bad_cols(clean)
clean = clean.drop(columns=cols)

# Changing NaN to 0
clean.fillna(0, inplace=True)

# Checking if all classes are in 'clean'
print(clean.iloc[:, 0].unique())

# Redefine dataframe index
clean = clean.reset_index(drop=True)
clean.head(10)

['CBR' 'ILP' 'RI']


Unnamed: 0,Class,aamodt,abaoub,abstract,acad,academic,academy,acm,agre,aimsa,...,mes,nize,prone,schmelze,syntacti,tleness,experimenta,garc,rishe,wileyi
0,CBR,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CBR,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CBR,0.0,0.0,51.0,0.0,53.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CBR,0.0,0.0,32.0,0.0,34.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CBR,64.0,0.0,65.0,0.0,0.0,0.0,67.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CBR,30.0,0.0,32.0,0.0,33.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CBR,41.0,0.0,43.0,0.0,45.0,0.0,46.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CBR,46.0,0.0,48.0,0.0,49.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,CBR,0.0,0.0,14.0,0.0,0.0,0.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CBR,0.0,0.0,58.0,0.0,0.0,0.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Applying Porter Stemming

In [75]:
# When we apply groupby() and sum(), some things happen with the 'Class' column
# To escape from this, we will stem a classless version of the BoW
stemmed = clean.iloc[:, 1:].copy()
classes = clean.iloc[:, 0].copy()

# Dictionary with the mapping of original word to stemmed one
mapper_ps = {}

# Applying Porter Stemming
for word in stemmed.columns:
    mapper_ps[word] = ps.stem(word)

# Renaming the columns' names to their stemmed ones
stemmed = stemmed.rename(mapper=mapper_ps, axis='columns')
# Grouping same words
stemmed = stemmed.groupby(stemmed.columns, axis=1, sort=False).sum()

# Concatenating again with the classes
# The order of the rows didn't change, so we can securely concatenate
stemmed = pd.concat([classes, stemmed], axis=1, sort=False)

stemmed

Unnamed: 0,Class,aamodt,abaoub,abstract,acad,academ,academi,acm,agr,aimsa,...,me,nize,prone,schmelz,syntacti,tleness,experimenta,garc,rish,wileyi
0,CBR,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CBR,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CBR,0.0,0.0,103.0,0.0,53.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CBR,0.0,0.0,32.0,0.0,34.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CBR,64.0,0.0,65.0,0.0,0.0,0.0,67.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CBR,30.0,0.0,32.0,0.0,33.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CBR,41.0,0.0,87.0,0.0,45.0,0.0,46.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CBR,46.0,0.0,48.0,0.0,49.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,CBR,0.0,0.0,14.0,0.0,0.0,0.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CBR,0.0,0.0,58.0,0.0,0.0,0.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Removing the least frequent words

In [76]:
minimize = stemmed.copy()

# We drop the elements that occurs less than 4500 times
drop = [col for col, val in minimize.iloc[:, 1:].sum().iteritems() if val < 4500]

minimize = minimize.drop(drop, axis='columns')
minimize

Unnamed: 0,Class,abstract,academ,acm,algorithm,analysi,approach,artifici,base,ca,...,rithm,trieval,karger,tzanetaki,zobel,wigderson,jave,trec,titl,secret
0,CBR,55.0,57.0,59.0,62.0,67.0,139.0,72.0,76.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CBR,8.0,0.0,0.0,0.0,0.0,17.0,19.0,22.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CBR,103.0,53.0,0.0,0.0,67.0,75.0,78.0,167.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CBR,32.0,34.0,0.0,0.0,0.0,47.0,52.0,61.0,73.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CBR,65.0,0.0,67.0,71.0,0.0,80.0,82.0,88.0,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CBR,32.0,33.0,0.0,0.0,0.0,47.0,52.0,60.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CBR,87.0,45.0,46.0,56.0,59.0,129.0,70.0,79.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,CBR,48.0,49.0,0.0,0.0,58.0,131.0,69.0,147.0,172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,CBR,14.0,0.0,15.0,0.0,0.0,53.0,0.0,63.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CBR,58.0,0.0,60.0,65.0,0.0,0.0,79.0,261.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Saving the prepared BoW into a CSV file

In [74]:
minimize.to_csv('bow.csv')