# Part I: BoW creation and Pre Processing

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from os import listdir
from os.path import join
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from nltk.stem import PorterStemmer
from nltk import word_tokenize, download
download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eduardo\Anaconda3\share\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Variables

In [2]:
# Stop words that will be removed from the vocabulary; this list is consumed by
# the CountVectorizer / TfidfVectorizer set-up in the BoW creation cell.
# Every entry is stored lowercase and unstemmed.
# NOTE(review): "amoungst" looks like a misspelling inherited from the source
# list (the correctly spelled "amongst" is also present) - confirm before removing.
# Source: https://towardsdatascience.com/multinomial-naive-bayes-classifier-for-text-analysis-python-8dd6825ece67
stop_words = [
"a", "about", "above", "across", "after", "afterwards", 
"again", "all", "almost", "alone", "along", "already", "also",    
"although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "as", "at", "be", "became", "because", "become","becomes", "becoming", "been", "before", "behind", "being", "beside", "besides", "between", "beyond", "both", "but", "by","can", "cannot", "cant", "could", "couldnt", "de", "describe", "do", "done", "each", "eg", "either", "else", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "find","for","found", "four", "from", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "indeed", "is", "it", "its", "itself", "keep", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next","no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part","perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "she", "should","since", "sincere","so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take","than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
"this", "those", "though", "through", "throughout",
"thru", "thus", "to", "together", "too", "toward", "towards",
"under", "until", "up", "upon", "us",
"very", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", 
"who", "whoever", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
]

# Directory that holds the corpus documents
corpus = "corpus"

# How many of the most frequent terms each vectorizer keeps
most_freq = 100

# Porter stemmer instance shared by the text-processing code
ps = PorterStemmer()

# Names of the corpus files, kept as ONE string in which every name is
# followed by a single space; consumers recover the names with files.split().
# NOTE(review): a file name containing whitespace would be split into several
# pieces by that scheme - confirm the corpus never contains such names.
files = ''.join(map('{} '.format, listdir(corpus)))

## Functions

In [3]:
def process_text(text):
    """Normalise a raw document and apply Porter stemming to its tokens.

    The text is lower-cased; every character that is neither a word character
    nor whitespace is replaced by a space (underscores survive because ``\\w``
    matches them); runs of ASCII digits are replaced by a space; consecutive
    whitespace is collapsed; finally the text is tokenized with
    ``word_tokenize`` and each token is stemmed with the module-level
    ``PorterStemmer`` instance ``ps``.

    text: text to be processed

    Returns the stemmed tokens joined by single spaces.
    """
    # Lowering case
    text = text.lower()

    # Raw strings are used for every pattern: '\w' and '\s' are invalid escape
    # sequences in ordinary string literals and trigger warnings on modern
    # Python versions.

    # Replacing special characters with ' '
    text = re.sub(r'[^\w\s]', ' ', text)

    # Removing numbers
    text = re.sub(r'[0-9]+', ' ', text)

    # Collapsing consecutive white spaces into a single one
    text = re.sub(r'\s+', ' ', text)

    # Applying stemming token by token
    return ' '.join(ps.stem(token) for token in word_tokenize(text))

## Creating the BoWs

In [4]:
# Builds the inputs of the BoWs: reads every corpus file, normalises its text
# with process_text and derives the document class from the file-name prefix.

# `files` is a whitespace-separated string of names; split it only once.
file_names = files.split()
total_files = len(file_names)

# The documents are stemmed before vectorizing, so the stop words have to be
# stemmed as well - otherwise stemmed forms (e.g. "becaus" from "because")
# never match the list and leak into the vocabulary. The original spellings
# are kept too, so everything that was filtered before is still filtered.
stemmed_stop_words = sorted(set(stop_words) | {ps.stem(word) for word in stop_words})

# Counters
count_vectorizer = CountVectorizer(max_features=most_freq, stop_words=stemmed_stop_words)
tfidf_vectorizer = TfidfVectorizer(max_features=most_freq, stop_words=stemmed_stop_words)

# Processed text and class label of every document, in file order
texts = []
labels = []

for i, f in enumerate(file_names, start=1):
    print('{0:40} {1:4}/{2:4}'.format(f, i, total_files))

    # The context manager guarantees the file handle is closed
    with open(join(corpus, f), encoding='latin-1') as handle:
        texts.append(process_text(handle.read()))

    # Identifying the class of each document by its name
    if f.startswith('CBR'):
        labels.append('CBR')
    elif f.startswith('ILP'):
        labels.append('ILP')
    else:
        labels.append('RI')

# Text of all documents (kept as an object array, as before)
docs = np.array(texts, dtype=object)

# DataFrame with classes, built in one go instead of appending row by row
# (DataFrame.append is quadratic in a loop and was removed in pandas 2.0).
# This DataFrame will be used later.
classes = pd.DataFrame(labels, columns=['Class'])

c = count_vectorizer.fit_transform(docs)
tfidf = tfidf_vectorizer.fit_transform(docs)

CBR-1010Agr109-120.txt                      1/ 574
CBR-1010All1-10.txt                         2/ 574
CBR-1010Alu121-132.txt                      3/ 574
CBR-1010Ash133-144.txt                      4/ 574
CBR-1010Aur371-380.txt                      5/ 574
CBR-1010Bak381-390.txt                      6/ 574
CBR-1010Bar145-156.txt                      7/ 574
CBR-1010Bic391-400.txt                      8/ 574
CBR-1010Bis11-22.txt                        9/ 574
CBR-1010Bro157-168.txt                     10/ 574
CBR-1010Cun401-410.txt                     11/ 574
CBR-1010Det411-420.txt                     12/ 574
CBR-1010Fli421-430.txt                     13/ 574
CBR-1010Fox431-440.txt                     14/ 574
CBR-1010Fuc23-32.txt                       15/ 574
CBR-1010Gok441-450.txt                     16/ 574
CBR-1010Gro451-460.txt                     17/ 574
CBR-1010Hai169-180.txt                     18/ 574
CBR-1010Han461-470.txt                     19/ 574
CBR-1010Has181-192.txt         

CBR-1898Fuc86-98.txt                      167/ 574
CBR-1898Gok99-111.txt                     168/ 574
CBR-1898Gom112-123.txt                    169/ 574
CBR-1898Hul124-135.txt                    170/ 574
CBR-1898Jar136-147.txt                    171/ 574
CBR-1898Jur273-284.txt                    172/ 574
CBR-1898Kon148-160.txt                    173/ 574
CBR-1898Lea161-172.txt                    174/ 574
CBR-1898Li3-14.txt                        175/ 574
CBR-1898Lie173-185.txt                    176/ 574
CBR-1898Lop26-36.txt                      177/ 574
CBR-1898Mcg431-442.txt                    178/ 574
CBR-1898Mck186-197.txt                    179/ 574
CBR-1898Mcs198-209.txt                    180/ 574
CBR-1898Mic443-453.txt                    181/ 574
CBR-1898Mil418-430.txt                    182/ 574
CBR-1898Min455-466.txt                    183/ 574
CBR-1898Mon467-478.txt                    184/ 574
CBR-1898Mun210-221.txt                    185/ 574
CBR-1898Noc222-233.txt         

ILP-1446Fla185-194.txt                    332/ 574
ILP-1446Fog175-184.txt                    333/ 574
ILP-1446Hek205-214.txt                    334/ 574
ILP-1446Jac145-154.txt                    335/ 574
ILP-1446Kaz125-134.txt                    336/ 574
ILP-1446Kha165-174.txt                    337/ 574
ILP-1446Kir261-270.txt                    338/ 574
ILP-1446Kra80-94.txt                      339/ 574
ILP-1446Man135-144.txt                    340/ 574
ILP-1446Mar215-224.txt                    341/ 574
ILP-1446Mug245-249.txt                    342/ 574
ILP-1446Nak155-164.txt                    343/ 574
ILP-1446Nie250-260.txt                    344/ 574
ILP-1446Noc195-204.txt                    345/ 574
ILP-1446Rae1-8.txt                        346/ 574
ILP-1446Ram271-280.txt                    347/ 574
ILP-1446Red23-37.txt                      348/ 574
ILP-1446Rob291-299.txt                    349/ 574
ILP-1446Sam225-234.txt                    350/ 574
ILP-1446Seb95-105.txt          

RI-ICALP.ps.txt                           494/ 574
RI-icann_01.ps.txt                        495/ 574
RI-icassp00gtzan.ps.txt                   496/ 574
RI-ICCIP98.ps.txt                         497/ 574
RI-icde99.ps.txt                          498/ 574
RI-icmc02gtzan.ps.txt                     499/ 574
RI-ideal.ps.txt                           500/ 574
RI-IDM99.ps.txt                           501/ 574
RI-ijcai99-textmining-wkshp.ps.txt        502/ 574
RI-Information2002.ps.txt                 503/ 574
RI-integrate.ps.txt                       504/ 574
RI-IPL.ps.txt                             505/ 574
RI-IPM97.ps.txt                           506/ 574
RI-ismir00gtzan.ps.txt                    507/ 574
RI-ismir01gtzan.ps.txt                    508/ 574
RI-ismir02gtzan.ps.txt                    509/ 574
RI-JASIS.lsi.90.ps.txt                    510/ 574
RI-JASIS2000.ps.txt                       511/ 574
RI-jnle-qa-2001.ps.txt                    512/ 574
RI-jnle-semlex.ps.txt          

In [5]:
def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer as a list."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())

# One column per vocabulary term, one row per document
bow = pd.DataFrame(c.toarray(), columns=_feature_names(count_vectorizer))
bow_tfidf = pd.DataFrame(tfidf.toarray(), columns=_feature_names(tfidf_vectorizer))

# Put the class label in front of the term columns
bow = pd.concat([classes, bow], sort=False, axis=1)
bow_tfidf = pd.concat([classes, bow_tfidf], sort=False, axis=1)

## Saving the prepared BoW into a CSV file

In [6]:
# Write both bag-of-words tables to CSV files in the working directory
for frame, csv_name in ((bow, 'bow.csv'), (bow_tfidf, 'bow_tfidf.csv')):
    frame.to_csv(csv_name)