In [1]:
import nltk
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\livio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\livio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "./data/"
INPUT_DIR = "./data/input/input_file.txt"

## Preprocess Data

#### File information

In [3]:
def returnListOfFilePaths(inputPath,folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    
    listOfFileNames.append("input_file")
    listOfFilePaths.append(inputPath)
    
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(INPUT_DIR,BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['wsj_0001', 'wsj_0002', 'wsj_0003', 'wsj_0004', 'wsj_0005', 'wsj_0006', 'wsj_0007', 'wsj_0008', 'wsj_0009', 'wsj_0010', 'wsj_0011', 'wsj_0012', 'wsj_0013', 'wsj_0014', 'wsj_0015', 'wsj_0016', 'wsj_0017', 'wsj_0018', 'wsj_0019', 'wsj_0020', 'wsj_0021', 'wsj_0022', 'wsj_0023', 'wsj_0024', 'wsj_0025', 'wsj_0026', 'wsj_0027', 'wsj_0028', 'wsj_0029', 'wsj_0030', 'wsj_0031', 'wsj_0032', 'wsj_0033', 'wsj_0034', 'wsj_0035', 'wsj_0036', 'wsj_0037', 'wsj_0038', 'wsj_0039', 'wsj_0040', 'wsj_0041', 'wsj_0042', 'wsj_0043', 'wsj_0044', 'wsj_0045', 'wsj_0046', 'wsj_0047', 'wsj_0048', 'wsj_0049', 'wsj_0050', 'wsj_0051', 'wsj_0052', 'wsj_0053', 'wsj_0054', 'wsj_0055', 'wsj_0056', 'wsj_0057', 'wsj_0058', 'wsj_0059', 'wsj_0060', 'wsj_0061', 'wsj_0062', 'wsj_0063', 'wsj_0064', 'wsj_0065', 'wsj_0066', 'wsj_0067', 'wsj_0068', 'wsj_0069', 'wsj_0070', 'wsj_0071', 'wsj_0072', 'wsj_0073', 'wsj_0074', 'wsj_0075', 'wsj_0076', 'wsj_0077', 'wsj_0078', 'wsj_0079', 'wsj_0080', 'wsj_0081', 'wsj_0082', 'wsj_0083', 'ws

In [4]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)



## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [5]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [6]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [7]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [8]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [9]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [10]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

.START 

Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.
Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. 




In [11]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['.START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing']


In [12]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['.START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'join', 'board', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken', 'chairman', 'Elsevier', 'N.V.', ',', 'Dutch', 'publishing', 'group', '.']


In [13]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

['.start', 'pierr', 'vinken', ',', '61', 'year', 'old', ',', 'join', 'board', 'nonexecut', 'director', 'nov.', '29', '.', 'mr.', 'vinken', 'chairman', 'elsevi', 'n.v.', ',', 'dutch', 'publish', 'group', '.']


In [14]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

['.start', 'pierr', 'vinken', '61', 'year', 'old', 'join', 'board', 'nonexecut', 'director', 'nov.', '29', 'mr.', 'vinken', 'chairman', 'elsevi', 'n.v.', 'dutch', 'publish', 'group']


In [15]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['.start', 'pierr', 'vinken', '61', 'year', 'old', 'join', 'board', 'nonexecut', 'director', 'nov.', '29', 'mr.', 'vinken', 'chairman', 'elsevi', 'n.v.', 'dutch', 'publish', 'group']


### Wrap into a function to be used by NLTK

In [16]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [17]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    print('                ', end="")   #bank space for formatting output
    for n in range(len(fileNames)):
        print('{0:18}'.format(fileNames[n]), end="")    #file names
    print()
    for i in range(len(term)):
        print('{0:8}'.format(term[i]), end='\t|  ')     #the term
        for j in range(numValues):
            print('{0:.12f}'.format(values[i][j]), end='   ') #the value, corresponding to the file name, for the term
        print()

In [18]:
# TODO: modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    #print(cosine_similarity(tfs[0], tfs[1]))
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    numFiles = len(fileNames)
    for i in range(numFiles -1):
        print(fileNames[i], end='   ')
        matrixValue = cosine_similarity(tfs[i], tfs[-1])
        numValue = matrixValue[0][0]
        print(" {0:.8f}".format(numValue), end='         ')

        print()
    print("\n\n=============================================================================================\n")

In [19]:
def main(printResults=True):
    baseFolderPath = "./data/"
    inputFilePath = "./data/input/input_file.txt"

    fileNames, filePathList = returnListOfFilePaths(inputFilePath,baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # instanciate tfid
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    #calculate tfidf
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names()
    
    # print results
    #print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
    calc_and_print_CosineSimilarity_for_all(tfs, fileNames)


In [20]:
main()







wsj_0001    1.00000000         
wsj_0002    0.15622490         
wsj_0003    0.00949398         
wsj_0004    0.00322659         
wsj_0005    0.04396144         
wsj_0006    0.00278221         
wsj_0007    0.00181271         
wsj_0008    0.01661047         
wsj_0009    0.03139419         
wsj_0010    0.02208738         
wsj_0011    0.01584017         
wsj_0012    0.01441098         
wsj_0013    0.00660417         
wsj_0014    0.06737963         
wsj_0015    0.00984402         
wsj_0016    0.00860567         
wsj_0017    0.00183360         
wsj_0018    0.01168064         
wsj_0019    0.02164322         
wsj_0020    0.00047055         
wsj_0021    0.01565625         
wsj_0022    0.01082278         
wsj_0023    0.00147117         
wsj_0024    0.00478892         
wsj_0025    0.02371089         
wsj_0026    0.00491147         
wsj_0027    0.02638177         
wsj_0028    0.11371539         
wsj_0029    0.01560089         
wsj_0030    0.06271723         
wsj_0031    0.00222195         
wsj_