In [4]:
import nltk
import string

# used for looping through folders/files
import os
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "/content/inputData"
file1_path = os.path.join(BASE_INPUT_DIR, "f1.txt")
file2_path = os.path.join(BASE_INPUT_DIR, "f2.txt")

## Preprocess Data

#### File information

In [None]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

In [None]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'/content/inputData/100593newsML.txt': 'Anglo-French Channel Tunnel operator Eurotunnel Monday announced a deal giving its creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt.\nThe long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 banks to which it owes nearly nine billion pounds ($14.1 billion).\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of insolvency but leaves shareholders owning only 54.5 percent of the company.\n"The restructuring plan provides Eurotunnel with the medium-term financial stability to allow it to consolidate its substantial commercial achievements to date and to develop its operations," Eurotunnel co-chairman Alastair Morton said.\nThe firm was now making a profit before interest, he added.\nAlthough shareholders will see their interests diluted, they were offered the prospect of a brighter future after mont

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [None]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [None]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [None]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [None]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)

    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [None]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [None]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

Anglo-French Channel Tunnel operator Eurotunnel Monday announced a deal giving its creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt.
The long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 ba


In [None]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'a', 'deal', 'giving', 'its', 'creditor', 'banks', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion']


In [None]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'company', 'return', 'wiping', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion', ')', 'debt', '.', 'The', 'long-awaited', 'restructuring', 'brings']


In [None]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '(', '$', '1.56', 'billion', ')', 'debt', '.', 'the', 'long-await', 'restructur', 'bring']


In [None]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl', 'eurotunnel']


In [None]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl', 'eurotunnel']


### Wrap into a function to be used by NLTK

In [None]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [10]:
pip install tabulate



In [11]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    # files along 'x-axis', terms along 'y-axis'
     #bank space for formatting output
      #file names
       #the term
       #the value, corresponding to the file name, for the term
        #print

    max_filename_length = max(len(max(fileNames, key=len)), len("File Name"))
    max_term_length = max(len(term), len("Term"))

    max_value_length = max(len("TF-IDF Value"), len(str(max(values, default=0))))
    header = f"{'File Name':<{max_filename_length}} {'Term':<{max_term_length}} {'TF-IDF Value':>{max_value_length}}"
    print(header)

    for filename, value in zip(fileNames, values):
        row = f"{filename:<{max_filename_length}} {term:<{max_term_length}} {value:>{max_value_length}}"
        print(row)

In [12]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = 'file3.txt'
    #create file
   # get valeus from files along 'x-axis', terms along 'y-axis'
     #bank space for formatting output
      #file names
    #write it in file
          #the term
        #the value, corresponding to the file name, for the term

    with open(filePath, 'w') as file:
        max_filename_length = max(len(max(fileNames, key=len)), len("File Name"))
        max_term_length = max(len(term), len("Term"))
        max_value_length = max(len("TF-IDF Value"), len(str(max(values, default=0))))
        header = f"{'File Name':<{max_filename_length}} {'Term':<{max_term_length}} {'TF-IDF Value':>{max_value_length}}"
        file.write(header + '\n')
        for filename, value in zip(fileNames, values):
            row = f"{filename:<{max_filename_length}} {term:<{max_term_length}} {value:>{max_value_length}}"
            file.write(row + '\n')


In [None]:
# TODO: modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    #print(cosine_similarity(tfs[0], tfs[1]))
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    #formatting
            #print(fileNames[n], end='\t')
            #print(numValue, end='\t')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]
    header = "\t" + "\t".join(fileNames)  # File names as column headers
    print(header)
    for i in range(len(fileNames)):
        print(fileNames[i], end='\t')
        for n in range(len(fileNames)):
            similarity = cosine_similarity(tfs[i], tfs[n])[0][0]
            print(f"{similarity:.4f}", end='\t')

        print()


    print("\n\n=============================================================================================\n")

In [13]:
def calc_and_write_CosineSimilarity_for_all(tfs, fileNames):
    filePath = "file3.txt"
   #read form file & write it in perform cosineSimilarity
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]
    with open(filePath , 'w') as file:
        file.write("\n\n\n======== COSINE SIMILARITY =======================================================\n")

        # Write the header row with file names as column headers
        header = "\t" + "\t".join(fileNames)  # File names as column headers
        file.write(header + '\n')

        # Calculate and write cosine similarity for each pair of documents
        for i in range(len(fileNames)):
            file.write(fileNames[i] + '\t')  # Write the file name for the current row
            for n in range(len(fileNames)):
                similarity = cosine_similarity(tfs[i], tfs[n])[0][0]
                file.write(f"{similarity:.4f}\t")  # Write the similarity value

            file.write('\n')  # Move to the next line for the next file

        file.write("\n\n============================================================================================\n")



## Wrap Everything into `Main()`

In [None]:
def main(printResults=True):
    baseFolderPath = "/content/inputData"

    fileNames, filePathList = returnListOfFilePaths(baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # calculate tfidf
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names_out()

    if printResults:
        # print results
        print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        # write results to file
        write_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames)

In [None]:
main()



                100593newsML.txt  137871newsML.txt  100618newsML.txt  100554newsML.txt  130040newsML.txt  
'm      	|  0.000000000000   0.000000000000   0.000000000000   0.095165597630   0.000000000000   
's      	|  0.139427353384   0.093595320091   0.145592889457   0.090693834344   0.089203368600   
1.0     	|  0.039192056059   0.000000000000   0.040925145224   0.031866754923   0.000000000000   
1.56    	|  0.094428412110   0.000000000000   0.049302038059   0.000000000000   0.000000000000   
1.6     	|  0.000000000000   0.000000000000   0.000000000000   0.047582798815   0.000000000000   
1.85    	|  0.000000000000   0.000000000000   0.000000000000   0.047582798815   0.000000000000   
10      	|  0.039192056059   0.000000000000   0.040925145224   0.031866754923   0.000000000000   
10.40   	|  0.039192056059   0.000000000000   0.040925145224   0.031866754923   0.000000000000   
100     	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.046800821384   
113.5   	| 