## TF-IDF Project

In [1]:
import nltk
import math

### Load Data

In [2]:
dataset = {
    "tfidf-1.txt":open("tfidf-1.txt", encoding="utf8").read(),
    "tfidf-2.txt":open("tfidf-2.txt", encoding="utf8").read(),
    "tfidf-3.txt":open("tfidf-3.txt", encoding="utf8").read(),
    "tfidf-4.txt":open("tfidf-4.txt", encoding="utf8").read(),
    "tfidf-5.txt":open("tfidf-5.txt", encoding="utf8").read(),
    "tfidf-6.txt":open("tfidf-6.txt", encoding="utf8").read(),
    "tfidf-7.txt":open("tfidf-7.txt", encoding="utf8").read(),
    "tfidf-8.txt":open("tfidf-8.txt", encoding="utf8").read(),
    "tfidf-9.txt":open("tfidf-9.txt", encoding="utf8").read(),
    "tfidf-10.txt":open("tfidf-10.txt", encoding="utf8").read()
}

In [3]:
dataset

{'tfidf-1.txt': 'World War II (WWII or WW2), also known as the Second World War, was a global war that lasted from 1939 to 1945, though related conflicts began earlier. It involved the vast majority of the world\'s nations—including all of the great powers—eventually forming two opposing military alliances: the Allies and the Axis. It was the most widespread war in history, and directly involved more than 100 million people from over 30 countries. In a state of "total war", the major participants threw their entire economic, industrial, and scientific capabilities behind the war effort, erasing the distinction between civilian and military resources. Marked by mass deaths of civilians, including the Holocaust (in which approximately 11 million people were killed) and the strategic bombing of industrial and population centres (in which approximately one million were killed, and which included the atomic bombings of Hiroshima and Nagasaki), it resulted in an estimated 50 million to 85 mi

### Define Functions

In [4]:
# Calculate term frequencies
def tf(dataset, file_name):
    text = dataset[file_name]
    tokens = nltk.word_tokenize(text)
    fd = nltk.FreqDist(tokens)
    return fd

In [5]:
tf(dataset, "tfidf-1.txt")

FreqDist({'the': 80, ',': 54, 'and': 46, 'of': 39, '.': 27, 'in': 21, 'war': 12, 'on': 11, 'to': 9, 'United': 9, ...})

In [6]:
# Calculate inverse document frequency
def idf(dataset, term):
    count = [term in dataset[file_name] for file_name in dataset]
    inv_df = math.log(len(count)/sum(count))
    return inv_df

In [7]:
def tfidf(dataset, file_name, n):
    term_scores = {}
    file_fd = tf(dataset,file_name)
    for term in file_fd:
        if term.isalpha():
            idf_val = idf(dataset,term)
            tf_val = tf(dataset, file_name)[term]
            tfidf = tf_val*idf_val
            term_scores[term] = round(tfidf,2)
    return sorted(term_scores.items(), key=lambda x:-x[1])[:n]

### Run the Code

In [8]:
tfidf(dataset, "tfidf-1.txt", 10)

[('Soviet', 20.72),
 ('Union', 18.42),
 ('Axis', 16.12),
 ('Japan', 11.27),
 ('Germany', 11.27),
 ('Allies', 9.66),
 ('invasion', 9.66),
 ('World', 9.21),
 ('Asia', 9.21),
 ('Africa', 9.21)]

In [9]:
for file_name in dataset:
     print("{0}: \n {1} \n".format(file_name, tfidf(dataset,file_name,10)))

tfidf-1.txt: 
 [('Soviet', 20.72), ('Union', 18.42), ('Axis', 16.12), ('Japan', 11.27), ('Germany', 11.27), ('Allies', 9.66), ('invasion', 9.66), ('World', 9.21), ('Asia', 9.21), ('Africa', 9.21)] 

tfidf-2.txt: 
 [('Module', 16.12), ('Armstrong', 13.82), ('lunar', 13.82), ('Apollo', 11.51), ('Moon', 9.21), ('Aldrin', 9.21), ('spacecraft', 9.21), ('Earth', 8.05), ('surface', 6.91), ('Lunar', 6.91)] 

tfidf-3.txt: 
 [('Napoleon', 32.19), ('French', 16.86), ('Coalition', 11.51), ('Prussia', 6.91), ('military', 6.02), ('Revolution', 6.02), ('Battle', 6.02), ('against', 5.5), ('France', 4.85), ('Europe', 4.85)] 

tfidf-4.txt: 
 [('Washington', 25.33), ('President', 6.44), ('Continental', 4.82), ('presided', 4.61), ('militia', 4.61), ('armies', 4.61), ('generals', 4.61), ('preservation', 4.61), ('opposition', 4.61), ('federal', 4.61)] 

tfidf-5.txt: 
 [('Newton', 23.03), ('scientists', 6.91), ('motion', 4.83), ('mathematician', 4.61), ('Principia', 4.61), ('mechanics', 4.61), ('calculus', 4