In [1]:
# import libraries

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords as sw

from collections import Counter
import math

import re
import numpy as np

from pprint import pprint

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
!pip install wikipedia
import wikipedia



In [0]:
# Titles handed to wikipedia.page(); one article is fetched per entry.
# NOTE(review): "Darwin,_Northern_Territory" is presumably spelled out in full to
# avoid resolving to a different "Darwin" article - confirm.
cities = [
    "Canberra",
    "Perth",
    "Sydney",
    "Melbourne",
    "Darwin,_Northern_Territory",
    "Brisbane",
    "Hobart",
]

# Plain-text content of each article, in the same order as `cities`.
corpus = [wikipedia.page(title).content for title in cities]

In [0]:
#calculate the DF for all documents

def calculateDF(tokenized_docs):
  """Count, for every term, how many documents contain it (document frequency).

  Args:
    tokenized_docs: iterable of documents, each an iterable of tokens.

  Returns:
    dict mapping each term to the number of documents in which it occurs at
    least once. Repeated occurrences inside one document count once.
  """
  DF = {}
  for tokenized_doc in tokenized_docs:
    # sorted(set(...)) visits each distinct term once, in a deterministic
    # order, and keeps the keys as the original token objects instead of
    # converting them to NumPy string scalars.
    for term in sorted(set(tokenized_doc)):
      # dict.get replaces the former bare `except:`, which would have hidden
      # any unrelated error raised while updating the count.
      DF[term] = DF.get(term, 0) + 1

  return DF

In [0]:
#calculate TF-IDF for all documents

def calculateTF_IDF(tokenized_docs, DF):
  """Compute term frequency and TF-IDF for every (document, term) pair.

  For a term t in document d:
    tf(d, t)     = occurrences of t in d / number of tokens in d
    idf(t)       = log(N / (DF[t] + 1)) + 1, with N the number of documents
    tf_idf(d, t) = tf(d, t) * idf(t)

  Args:
    tokenized_docs: sequence of documents, each a sequence of tokens.
    DF: dict mapping each term to its document frequency; it must contain
      every term that occurs in tokenized_docs.

  Returns:
    (TF_IDF, tf): two dicts keyed by (doc_id, term), where doc_id is the
    0-based position of the document in tokenized_docs.

  Raises:
    KeyError: if a term of tokenized_docs is missing from DF.
  """
  TF_IDF = {}
  tf = {}

  # total number of documents
  N = len(tokenized_docs)

  for doc_id, tokenized_doc in enumerate(tokenized_docs):
    # occurrences of every term in this document
    counter = Counter(tokenized_doc)
    # total number of words in the doc
    total_num_words = len(tokenized_doc)

    # The counter already holds each distinct term exactly once; sorting its
    # keys gives a deterministic insertion order and keeps the terms as the
    # original token objects rather than NumPy string scalars.
    for term in sorted(counter):
      term_tf = counter[term] / total_num_words

      # +1 in the denominator smooths the ratio, +1 outside keeps idf positive
      idf = math.log(N / (DF[term] + 1)) + 1

      tf[doc_id, term] = term_tf
      TF_IDF[doc_id, term] = term_tf * idf

  return TF_IDF, tf

In [0]:
# sort tf and tf-idf values
def sort(TF, TF_IDF, top_n):
  """Return the top_n highest-valued entries of TF and of TF_IDF.

  Args:
    TF: dict mapping a key to its term-frequency value.
    TF_IDF: dict mapping a key to its tf-idf value.
    top_n: number of leading entries to keep from each ranking.

  Returns:
    (sorted_TF, sorted_TFIDF): two lists of (key, value) pairs ordered by
    value, largest first.
  """
  def top_entries(scores):
    # rank the (key, value) pairs by value, descending, then cut to top_n
    ranked = list(scores.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[:top_n]

  return top_entries(TF), top_entries(TF_IDF)

In [0]:
# driver function: preprocess the corpus, compute tf / tf-idf and print the top results

def get_tf_and_idf(corpus, top_n):
  """Preprocess a corpus and print its top_n term-frequency and tf-idf entries.

  Each document has punctuation stripped, is lower-cased, tokenised with
  word_tokenize and filtered against the NLTK stopword list. Document
  frequencies, term frequencies and tf-idf scores are then computed over the
  whole corpus, and the top_n highest entries of each ranking are printed.

  Args:
    corpus: iterable of raw document strings.
    top_n: number of entries to print for each ranking.

  Returns:
    None; the results are written to standard output.
  """
  # A set gives constant-time membership tests; the list returned by
  # sw.words() would be scanned linearly for every token.
  # NOTE(review): sw.words() is called without a language, which appears to
  # load the stopword lists of every language shipped with NLTK - confirm
  # whether sw.words('english') was intended.
  stop_words = set(sw.words())
  tokenized_docs = []

  for doc in corpus:
    # drop every character that is neither a word character nor whitespace
    clean_doc = re.sub(r'[^\w\s]', '', doc)
    tokens = word_tokenize(clean_doc.lower())
    tokenized_docs.append([w for w in tokens if w not in stop_words])

  # get DF values
  DF = calculateDF(tokenized_docs)

  # get TF-IDF & TF values
  TF_IDF, TF = calculateTF_IDF(tokenized_docs, DF)

  # sort and get top N TF-IDF and TF values
  sorted_TF, sorted_TFIDF = sort(TF, TF_IDF, top_n)

  # print both rankings; the headings follow top_n and name the right metric
  print('Total docs in corpus:', len(tokenized_docs))
  print('\nTop {} of tf values:'.format(top_n))
  print('(doc id, word): tf')
  pprint(sorted_TF)
  print('\nTop {} of tf*idf values:'.format(top_n))
  print('(doc id, word): tf*idf')
  pprint(sorted_TFIDF)

In [9]:
# Run the pipeline on the downloaded articles and print the 10 highest tf and tf-idf entries.
get_tf_and_idf(corpus, 10)

Total docs in corpus: 7

Top 10 of tf values:
(doc id, word): tf
[((5, 'brisbane'), 0.046163190929337926),
 ((4, 'darwin'), 0.04120879120879121),
 ((3, 'melbourne'), 0.03969006957621758),
 ((1, 'perth'), 0.03878231859883236),
 ((2, 'sydney'), 0.03666121112929623),
 ((6, 'hobart'), 0.03344575604272063),
 ((0, 'canberra'), 0.02618181818181818),
 ((3, 'city'), 0.014705882352941176),
 ((5, 'city'), 0.014375379631504353),
 ((6, 'city'), 0.013771781899943788)]

Top 10 of tf values:
(doc id, word): tf*idf
[((4, 'darwin'), 0.055074405355269765),
 ((5, 'brisbane'), 0.05327927819409089),
 ((1, 'perth'), 0.044760639376119696),
 ((6, 'hobart'), 0.04469932438390229),
 ((2, 'sydney'), 0.03666121112929623),
 ((0, 'canberra'), 0.03499127310426448),
 ((3, 'melbourne'), 0.03439019931234105),
 ((5, 'queensland'), 0.015189267906210122),
 ((4, 'darwins'), 0.013409303383901),
 ((5, 'brisbanes'), 0.013227399491064117)]
