In [75]:
from nltk import word_tokenize
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [76]:
# Add documents in the collection
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

In [77]:
# Tokenizer module
for doc_id in range(len(all_documents)):
    all_documents[doc_id] = word_tokenize(all_documents[doc_id])

In [78]:
print(all_documents)

[['China', 'has', 'a', 'strong', 'economy', 'that', 'is', 'growing', 'at', 'a', 'rapid', 'pace', '.', 'However', 'politically', 'it', 'differs', 'greatly', 'from', 'the', 'US', 'Economy', '.'], ['At', 'last', ',', 'China', 'seems', 'serious', 'about', 'confronting', 'an', 'endemic', 'problem', ':', 'domestic', 'violence', 'and', 'corruption', '.'], ['Japan', "'s", 'prime', 'minister', ',', 'Shinzo', 'Abe', ',', 'is', 'working', 'towards', 'healing', 'the', 'economic', 'turmoil', 'in', 'his', 'own', 'country', 'for', 'his', 'view', 'on', 'the', 'future', 'of', 'his', 'people', '.'], ['Vladimir', 'Putin', 'is', 'working', 'hard', 'to', 'fix', 'the', 'economy', 'in', 'Russia', 'as', 'the', 'Ruble', 'has', 'tumbled', '.'], ['What', "'s", 'the', 'future', 'of', 'Abenomics', '?', 'We', 'asked', 'Shinzo', 'Abe', 'for', 'his', 'views'], ['Obama', 'has', 'eased', 'sanctions', 'on', 'Cuba', 'while', 'accelerating', 'those', 'against', 'the', 'Russian', 'Economy', ',', 'even', 'as', 'the', 'Ruble

In [79]:
# Linguistic modules 1) Lowercase 2) Remove special characters 
stemmer = PorterStemmer()
for doc_id in range(len(all_documents)):
    for term_id in range(len(all_documents[doc_id])):
        all_documents[doc_id][term_id] = stemmer.stem(re.sub(r'[^a-z\d ]','',all_documents[doc_id][term_id].lower()))        

In [80]:
print(all_documents)

[['china', 'ha', 'a', 'strong', 'economi', 'that', 'is', 'grow', 'at', 'a', 'rapid', 'pace', '', 'howev', 'polit', 'it', 'differ', 'greatli', 'from', 'the', 'us', 'economi', ''], ['at', 'last', '', 'china', 'seem', 'seriou', 'about', 'confront', 'an', 'endem', 'problem', '', 'domest', 'violenc', 'and', 'corrupt', ''], ['japan', 's', 'prime', 'minist', '', 'shinzo', 'abe', '', 'is', 'work', 'toward', 'heal', 'the', 'econom', 'turmoil', 'in', 'hi', 'own', 'countri', 'for', 'hi', 'view', 'on', 'the', 'futur', 'of', 'hi', 'peopl', ''], ['vladimir', 'putin', 'is', 'work', 'hard', 'to', 'fix', 'the', 'economi', 'in', 'russia', 'as', 'the', 'rubl', 'ha', 'tumbl', ''], ['what', 's', 'the', 'futur', 'of', 'abenom', '', 'we', 'ask', 'shinzo', 'abe', 'for', 'hi', 'view'], ['obama', 'ha', 'eas', 'sanction', 'on', 'cuba', 'while', 'acceler', 'those', 'against', 'the', 'russian', 'economi', '', 'even', 'as', 'the', 'rubl', 's', 'valu', 'fall', 'almost', 'daili', ''], ['vladimir', 'putin', 'is', 'rid

In [81]:
# Linguistic modules continued.. 3) Remove stop words 4) remove duplicates 5) Remove single letter terms
stop_words = set(stopwords.words('english'))
print(stop_words)
all_unique_documents = {}
for doc_id in range(len(all_documents)):
    all_unique_documents[doc_id] = set()
    for term_id in range(len(all_documents[doc_id])):
        if len(all_documents[doc_id][term_id]) > 1 and all_documents[doc_id][term_id] not in stop_words:
            all_unique_documents[doc_id].add(all_documents[doc_id][term_id])
print(all_unique_documents)

{'they', 'whom', 'this', "couldn't", 'during', 'will', 'below', 'ain', 'your', "didn't", "mightn't", 'yourself', 'those', 'at', 'me', 'yourselves', 'its', 'out', 'while', 'and', 'i', 'hers', 'don', 'doesn', 'you', 'am', 'for', "doesn't", 'was', 'each', 'won', "you'd", "shan't", 'didn', 'when', "should've", 'o', "needn't", "wouldn't", 'where', 'our', 'own', 'again', 'not', 'd', 've', 'once', 'all', 'were', 'being', 'too', 'than', 't', "wasn't", 'did', 'couldn', 'mustn', 'ours', 'her', 'isn', 'myself', 'against', 'we', 'there', 'until', 'into', 'aren', 'more', 'other', 'herself', "won't", 'only', 'any', "hadn't", 'what', 'himself', 'why', 'now', "shouldn't", 'as', 'having', 'hasn', 'should', 'them', 'in', 'mightn', "you're", 'because', 'by', 'how', 'down', 'do', 'his', 'if', 'very', 'itself', 'haven', 'under', 'has', 'of', 'same', "that'll", "mustn't", "isn't", 'needn', 'off', 'shouldn', 'with', 'most', 'll', 'both', 'before', 'after', 'it', 'no', 'further', 'such', 'm', 'had', 'who', 'o

In [82]:
# Create inverted index
inverted_index = {}
for doc_id in range(len(all_unique_documents)):
    for term in all_unique_documents[doc_id]:
       if term not in inverted_index:
            inverted_index[term] = []
       inverted_index[term].append(doc_id)

In [83]:
# Sorting the list items
for item in inverted_index:
        inverted_index[item].sort()

In [84]:
# Print inverted index
inverted_index

{'abe': [2, 4],
 'abenom': [4],
 'acceler': [5],
 'almost': [5],
 'alway': [6],
 'ask': [4],
 'china': [0, 1],
 'confront': [1],
 'corrupt': [1],
 'countri': [2],
 'crazi': [6],
 'cuba': [5],
 'daili': [5],
 'deer': [6],
 'differ': [0],
 'domest': [1],
 'eas': [5],
 'econom': [2],
 'economi': [0, 3, 5],
 'endem': [1],
 'even': [5, 6],
 'fall': [5],
 'fix': [3],
 'futur': [2, 4],
 'greatli': [0],
 'grow': [0],
 'ha': [0, 3, 5],
 'hard': [3],
 'heal': [2],
 'hi': [2, 4],
 'hors': [6],
 'howev': [0],
 'hunt': [6],
 'japan': [2],
 'last': [1],
 'minist': [2],
 'obama': [5],
 'pace': [0],
 'peopl': [2],
 'polit': [0],
 'prime': [2],
 'problem': [1],
 'putin': [3, 6],
 'rapid': [0],
 'ride': [6],
 'rubl': [3, 5],
 'russia': [3],
 'russian': [5],
 'sanction': [5],
 'seem': [1, 6],
 'seriou': [1, 6],
 'shinzo': [2, 4],
 'strong': [0],
 'thing': [6],
 'toward': [2],
 'tumbl': [3],
 'turmoil': [2],
 'us': [0],
 'valu': [5],
 'view': [2, 4],
 'violenc': [1],
 'vladimir': [3, 6],
 'work': [2, 3]}