In [1]:
import os
import os.path as path

import requests
from bs4 import BeautifulSoup

In [46]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer

### Scrape N documents from each of two categories on Wikipedia


In [47]:
# Wikipedia article URLs to scrape, grouped by topic label.
URLS = {
    "football": [
        "https://en.wikipedia.org/wiki/Football",
        "https://en.wikipedia.org/wiki/American_football",
        "https://en.wikipedia.org/wiki/Association_football",
        "https://en.wikipedia.org/wiki/Australian_rules_football",
        "https://en.wikipedia.org/wiki/Gaelic_football",
    ],
    "algorithm": [
        "https://en.wikipedia.org/wiki/Algorithm",
        "https://en.wikipedia.org/wiki/Analysis_of_algorithms",
        "https://en.wikipedia.org/wiki/Computational_complexity",
        "https://en.wikipedia.org/wiki/Worst-case_complexity",
        "https://en.wikipedia.org/wiki/Average-case_complexity",
    ],
}

# Despite the name, this is the local directory the scraped text is stored
# under; files live at <BASE_URL>/<topic>/<n>.txt.
BASE_URL = "Documents"


### Helper Functions

In [48]:
def getHTMLFromURL(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        BeautifulSoup: The response body parsed with the built-in
        'html.parser'.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, or an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing (and later saving) an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def getBodyTextFromHTML(soup):
    """Return the trimmed text of the element whose id is 'bodyContent'.

    Args:
        soup: Parsed document exposing ``find(attrs=...)``.

    Returns:
        str: The element's ``.text`` with leading and trailing whitespace
        removed.
    """
    # NOTE(review): if find() yields None (no such element), the .text access
    # below raises AttributeError -- confirm that is the desired failure mode.
    body = soup.find(attrs={'id': 'bodyContent'})
    return body.text.strip()

def writeInTextFile(topic, filename, text, base_dir=None):
    """Write ``text`` to ``<base_dir>/<topic>/<filename>``, replacing any existing file.

    Args:
        topic: Name of the sub-directory the file belongs to.
        filename: Name of the file inside the topic directory.
        text: Content to write.
        base_dir: Root output directory. Defaults to the module-level
            BASE_URL when omitted.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    if base_dir is None:
        base_dir = BASE_URL
    topic_dir = path.join(base_dir, topic)
    # Create the topic folder on first use so a fresh checkout can run the notebook.
    os.makedirs(topic_dir, exist_ok=True)
    # Explicit UTF-8: the scraped text contains non-ASCII characters that a
    # platform-default codec may be unable to encode. The context manager
    # closes the handle even if write() raises.
    with open(path.join(topic_dir, filename), "w", encoding="utf-8") as file:
        file.write(text)

def mainFunction():
    """Scrape every URL in URLS and save each article body as a numbered text file.

    Within a topic the pages are stored as 1.txt, 2.txt, ... in the order the
    URLs are listed, and a progress line is printed after each file is written.
    """
    for topic, topic_urls in URLS.items():
        for fileno, url in enumerate(topic_urls, start=1):
            text = getBodyTextFromHTML(getHTMLFromURL(url))
            filename = f"{fileno}.txt"
            writeInTextFile(topic, filename, text)
            print("Done for ", topic, filename)
        

### Getting text from the URLs provided

In [49]:

# Fetches every page listed in URLS over the network and (re)writes the
# corresponding text files under BASE_URL.
mainFunction()

Done for  football 1.txt
Done for  football 2.txt
Done for  football 3.txt
Done for  football 4.txt
Done for  football 5.txt
Done for  algorithm 1.txt
Done for  algorithm 2.txt
Done for  algorithm 3.txt
Done for  algorithm 4.txt
Done for  algorithm 5.txt


### Preparations

#### Unigram Count Matrix

In [50]:
# Global vocabulary: every token seen so far is stored as a key (the value is
# always 1), i.e. a dict used as an insertion-ordered set.
unique_word_set = dict()
# topic -> {token: tally}; filled in by getUniqueWordsFromFiles().
unique_word_dict = dict()

def getUniqueWords(text):
    """Count how many times each token occurs in ``text``.

    Tokens come from ``word_tokenize``. As a side effect, every token is also
    registered as a key in the global ``unique_word_set``.

    Args:
        text: The string to tokenize.

    Returns:
        dict: token -> number of occurrences in ``text``.
    """
    counts = {}
    for token in word_tokenize(text):
        unique_word_set[token] = 1
        counts[token] = counts.get(token, 0) + 1
    return counts

def getUniqueWordsFromFiles():
    """Build the vocabulary and per-topic word tallies from the saved text files.

    For every topic in URLS, reads <BASE_URL>/<topic>/1.txt .. N.txt, where N
    is the number of URLs listed for that topic, and tokenizes each file with
    getUniqueWords() (which also adds the tokens to the global
    unique_word_set). unique_word_dict[topic][word] ends up holding the number
    of that topic's files in which the word appears.

    Each topic's tallies are rebuilt from scratch on every call, so re-running
    the cell does not inflate them. Prints the vocabulary size when done.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    for topic in URLS:
        # Start clean so a second call does not add on top of earlier tallies.
        unique_word_dict[topic] = {}
        topic_counts = unique_word_dict[topic]
        # One file per URL, numbered from 1 (the naming mainFunction() uses).
        for i in range(1, len(URLS[topic]) + 1):
            filename = f"{i}.txt"
            # The context manager closes the handle even if read() fails;
            # explicit UTF-8 avoids depending on the platform default codec.
            with open(path.join(BASE_URL, topic, filename), "r", encoding="utf-8") as file:
                text = file.read()
            # NOTE(review): only the keys of the per-file result are used, so
            # each file contributes at most 1 per word and the per-file
            # occurrence counts are discarded. If the matrix is meant to hold
            # total occurrences, add the counts instead -- confirm intent.
            for word in getUniqueWords(text):
                topic_counts[word] = topic_counts.get(word, 0) + 1

    print("Total Unique Words from files are ", len(unique_word_set))



In [51]:
# Reads the saved text files and fills unique_word_set / unique_word_dict.
getUniqueWordsFromFiles()

Total Unique Words from files are  13272


In [54]:
# Header row: a single "word" label above the topic column, followed by one
# column per vocabulary entry. (The label is printed once, outside the loop,
# so the header has the same number of columns as the data rows.)
print("word", end="\t\t")
for word in unique_word_set:
    print(word, end="\t\t")

print()

# One row per topic: for each vocabulary entry, the tally stored for that
# topic, or 0 when the topic has no entry for the word.
for topic in unique_word_dict:
    print(topic, end="\t")
    topic_counts = unique_word_dict[topic]
    for word in unique_word_set:
        print(topic_counts.get(word, 0), end="\t\t")
    print()

football	5		5		5		5		5		5		2		5		5		5		5		5		5		5		5		2		5		5		4		5		5		4		5		5		5		4		5		5		5		3		5		5		5		5		1		3		5		5		1		5		1		5		5		5		2		5		2		2		5		5		5		5		1		3		3		2		5		5		5		5		5		5		5		5		5		3		5		5		5		5		5		5		5		5		5		5		5		5		5		5		5		2		5		5		5		5		5		5		5		5		5		5		5		2		4		4		5		5		5		5		5		3		5		3		1		5		5		5		5		5		5		5		5		5		1		5		5		2		5		1		5		5		5		4		4		5		4		4		4		5		1		2		5		5		5		3		4		5		5		2		5		4		2		5		5		4		2		5		5		5		2		3		5		3		1		5		5		5		3		2		4		5		2		5		1		5		5		4		5		5		4		5		4		5		4		4		4		4		5		2		3		5		5		5		3		3		2		1		5		4		4		5		1		5		5		5		1		2		5		5		5		5		5		5		5		4		5		4		5		4		3		3		4		5		5		5		4		5		5		5		5		5		5		3		3		4		5		5		5		5		2		3		4		5		5		5		5		5		5		5		3		5		5		5		5		5		2		4		1		5		2		5		5		5		4		4		2		5		5		3		1		3		5		3		5		5		1		4		3		5		1		4		4		5		5		4		5		4		3		5		1		1		4		5		4		1		2		3		2		5		3		1		3		5		4		5		5		5		1		3		5		4		5		5		5		5		4		1		1		2		1		2		5		2		3		5		1		1		1		1		1		1		1		2		1		1		1		1		3		2		2

#### Bigram Probability

In [55]:
# Global bigram vocabulary: each "<previous token> <token>" string seen so far
# is stored as a key (the value is always 1); filled by getUniqueBigramsWords().
unique_bigram_set = dict()
# Presumably topic -> {bigram: tally}, mirroring unique_word_dict -- TODO confirm.
unique_bigram_dict = dict()

def getUniqueBigramsWords(text):
    """Count adjacent token pairs (bigrams) in ``text``.

    Tokens come from ``word_tokenize``. Each bigram is keyed as
    "<previous token> <token>"; the first token is paired with the placeholder
    "<string>". As a side effect, every bigram is also registered as a key in
    the global ``unique_bigram_set``.

    Args:
        text: The string to tokenize.

    Returns:
        dict: bigram string -> number of occurrences in ``text``.
    """
    bigram_counts = {}
    previous = "<string>"
    for token in word_tokenize(text):
        pair = previous + " " + token
        unique_bigram_set[pair] = 1
        bigram_counts[pair] = bigram_counts.get(pair, 0) + 1
        previous = token
    return bigram_counts


def getUniqueWordsFromFiles():
    """Build the bigram vocabulary and per-topic bigram tallies from the saved files.

    NOTE: this definition shares its name with the unigram function defined
    earlier in the notebook and replaces it once this cell runs. It previously
    repeated the unigram logic verbatim, which never populated the bigram
    structures and added a second round of tallies to unique_word_dict.
    TODO(review): consider giving it a distinct name.

    For every topic in URLS, reads <BASE_URL>/<topic>/1.txt .. N.txt, where N
    is the number of URLs listed for that topic, and tokenizes each file with
    getUniqueBigramsWords() (which also adds the bigrams to the global
    unique_bigram_set). unique_bigram_dict[topic][bigram] ends up holding the
    number of that topic's files in which the bigram appears.

    Each topic's tallies are rebuilt from scratch on every call. Prints the
    size of the bigram vocabulary when done.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    for topic in URLS:
        # Start clean so re-running the cell does not inflate the tallies.
        unique_bigram_dict[topic] = {}
        topic_counts = unique_bigram_dict[topic]
        # One file per URL, numbered from 1 (the naming mainFunction() uses).
        for i in range(1, len(URLS[topic]) + 1):
            filename = f"{i}.txt"
            # The context manager closes the handle even if read() fails;
            # explicit UTF-8 avoids depending on the platform default codec.
            with open(path.join(BASE_URL, topic, filename), "r", encoding="utf-8") as file:
                text = file.read()
            # Only the keys are used: each file contributes at most 1 per bigram.
            for bigram in getUniqueBigramsWords(text):
                topic_counts[bigram] = topic_counts.get(bigram, 0) + 1

    print("Total Unique Bigrams from files are ", len(unique_bigram_set))