In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from urllib.parse import quote

In [6]:
# Runs of three or more consecutive newlines.
whitespaces = re.compile("\n[\n]+\n")
# Bracketed reference markers such as "[1]" or "[citation needed]" (non-greedy).
# Raw string: "\[" in a normal string literal is an invalid escape sequence.
refs = re.compile(r"\[.*?\]")

def clean_page(page):
    """Extract the paragraph text of an HTML page as plain text.

    Every <p> element is flattened to its text content and stripped of
    surrounding whitespace. Empty paragraphs are dropped, and bracketed
    markers matched by the module-level ``refs`` pattern are removed.

    Returns the surviving paragraphs, each ending in a newline, joined with
    an additional newline between them.
    """
    soup = BeautifulSoup(page, 'html.parser')
    paragraphs = []
    for tag in soup.find_all('p'):
        text = "".join(tag.strings).strip() + "\n"
        if text == "\n":
            # paragraph had no visible text
            continue
        paragraphs.append(refs.sub("", text))
    return "\n".join(paragraphs)

In [7]:
import requests

def fetch_titles(category):
    """Return the titles of the members of a Wikipedia category.

    Parameters
    ----------
    category : str
        Category name without the "Category:" prefix; spaces are converted
        to underscores.

    Returns
    -------
    list of str
        Titles found under ``query.pages`` in the API response. An empty
        list is returned when the response carries no ``query`` section
        instead of raising ``KeyError``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    """
    # Passing the query as `params` lets requests percent-encode the category
    # name; the parameter set is carried over unchanged from the original URL.
    params = {
        "action": "query",
        "generator": "categorymembers",
        "gcmtitle": "Category:" + "_".join(category.split(" ")),
        "cmprop": "title",
        "cllimit": "max",
        "gcmlimit": "max",
        "format": "json",
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
    response.raise_for_status()
    out = response.json()
    pages = out.get("query", {}).get("pages", {})
    titles = [q['title'] for q in pages.values()]
    return titles

def get_page_text_list(category, num):
    """Collect the cleaned text of up to ``num`` pages from a Wikipedia category.

    Titles are gathered from ``category`` and, while fewer than ``num`` have
    been found, from its sub-categories (most recently discovered first).
    Each collected page is then downloaded and passed through ``clean_page``.

    Parameters
    ----------
    category : str
        Name of the starting category, without the "Category:" prefix.
    num : int
        Maximum number of page texts to return.

    Returns
    -------
    list of str
        At most ``num`` cleaned page texts. Fewer are returned when the
        category tree is exhausted or some pages cannot be downloaded.
    """
    category_prefix = "Category:"
    skipped_prefixes = ("Portal:", "Template:")

    pages = []
    pending = [category]
    visited = set()
    # Stop when enough titles are collected OR there is nothing left to
    # explore (the original indexed an empty list in the latter case).
    while len(pages) < num and pending:
        current = pending.pop()
        if current in visited:
            # guard against revisiting a category reachable via several paths
            continue
        visited.add(current)
        for title in fetch_titles(current):
            # Match the full "Prefix:" so ordinary articles whose title merely
            # starts with the word (e.g. "Category theory") are kept.
            if title.startswith(category_prefix):
                pending.append(title[len(category_prefix):])
            elif not title.startswith(skipped_prefixes):
                pages.append(title)

    texts = []
    for title in pages:
        # Percent-encode the title so non-ASCII characters form a valid URL.
        url = "https://en.wikipedia.org/wiki/" + quote("_".join(title.split(" ")), safe="/:")
        try:
            # `with` closes the HTTP response once the body has been read.
            with urlopen(url) as response:
                html = response.read().decode("utf-8")
        except OSError:
            # URLError/HTTPError are OSError subclasses: skip unreachable pages.
            continue
        texts.append(clean_page(html))
        if len(texts) == num:
            break

    return texts

In [8]:
import pandas as pd

# Download `num_pages` page texts per category and build parallel lists of
# documents and their category labels.
categories = ["Sports", "Education"]
num_pages = 10
docs = []
labels = []
for c in categories:
    print("Fetching {}...".format(c))
    pages = get_page_text_list(c, num_pages)
    docs.extend(pages)
    # get_page_text_list may return fewer than num_pages texts; label exactly
    # what was fetched so docs and labels stay aligned.
    labels.extend([c] * len(pages))

Fetching Sports...
Fetching Education...


In [9]:
# pick a subset of the pages
# Each document is reduced to a window of `num_words` lower-cased alphanumeric
# tokens starting at offset `start`, re-joined with single spaces.
start = 0
num_words = 1000
sub_docs = []
for d in docs:
    tokens = [w.lower() for w in word_tokenize(d) if w.isalnum()]
    window = tokens[start:start+num_words]
    sub_docs.append(" ".join(window))

# One row per document: truncated text plus its category label.
data = {"docs": sub_docs, "category": labels}
df = pd.DataFrame(data)

In [11]:
# Save the docs/category table to text.csv (path relative to the working directory).
df.to_csv("text.csv")

In [10]:
# Display the docs/category table built from sub_docs and labels.
df

Unnamed: 0,docs,category
0,sport pertains to any form of physical activit...,Sports
1,casa do pessoal do porto do lobito or simply k...,Sports
2,gombe united football club is a nigerian footb...,Sports
3,aibi international is a company operating in t...,Sports
4,an airboard is an inflatable bodyboard for the...,Sports
5,ali on the run is a fitness podcast hosted by ...,Sports
6,aquabiking also called aqua cycling is the com...,Sports
7,an assault course also called trim trail is a ...,Sports
8,avestavallen is a sports complex in avesta it ...,Sports
9,balloon baloun or is a game similar to the mod...,Sports


In [57]:
from collections import Counter

# Tokenise every document once; the tokens feed both the vocabulary and the
# per-document counts.
tokenized_docs = [word_tokenize(d) for d in sub_docs]
all_words = [w for tokens in tokenized_docs for w in tokens]
unique_words = set(all_words)

# One dict per document mapping every vocabulary word to its count in that
# document (0 when the word does not occur).
unigram_matrix = []
for tokens in tokenized_docs:
    tally = Counter(tokens)
    unigram_matrix.append({w: tally[w] for w in unique_words})

uni_df = pd.DataFrame(unigram_matrix)
uni_df.head()

Unnamed: 0,bahia,services,sorted,mainstream,university,emotions,craft,folkestone,skills,kind,...,league,along,1975,forming,schoenebeck,organizational,academic,513,improve,or
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,2,25
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# Display the unigram count matrix (one row per document in sub_docs).
uni_df

Unnamed: 0,bahia,services,sorted,mainstream,university,emotions,craft,folkestone,skills,kind,...,league,along,1975,forming,schoenebeck,organizational,academic,513,improve,or
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,2,25
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,1,1,0,0,...,0,2,0,0,0,0,0,0,1,9
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [58]:
import nltk

# Vocabulary of bigrams: every adjacent token pair seen in any document.
all_bigrams = []
for d in sub_docs:
    all_bigrams.extend(nltk.bigrams(word_tokenize(d)))
unique_bigrams = set(all_bigrams)
bigram_matrix = []

# For each document, store count(w1, w2) / count(w1) for every bigram it
# contains; bigrams absent from the document keep the value 0.
for i, d in enumerate(sub_docs):
    prob = {bg: 0 for bg in unique_bigrams}
    # Fixed: tokenize the document `d`; the original passed the
    # `word_tokenize` function object itself to nltk.bigrams.
    bigram_counts = Counter(nltk.bigrams(word_tokenize(d)))
    for bigram, count in bigram_counts.items():
        prev_word = bigram[0]
        # Row i of unigram_matrix holds this document's word counts; the
        # first word of a bigram always occurs in the document, so this is >= 1.
        prev_word_count = unigram_matrix[i][prev_word]
        prob[bigram] = count / prev_word_count
    bigram_matrix.append(prob)

bi_df = pd.DataFrame(bigram_matrix)
bi_df.head()

Unnamed: 0,"(z, has)","(top, flight)","(available, to)","(learning, agenda)","(primary, role)","(while, spending)","(ice, hockey)","(wikiproject, tags)","(events, such)","(became, more)",...,"(issues, sorted)","(business, decisions)","(subject, and)","(pertains, to)","(made, books)","(structures, and)","(more, importantly)","(education, as)","(the, better)","(among, all)"
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
"""
Term frequency (tf) is the same as the unigram count computed in the unigram matrix.
We only need to compute the inverse document frequency (idf) of each term,
then multiply each term's frequency by its idf to get the tf-idf table.
idf(t) = log2((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t
""";

In [60]:
from math import log2
# Computing idf
# idf(t) = log2((1 + n) / (1 + df(t))) + 1, where df(t) is the number of
# documents whose token set contains t.
n = len(sub_docs)
doc_terms = [set(word_tokenize(d)) for d in sub_docs]
idf = {
    term: log2((1+n)/(1+sum(term in terms for terms in doc_terms))) + 1
    for term in unique_words
}
    

In [61]:
from copy import deepcopy
# tf-idf matrix
# Scale every unigram count by the idf of its term; one dict per document,
# with the same keys and key order as the corresponding unigram row.
tfidf_matrix = [
    {term: freq * idf[term] for term, freq in uni_count.items()}
    for uni_count in unigram_matrix
]

tfidf_df = pd.DataFrame(tfidf_matrix)
tfidf_df.head()

Unnamed: 0,bahia,services,sorted,mainstream,university,emotions,craft,folkestone,skills,kind,...,league,along,1975,forming,schoenebeck,organizational,academic,513,improve,or
0,0.0,0.0,0.0,4.392317,0.0,0.0,0.0,0.0,2.584963,0.0,...,0.0,0.0,0.0,4.392317,0.0,0.0,0.0,0.0,6.140779,34.807936
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.807355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.392317
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.61471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.392317
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
from sklearn.naive_bayes import MultinomialNB

# Integer targets: "Sports" -> 0, any other category -> 1.
y = df["category"].map(lambda label: int(label != "Sports"))
# A single classifier instance, refit on each feature set further down.
naive_bayes = MultinomialNB()

In [77]:
# Unseen document to classify with each of the feature representations.
new_doc = "sports is good for health"

In [78]:
# Preprocess the new document exactly like the training documents (keep
# alphanumeric tokens, lower-case them, re-tokenize the joined text) so that
# its tokens can actually match the training vocabulary.
new_doc_clean = " ".join(w.lower() for w in word_tokenize(new_doc) if w.isalnum())
new_tokens = word_tokenize(new_doc_clean)
new_counts = Counter(new_tokens)

# Unigram features: count of every vocabulary word in the new document;
# words outside the training vocabulary are ignored.
uni_vector = {w: new_counts[w] for w in unique_words}

# Bigram features: count(w1, w2) / count(w1) within the new document, only
# for bigrams seen during training.
bi_vector = {bg: 0 for bg in unique_bigrams}
bigram_counts = Counter(nltk.bigrams(new_tokens))
for bigram, count in bigram_counts.items():
    if bigram in bi_vector:
        # The first word of a bigram always occurs in the document, so the
        # denominator is at least 1.
        bi_vector[bigram] = count / new_counts[bigram[0]]

# tf-idf features: unigram counts weighted by the training idf.
term_freqs = {term: freq * idf[term] for term, freq in uni_vector.items()}

print(uni_vector)
print(bi_vector)
print(term_freqs)

{'bahia': 0, 'services': 0, 'sorted': 0, 'mainstream': 0, 'university': 0, 'emotions': 0, 'craft': 0, 'folkestone': 0, 'skills': 0, 'kind': 0, 'scholars': 0, 'fide': 0, 'studies': 0, 'handle': 0, 'volleyball': 0, 'weak': 0, 'connotation': 0, 'nature': 0, 'ecopedagogy': 0, 'differently': 0, 'system': 0, 'then': 0, 'socializes': 0, 'critical': 0, 'promote': 0, 'motorcycle': 0, 'assault': 0, 'zoomers': 0, 'conseil': 0, 'team': 0, 'bracer': 0, 'depends': 0, 'developing': 0, 'leading': 0, 'maria': 0, 'compensation': 0, 'software': 0, 'contests': 0, '10': 0, 'coherently': 0, 'twin': 0, 'cism': 0, 'actively': 0, 'ahead': 0, 'publisher': 0, 'around': 0, 'knowledgeable': 0, 'mazes': 0, 'fears': 0, 'replaces': 0, 'socialization': 0, 'prevent': 0, 'farmers': 0, 'impact': 0, 'wikipedia': 0, 'our': 0, 'dilemma': 0, 'casa': 0, 'size': 0, 'people': 0, 'english': 0, 'externalization': 0, 'sided': 0, 'wide': 0, 'stipulate': 0, 'immersion': 0, 'besides': 0, 'musashino': 0, 'social': 0, 'carl': 0, 'likew

In [79]:
# Spot-check the tf-idf weight of "education" in the new document's vector.
term_freqs["education"]

0.0

In [80]:
# unigram prediction
# Refit the shared classifier on raw unigram counts, then label the new document.
naive_bayes.fit(uni_df, y)
new_doc_features = pd.DataFrame([uni_vector])
pred = naive_bayes.predict(new_doc_features)[0]
print("Sports" if pred == 0 else "Education")

Sports


In [81]:
#bigram prediction
# Refit the shared classifier on bigram probabilities, then label the new document.
naive_bayes.fit(bi_df, y)
new_doc_features = pd.DataFrame([bi_vector])
pred = naive_bayes.predict(new_doc_features)[0]
print("Sports" if pred == 0 else "Education")

Education


In [82]:
#tfidf prediction
# Refit the shared classifier on tf-idf weights, then label the new document.
naive_bayes.fit(tfidf_df, y)
new_doc_features = pd.DataFrame([term_freqs])
pred = naive_bayes.predict(new_doc_features)[0]
print("Sports" if pred == 0 else "Education")

Sports
