In [9]:
# Scrape n documents of two categories from Wikipdia (say sports and education)
# - Preprocess and clean all the documents
# - Prepare 
#     - Unigram count Matrix
#     - Bigram Probability Matrix
#     - TF-IDF Matrix
# - Apply appropriate Naive Bayes classification on each matrix

In [1]:
sports_urls = [
	"https://en.wikipedia.org/wiki/Cricket",
 	"https://en.wikipedia.org/wiki/Hockey",
 	"https://en.wikipedia.org/wiki/Football",
]
education_urls = [
	"https://en.wikipedia.org/wiki/Education",
 	"https://en.wikipedia.org/wiki/Knowledge",
 	"https://en.wikipedia.org/wiki/Procedural_knowledge",
]

In [2]:
%pip install requests beautifulsoup4 nltk contractions pandas numpy scikit_learn

Note: you may need to restart the kernel to use updated packages.


In [12]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text).strip().lower()
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    tokens = [word for word in tokens if len(word) > 1]
    text = ' '.join(tokens)
    
    return text

def scrape_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    text = '\n'.join([para.get_text() for para in paragraphs])
    return text

preprocessed_texts = []
for url in sports_urls + education_urls:
    print(f"Scraping data from: {url}")
    text = scrape_wikipedia(url)
    cleaned_text = preprocess_text(text)
    print(cleaned_text)
    preprocessed_texts.append(cleaned_text)
    print("="*50)
    print("\n")

Scraping data from: https://en.wikipedia.org/wiki/Cricket


firstclass cricket one day international limited over domestic twenty international twenty domestic form cricket batandball game played two team eleven player field centre yard metre pitch wicket end comprising two bail balanced three stump two player batting team striker nonstriker stand front either wicket one player fielding team bowler bowling ball towards striker wicket opposite end pitch striker goal hit bowled ball switch place nonstriker batting team scoring one run exchange run also scored ball reach cross boundary field ball bowled illegally fielding team try prevent run scored dismissing batter mean dismissal include bowled ball hit striker wicket dislodges bail fielding side either catching ball hit bat hit ground hitting wicket ball batter cross crease front wicket ten batter dismissed inning end team swap role game adjudicated two umpire aided third umpire match referee international match communicate two offfield scorer record match statistical information form cricket r

In [13]:
import numpy as np
import pandas as pd
from nltk import bigrams
from collections import Counter


def compute_unigram_count_matrix(texts):
    word_counts = [Counter(text.split()) for text in texts]
    unique_words = sorted(set(word for word_count in word_counts for word in word_count))
    unigram_count_matrix = np.zeros((len(texts), len(unique_words)), dtype=int)
    for i, word_count in enumerate(word_counts):
        for j, word in enumerate(unique_words):
            unigram_count_matrix[i, j] = word_count[word]
    return pd.DataFrame(unigram_count_matrix, columns=unique_words)


def compute_bigram_prob_matrix(texts):
    all_bigrams = [bigram for text in texts for bigram in bigrams(text.split())]
    bigram_counts = Counter(all_bigrams)
    bigram_prob_matrix = np.zeros((len(texts), len(bigram_counts)), dtype=float)
    unique_bigrams = sorted(set(all_bigrams))
    for i, _ in enumerate(texts):
        for j, bigram in enumerate(unique_bigrams):
            bigram_prob_matrix[i, j] = bigram_counts[bigram] / len(all_bigrams)
    return pd.DataFrame(bigram_prob_matrix, columns=unique_bigrams)
  
  
def compute_tfidf_matrix(unigram_count_matrix, texts):
    tf_matrix = unigram_count_matrix.apply(lambda x: x / x.sum(), axis=1)
    idf_vector = unigram_count_matrix.apply(lambda x: np.log(len(texts) / (x > 0).sum()), axis=0)
    tfidf_matrix = tf_matrix * idf_vector
    return tfidf_matrix


unigram_count_matrix = compute_unigram_count_matrix(preprocessed_texts)
unigram_count_matrix.index = [x.split('/')[-1] for x in sports_urls + education_urls]
print("Unigram Count Matrix:")
print(unigram_count_matrix)
print("="*50)
print("\n")


bigram_prob_matrix = compute_bigram_prob_matrix(preprocessed_texts)
bigram_prob_matrix.index = [x.split('/')[-1] for x in sports_urls + education_urls]
print("Bigram Probability Matrix:")
print(bigram_prob_matrix)
print("="*50)
print("\n")


tfidf_matrix = compute_tfidf_matrix(unigram_count_matrix, preprocessed_texts)
tfidf_matrix.index = [x.split('/')[-1] for x in sports_urls + education_urls]
print("TF-IDF Matrix:")
print(tfidf_matrix)
print("="*50)
print("\n")

Unigram Count Matrix:
                      abandon  abbreviation  abdali  aberdeen  ability  \
Cricket                     2             0       1         0        1   
Hockey                      0             0       0         0        0   
Football                    1             1       0         3        1   
Education                   0             0       0         0        6   
Knowledge                   0             0       0         0        5   
Procedural_knowledge        0             0       0         0        4   

                      abilitycitation  able  abolished  abolition  aboriginal  \
Cricket                             0     0          1          0           1   
Hockey                              1     0          0          0           0   
Football                            0     2          1          1           1   
Education                           0     3          0          0           0   
Knowledge                           0     4          0

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

labels = np.random.choice([0, 1], size=len(preprocessed_texts))
X_train, X_test, y_train, y_test = train_test_split(preprocessed_texts, labels, test_size=0.2, random_state=42)

count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

nb_classifier_unigram = MultinomialNB()
nb_classifier_unigram.fit(X_train_counts, y_train)

y_pred_unigram = nb_classifier_unigram.predict(X_test_counts)

# Evaluate accuracy
accuracy_unigram = accuracy_score(y_test, y_pred_unigram)
print("Accuracy using unigram count matrix:", accuracy_unigram)

# 2. Bigram Probability Matrix
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train_bigram = bigram_vectorizer.fit_transform(X_train)
X_test_bigram = bigram_vectorizer.transform(X_test)

# Train Naive Bayes classifier on bigram probability matrix
nb_classifier_bigram = MultinomialNB()
nb_classifier_bigram.fit(X_train_bigram, y_train)

# Make predictions on the testing set using bigram probability matrix
y_pred_bigram = nb_classifier_bigram.predict(X_test_bigram)

# Evaluate accuracy
accuracy_bigram = accuracy_score(y_test, y_pred_bigram)
print("Accuracy using bigram probability matrix:", accuracy_bigram)

# 3. TF-IDF Matrix
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train Naive Bayes classifier on TF-IDF matrix
nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the testing set using TF-IDF matrix
y_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy using TF-IDF matrix:", accuracy_tfidf)


Accuracy using unigram count matrix: 0.5
Accuracy using bigram probability matrix: 0.5
Accuracy using TF-IDF matrix: 0.5
