In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tqdm

In [None]:
def load_clickbait_data(path='./clickbait_data.csv', test_size=0.1, random_state=17):
    """Load the clickbait headline dataset and split it into train and test parts.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must provide a ``headline`` column (text)
        and a ``clickbait`` column (label, cast to ``int`` here).
    test_size : float
        Passed to ``train_test_split`` as the held-out share of the data.
    random_state : int
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    texts : pandas.Series
        Training headlines (kept as a Series, unlike ``test_texts``).
    labels : numpy.ndarray of int
        Training labels.
    test_texts : numpy.ndarray
        Held-out headlines.
    test_labels : numpy.ndarray of int
        Held-out labels.
    """
    full_df = pd.read_csv(path)
    train_df, test_df = train_test_split(
        full_df, test_size=test_size, random_state=random_state
    )

    texts = train_df['headline']
    labels = train_df['clickbait'].values.astype(int)
    test_texts = test_df['headline'].values
    test_labels = test_df['clickbait'].values.astype(int)

    return texts, labels, test_texts, test_labels

In [None]:
def feature_extraction(texts, test_texts):
    """Convert headlines into bag-of-words count matrices.

    The vocabulary is learned from ``texts`` only (at most 10000 features,
    with the built-in English stop-word list applied) and then reused to
    vectorize ``test_texts``.

    Returns the training matrix, the test matrix and the fitted vectorizer,
    in that order.
    """
    bow = CountVectorizer(stop_words='english', max_features=10000)
    train_counts = bow.fit_transform(texts)
    test_counts = bow.transform(test_texts)
    return train_counts, test_counts, bow

In [None]:
def train_naive_bayes(X, labels, vectorizer):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    X : scipy sparse matrix, shape (n_docs, n_tokens)
        Token counts per document.
    labels : array-like of int, shape (n_docs,)
        Class label of every document.
    vectorizer : fitted vectorizer
        Only ``get_feature_names_out()`` is used, to name the token columns.

    Returns
    -------
    prob_token_count_per_class : pandas.DataFrame, shape (n_classes, n_tokens)
        ``log P(token | class)``. Rows are in ascending class-label order with
        a plain 0..n_classes-1 index; columns are the vocabulary tokens.
    log_prob_class : pandas.Series
        ``log P(class)``, indexed by class label in ascending order, so its
        positional order matches the rows of the token table.
    """
    labels = np.asarray(labels).astype(int)

    # log_prob_class
    # value_counts() orders by frequency; sort by label so that position i of
    # the priors and row i of the token table describe the same class.
    class_count_ = pd.Series(labels, name='--label--').value_counts().sort_index()
    log_prob_class = np.log(class_count_) - np.log(class_count_.sum())

    # log_prob_token
    # Sum each class's rows directly on the sparse matrix instead of
    # densifying the whole (n_docs x n_tokens) matrix.
    X = X.tocsr()  # CSR supports boolean row selection
    token_count_per_class = np.vstack([
        np.asarray(X[labels == cls].sum(axis=0)).ravel()
        for cls in class_count_.index
    ])
    # Add-one smoothing: no token gets zero probability in any class.
    token_count_per_class = token_count_per_class + 1.0
    token_per_class = token_count_per_class.sum(axis=1, keepdims=True)

    prob_token_count_per_class = pd.DataFrame(
        np.log(token_count_per_class) - np.log(token_per_class),
        columns=vectorizer.get_feature_names_out(),
    )

    return prob_token_count_per_class, log_prob_class

In [None]:
def predict_single_doc(doc, vectorizer, prob_token_count_per_class, log_prob_class):
    """Classify a single document with the trained multinomial Naive Bayes model.

    Parameters
    ----------
    doc : str
        The raw document (headline) to classify.
    vectorizer : fitted vectorizer
        Used to turn ``doc`` into token counts over the training vocabulary.
    prob_token_count_per_class : pandas.DataFrame or 2-D array
        ``log P(token | class)``, one row per class in ascending label order.
    log_prob_class : pandas.Series or 1-D array
        ``log P(class)``. A Series is matched to classes by its index labels;
        a plain array is taken to be in ascending label order already.

    Returns
    -------
    pred : int
        0 means ``doc`` is predicted as non-clickbait, 1 as clickbait.
    """
    # Token counts of this document, shape (1, n_tokens); tokens outside the
    # training vocabulary are ignored by the vectorizer.
    token_counts = vectorizer.transform([doc])

    # Put the priors in ascending label order so they line up with the rows of
    # the token table regardless of the order the Series was built in.
    priors = pd.Series(log_prob_class).sort_index()
    token_log_probs = np.asarray(prob_token_count_per_class)

    # Unnormalised log posterior: log P(class) + sum_t count_t * log P(t | class)
    log_likelihood = np.asarray(token_counts @ token_log_probs.T).ravel()
    scores = priors.to_numpy() + log_likelihood

    pred = int(priors.index[np.argmax(scores)])
    return pred

---

In [None]:
# Read the CSV and split it into training and held-out headlines/labels.
texts, labels, test_texts, test_labels = load_clickbait_data()

In [None]:
# Fit the bag-of-words vocabulary on the training headlines and vectorize both splits.
X, X_test, vectorizer = feature_extraction(texts, test_texts)

In [None]:
# Estimate per-class token log-probabilities and class log-priors from the training counts.
prob_token_count_per_class, log_prob_class = train_naive_bayes(X, labels, vectorizer)

In [None]:
# Classify every held-out headline with the hand-written model. tqdm (imported
# at the top of the notebook) reports progress instead of a printed counter.
preds = [
    predict_single_doc(doc, vectorizer, prob_token_count_per_class, log_prob_class)
    for doc in tqdm.tqdm(test_texts, desc="Predicting")
]

In [None]:
# Classification metrics of the hand-written model's predictions against the true test labels.
print(classification_report(test_labels, preds))

---

#### Compare your results with the scikit-learn Naive Bayes implementation

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Reference model: scikit-learn's multinomial Naive Bayes trained on the same
# count matrix and evaluated on the same held-out split.
clf = MultinomialNB().fit(X, labels)
print(
    classification_report(
        test_labels,
        clf.predict(X_test),
    )
)

##### Make sure the results from your implementation are the same as, or similar to, those of the scikit-learn implementation