In [244]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tqdm

In [245]:
def load_clickbait_data():
    df = pd.read_csv('./clickbait_data.csv')
    df, test_df = train_test_split(df, test_size=0.1, random_state=17)
    texts = df['headline']
    labels = df['clickbait'].values.astype(int)
    test_texts = test_df['headline'].values
    test_labels = test_df['clickbait'].values.astype(int)
    
    return texts, labels, test_texts, test_labels

In [246]:
def feature_extraction(texts, test_texts):
    vectorizer = CountVectorizer(max_features=10000, stop_words='english')
    X = vectorizer.fit_transform(texts)
    X_test = vectorizer.transform(test_texts)
    
    return X, X_test, vectorizer

In [247]:
def train_naive_bayes(X, labels, vectorizer):
    X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    
    X_df['--label--'] = labels.astype(int)
    
    # log_prob_class
    class_count_ = X_df['--label--'].value_counts()
    log_class_count = np.log(class_count_)
    log_prob_class = log_class_count - np.log(class_count_.sum())
    
    # log_prob_token
    token_count_per_class = X_df.groupby('--label--').sum().reset_index(level=0, drop=True)
    token_count_per_class = token_count_per_class + 1.0
    token_per_class = token_count_per_class.sum(1).values.reshape(-1, 1)
    
    prob_token_count_per_class = np.log(token_count_per_class) - np.log(token_per_class)
    
    return prob_token_count_per_class, log_prob_class

In [256]:
def predict_single_doc(doc, vectorizer, prob_token_count_per_class, log_prob_class):
    # TODO
    # This function input a document ``doc" and return the prediction ``pred" (numeric output of 0 or 1)
    # An output ``pred" of 0 means we predict the document ``doc" as non-clickbait
    # An output ``pred"of 1 means we predict the document ``doc" as clickbait
    c_sum = np.array([0.0,0.0])
    c_sum += np.sum([prob_token_count_per_class[token]\
            for token in doc.split() if token in vectorizer.get_feature_names_out()],0)
    nonclickbait = c_sum[0]+log_prob_class[0]
    clickbait = c_sum[1]+log_prob_class[1]
    pred = np.argmax([nonclickbait, clickbait])
    # END TODO
    return pred

---

In [249]:
texts, labels, test_texts, test_labels = load_clickbait_data()

In [250]:
X, X_test, vectorizer = feature_extraction(texts, test_texts)

In [251]:
prob_token_count_per_class, log_prob_class = train_naive_bayes(X, labels, vectorizer)

In [257]:
preds = []
for i, doc in enumerate(test_texts):
    preds.append(predict_single_doc(doc, vectorizer, prob_token_count_per_class, log_prob_class))
    if i % 500 == 0:
        print("Done", i)

-0.6952326870509662 -0.6910660143561209
Done 0
-17.14661481429669 -23.458703125520923
-43.889863874317705 -51.80201630228768
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-6.554229701205301 -12.074884569938522
-20.317699975328544 -20.819645795905664
-0.6952326870509662 -0.6910660143561209
-21.709512546468687 -30.485812854413734
-25.16646361971268 -32.645297103767106
-8.917439416015782 -5.822980686772634
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-0.6952326870509662 -0.6910660143561209
-32.13236355885125 -40.650391133539415
-17.938925743976952 -22.360090836852816
-0.6952326870509662 -0.6910660143561209
-39.30585549555108 -46.38651668111666
-16.03296464826404 -22.360090836852816
-21.194174310745233 -30.748177118881223
-8.694295864701571 -6.975018142114323
-0.69

KeyboardInterrupt: 

In [253]:
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.99      0.61      0.75      1631
           1       0.71      0.99      0.83      1569

    accuracy                           0.80      3200
   macro avg       0.85      0.80      0.79      3200
weighted avg       0.85      0.80      0.79      3200



---

#### Compare your results with the scikit-learn Naive Bayes implementation

In [254]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, labels)
print(classification_report(test_labels, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1631
           1       0.95      0.97      0.96      1569

    accuracy                           0.96      3200
   macro avg       0.96      0.96      0.96      3200
weighted avg       0.96      0.96      0.96      3200



##### Make sure the results from your implementation is the same or similar to the one implemented in scikit-learn