In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Load the bare list of article URLs; the file has no header row, so the
# single column is named explicitly.
url_df = pd.read_csv('/content/url_only_data.csv', header=None, names=['url'])

# Tag every URL with its outlet purely by row position: rows 0-2009 are
# labelled FoxNews and every later row NBC.
n_urls = len(url_df)
n_fox = min(2010, n_urls)
url_df['source'] = ['FoxNews'] * n_fox + ['NBC'] * (n_urls - n_fox)

def scrape_headline(url, source):
    """Download an article page and return its headline text.

    Args:
        url: Address of the article to fetch.
        source: Outlet name. 'FoxNews' selects the Fox-specific headline
            element; any other value falls back to the first <h1> on the page.

    Returns:
        The stripped headline string, or None when the request fails, the
        response status is not 200, no headline element is found, or the
        headline text is blank.
    """
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')

        if source == 'FoxNews':
            # find(class_='headline speakable') only matches that exact
            # attribute string; a CSS selector matches both classes whatever
            # their order and regardless of any additional classes.
            h1 = soup.select_one('h1.headline.speakable')
        else:
            h1 = soup.find('h1')

        if h1 is None:
            return None
        # A blank headline is reported as missing: an empty string would be
        # written to CSV as an empty cell and come back as NaN on reload.
        return h1.text.strip() or None
    except Exception:
        # Deliberately best-effort: one bad URL or malformed page must not
        # abort a long scraping run, so every failure maps to "no headline".
        return None

# Scrape every URL in row order, pausing half a second after each request.
headlines = []
for page_url, outlet in zip(url_df['url'], url_df['source']):
    headlines.append(scrape_headline(page_url, outlet))
    time.sleep(0.5)

url_df['headline'] = headlines

# Keep only the rows for which a headline was obtained, then persist them.
url_df = url_df.dropna(subset=['headline'])
url_df.to_csv('/content/scraped_headlines.csv', index=False)

url_df.head()


Unnamed: 0,url,source,headline
1,https://www.foxnews.com/lifestyle/jack-carrs-e...,FoxNews,Jack Carr recalls Gen. Eisenhower's D-Day memo...
2,https://www.foxnews.com/entertainment/bruce-wi...,FoxNews,"Bruce Willis, Demi Moore avoided doing one thi..."
4,https://www.foxnews.com/entertainment/emily-bl...,FoxNews,Emily Blunt says her ‘toes curl’ when people t...
5,https://www.foxnews.com/media/the-view-co-host...,FoxNews,"'The View' co-host, CNN commentator Ana Navarr..."
8,https://www.foxnews.com/media/tom-cotton-turns...,FoxNews,Tom Cotton turns tables on CNN's Dana Bash on ...


In [None]:
import re
import string

# Reload the scraped headlines from disk so this cell does not depend on the
# in-memory frame produced by the scraping cell.
df = pd.read_csv('/content/scraped_headlines.csv')

# Clean headlines
def clean_text(text):
    """Normalise a headline before vectorisation.

    Lower-cases the text, removes HTML-like tags, drops every character that
    is not an ASCII letter, digit or whitespace, and collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw headline. None and NaN (what pandas yields for an empty CSV
            cell) are treated as empty text; any other non-string value is
            converted with str().

    Returns:
        The cleaned string, which may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalised text that the models are trained on.
df['cleaned_headline'] = df['headline'].map(clean_text)

# Binary target: 1 for FoxNews rows, 0 for every other source.
df['label'] = (df['source'] == 'FoxNews').astype(int)

df.to_csv('/content/cleaned_headlines.csv', index=False)

df[['cleaned_headline', 'label']].head()


Unnamed: 0,cleaned_headline,label
0,jack carr recalls gen eisenhowers dday memo ab...,1
1,bruce willis demi moore avoided doing one thin...,1
2,emily blunt says her toes curl when people tel...,1
3,the view cohost cnn commentator ana navarro to...,1
4,tom cotton turns tables on cnns dana bash on g...,1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv('/content/cleaned_headlines.csv')

# A headline that cleaned down to an empty string is stored as an empty CSV
# cell and read back as NaN, which TfidfVectorizer refuses to process;
# restore such cells to empty strings.
X = df['cleaned_headline'].fillna('')
y = df['label']
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
# The vocabulary is learned from the training split only and then reused,
# unchanged, on the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=100)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6512
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.79      0.69       337
           1       0.70      0.51      0.59       331

    accuracy                           0.65       668
   macro avg       0.66      0.65      0.64       668
weighted avg       0.66      0.65      0.64       668



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv('/content/cleaned_headlines.csv')
# Empty cleaned headlines come back from CSV as NaN, which TfidfVectorizer
# refuses to process; restore them to empty strings.
X = df['cleaned_headline'].fillna('')
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Unigram + bigram TF-IDF features, capped at 5000 terms; fitted on the
# training split only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Candidate classifiers, all evaluated on the same features and split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fit each candidate and report its held-out accuracy and per-class metrics.
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))



Logistic Regression Accuracy: 0.7784
              precision    recall  f1-score   support

           0       0.75      0.84      0.79       337
           1       0.82      0.71      0.76       331

    accuracy                           0.78       668
   macro avg       0.78      0.78      0.78       668
weighted avg       0.78      0.78      0.78       668


Naive Bayes Accuracy: 0.7904
              precision    recall  f1-score   support

           0       0.76      0.86      0.81       337
           1       0.83      0.72      0.77       331

    accuracy                           0.79       668
   macro avg       0.80      0.79      0.79       668
weighted avg       0.80      0.79      0.79       668


Random Forest Accuracy: 0.7515
              precision    recall  f1-score   support

           0       0.76      0.75      0.75       337
           1       0.75      0.75      0.75       331

    accuracy                           0.75       668
   macro avg       0.75     

In [None]:
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv('/content/cleaned_headlines.csv')
# Empty cleaned headlines come back from CSV as NaN, which TfidfVectorizer
# refuses to process; restore them to empty strings.
X = df['cleaned_headline'].fillna('')
y = df['label']

# Fit the vectorizer and classifier on every labelled headline (no hold-out
# split in this cell).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(X)

model = MultinomialNB()
model.fit(X_tfidf, y)

def predict_news_source(headlines):
    """Predict the outlet for each raw headline.

    Relies on the module-level `vectorizer` and `model` fitted earlier in the
    notebook.

    Args:
        headlines: Iterable of raw headlines. Items are converted with str()
            before cleaning.

    Returns:
        A list with one label per headline, in input order: 'NBC' where the
        model predicts 0 and 'FoxNews' otherwise. An empty input yields an
        empty list.
    """
    cleaned = []
    for h in headlines:
        # Apply the same steps as the training-time cleaning (tag removal,
        # character filtering, whitespace collapsing) so inference sees text
        # normalised the same way as the fitted vocabulary.
        text = re.sub(r'<.*?>', '', str(h).lower())
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        cleaned.append(re.sub(r'\s+', ' ', text).strip())

    # Nothing to score: skip the vectorizer/model, which expect at least one
    # sample.
    if not cleaned:
        return []

    tfidf = vectorizer.transform(cleaned)
    preds = model.predict(tfidf)
    labels = ['NBC' if p == 0 else 'FoxNews' for p in preds]
    return labels


In [None]:
# Hand-written headlines used as a quick sanity check of the fitted model.
sub_testset = [
    "Biden outlines new education policy during speech in Pennsylvania",
    "Fox News anchor clashes with guest over inflation numbers",
    "NBC exclusive: Inside the Pentagon's new space command",
    "Trump holds rally amid criminal investigations and indictments",
    "NBC reports on increasing rent prices across the country",
    "Fox News investigates Hunter Biden business dealings",
    "Supreme Court to hear major case on abortion access",
    "Fox News personality criticizes media coverage of border crisis",
    "NBC hosts town hall on climate change initiatives",
    "Fox & Friends segment covers California's crime rates",
    "NBC reveals findings from latest COVID-19 study",
    "Fox News panel debates Biden's foreign policy stance",
    "NBC analyst discusses 2024 election predictions",
    "Fox News questions timing of DOJ investigation",
    "NBC coverage of women's rights march in Washington",
    "Fox News airs interview with conservative think tank leader",
    "NBC reports on new infrastructure bill progress",
    "Fox News criticizes 'woke' culture in universities",
    "NBC poll shows declining trust in government",
    "Fox News anchor challenges guest on tax policy"
]

predictions = predict_news_source(sub_testset)

# Print a numbered list pairing each headline with its predicted outlet.
for i, headline in enumerate(sub_testset):
    pred = predictions[i]
    print(f"{i+1}. [{pred}] {headline}")


1. [NBC] Biden outlines new education policy during speech in Pennsylvania
2. [FoxNews] Fox News anchor clashes with guest over inflation numbers
3. [NBC] NBC exclusive: Inside the Pentagon's new space command
4. [NBC] Trump holds rally amid criminal investigations and indictments
5. [FoxNews] NBC reports on increasing rent prices across the country
6. [FoxNews] Fox News investigates Hunter Biden business dealings
7. [NBC] Supreme Court to hear major case on abortion access
8. [FoxNews] Fox News personality criticizes media coverage of border crisis
9. [NBC] NBC hosts town hall on climate change initiatives
10. [FoxNews] Fox & Friends segment covers California's crime rates
11. [FoxNews] NBC reveals findings from latest COVID-19 study
12. [NBC] Fox News panel debates Biden's foreign policy stance
13. [NBC] NBC analyst discusses 2024 election predictions
14. [FoxNews] Fox News questions timing of DOJ investigation
15. [FoxNews] NBC coverage of women's rights march in Washington
16. [Fox

In [None]:
import joblib

# Bundle the fitted classifier with its vectorizer so both are saved to, and
# later restored from, a single file.
model_bundle = {"model": model, "vectorizer": vectorizer}
joblib.dump(model_bundle, "naive_bayes_model.pkl")

['naive_bayes_model.pkl']

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before the upload in the next cell.
login()

In [None]:
from huggingface_hub import upload_file

# Publish the serialized bundle to the Hub model repository, keeping the same
# filename in the repo as on disk.
MODEL_FILE = "naive_bayes_model.pkl"
upload_file(
    path_or_fileobj=MODEL_FILE,
    path_in_repo=MODEL_FILE,
    repo_id="nehirsunargs/nb-model",
    repo_type="model",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


naive_bayes_model.pkl:   0%|          | 0.00/353k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nehirsunargs/nb-model/commit/7f79d2082eff63c02049e959ab799a3f76113151', commit_message='Upload naive_bayes_model.pkl with huggingface_hub', commit_description='', oid='7f79d2082eff63c02049e959ab799a3f76113151', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nehirsunargs/nb-model', endpoint='https://huggingface.co', repo_type='model', repo_id='nehirsunargs/nb-model'), pr_revision=None, pr_num=None)

In [None]:
import pandas as pd
import re
import joblib
from huggingface_hub import hf_hub_download

df = pd.read_csv('final_test_data.csv')

# Load model and vectorizer from Hugging Face
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load bundles from a repository you trust.
bundle_path = hf_hub_download("nehirsunargs/nb-model", "naive_bayes_model.pkl")
model_bundle = joblib.load(bundle_path)
model, vectorizer = model_bundle["model"], model_bundle["vectorizer"]

# Clean headlines
def clean_text(text):
    """Normalise a headline before it is passed to the vectorizer.

    Applies the same tag stripping, character filtering and whitespace
    collapsing as the cleaner used when the training headlines were prepared,
    so inference text is normalised the same way as the training text.

    Args:
        text: Raw headline. None and NaN (what pandas yields for an empty CSV
            cell) are treated as empty text rather than becoming the literal
            tokens 'none'/'nan'; other values are converted with str().

    Returns:
        The lower-cased string containing only ASCII letters, digits and
        single spaces; may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'<.*?>', '', str(text).lower())
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

df['cleaned'] = df['Headline'].apply(clean_text)
# Reuse the downloaded vectorizer as-is (transform only, no refitting) so the
# features line up with what the model was trained on.
X_test = vectorizer.transform(df['cleaned'])
preds = model.predict(X_test)

# Convert numerical predictions to label strings
df['Label(FoxNews/NBC)'] = ['FoxNews' if p == 1 else 'NBC' for p in preds]

submission_df = df[['ID', 'Headline', 'Label(FoxNews/NBC)']]

submission_df.to_csv('test_submission.csv', index=False)

# `files` is not imported in any visible cell, so on a fresh kernel the
# download call would raise NameError. Import it here, and skip the browser
# download when not running inside Colab (the CSV is already on disk).
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('test_submission.csv')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


naive_bayes_model.pkl:   0%|          | 0.00/353k [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>