In [1]:
categories = {
    "Fiction": [
        "https://www.goodreads.com",
        "https://www.tor.com",
        "https://lithub.com"
    ],
    "Non-Fiction": [
        "https://www.npr.org/sections/books",
        "https://www.theguardian.com/books",
        "https://www.newyorker.com/books"
    ],
    "Science Fiction & Fantasy": [
        "https://www.tor.com",
        "https://www.locusmag.com",
        "https://www.sfsite.com"
    ],
    "Mystery & Thriller": [
        "https://www.crimereads.com",
        "https://www.mysteryscenemag.com",
        "https://www.bookreporter.com"
    ],
    "Classic Literature": [
        "https://www.gutenberg.org",
        "https://www.britannica.com/art/classic-literature",
        "https://www.openculture.com/free_ebooks"
    ],
    "Poetry": [
        "https://www.poetryfoundation.org",
        "https://www.poets.org",
        "https://www.poetryarchive.org"
    ],
    "Children's Books": [
        "https://www.scholastic.com",
        "https://www.booktrust.org.uk",
        "https://www.commonsensemedia.org"
    ],
    "Young Adult (YA)": [
        "https://www.epicreads.com",
        "https://www.teenreads.com",
        "https://www.yalsa.ala.org/thehub/"
    ],
    "Historical Fiction": [
        "https://historicalnovelsociety.org",
        "https://www.bookbub.com/blog/tag/historical-fiction",
        "https://www.nytimes.com/section/books"
    ],
    "Horror & Gothic": [
        "https://www.the-line-up.com",
        "https://www.horrorgeeklife.com",
        "https://www.nightmare-magazine.com"
    ],
    "Graphic Novels & Comics": [
        "https://www.comicbookherald.com",
        "https://www.cbr.com",
        "https://www.pastemagazine.com/comics"
    ],
    "Self-Help & Personal Development": [
        "https://www.success.com",
        "https://jamesclear.com",
        "https://www.lifehack.org"
    ],
    "Book Reviews & Recommendations": [
        "https://bookriot.com",
        "https://www.kirkusreviews.com",
        "https://www.nytimes.com/section/books/review"
    ],
    "Publishing & Writing": [
        "https://www.writersdigest.com",
        "https://www.thecreativepenn.com",
        "https://www.nanowrimo.org"
    ],
    "Libraries & Archives": [
        "https://www.loc.gov",
        "https://www.worldcat.org",
        "https://www.bl.uk"
    ]
}


In [3]:
import os
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

# Create directory for output
os.makedirs("scraped_data", exist_ok=True)

def extract_date(soup):
    """Extract article date from meta tags."""
    date_tags = ["article:published_time", "date", "publish-date", "pubdate"]
    for tag in date_tags:
        date_meta = soup.find("meta", {"property": tag}) or soup.find("meta", {"name": tag})
        if date_meta and date_meta.get("content"):
            return date_meta["content"]
    return datetime.today().strftime("%Y-%m-%d")  # Default to today's date

def clean_text(text):
    """Cleans text by removing unnecessary spaces and line breaks."""
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()

def scrape_and_save(category, urls):
    structured_data = []

    for url in urls:
        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            title = soup.title.string.strip() if soup.title else "Untitled Article"
            date = extract_date(soup)
            paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
            content = "\n".join(paragraphs)

            structured_data.append(f"=== {title} ===\nDate: {date}\nSource: {url}\n--------------------\n{clean_text(content)}\n\n")

        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")

    file_path = os.path.join("scraped_data", f"{category}111.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(structured_data)

    print(f"Structured data saved for category: {category}")



# Scrape data
for category, urls in categories.items():
    scrape_and_save(category, urls)


Structured data saved for category: Fiction
Structured data saved for category: Non-Fiction
Error scraping https://www.sfsite.com: 404 Client Error: Not Found for url: https://www.sfsite.com/
Structured data saved for category: Science Fiction & Fantasy
Error scraping https://www.mysteryscenemag.com: HTTPSConnectionPool(host='www.mysteryscenemag.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))
Structured data saved for category: Mystery & Thriller
Error scraping https://www.britannica.com/art/classic-literature: 404 Client Error: Not Found for url: https://www.britannica.com/art/classic-literature
Structured data saved for category: Classic Literature
Structured data saved for category: Poetry
Structured data saved for category: Children's Books
Error scraping https://www.epicreads.com: 403 Client Error: Forbidden for url: https://www.e

In [6]:
filepath = "scraped_data/Fiction111.txt"
with open(filepath, "r", encoding="utf-8") as file:
    content = file.read()

print(content)

=== Goodreads | Meet your next favorite book ===
Date: 2025-03-09
Source: https://www.goodreads.com
--------------------
You’re in the right place. Tell us what titles or genres you’ve enjoyed in the past, and we’ll give you surprisingly insightful recommendations. Chances are your friends are discussing their favorite (and least favorite) books on Goodreads. Because Brian liked… He discovered: Decision-making, Sociology, Marketing Because Shomeret liked… She discovered: Psychology, Animals, Science, Nature More book lists Gain access to a massive audience of book lovers. Goodreads is a great place to promote your books. Welcome back. Just a moment while we sign you in to your Goodreads account.

=== Homepage - Reactor ===
Date: 2025-03-09
Source: https://www.tor.com
--------------------
Advertisement Science fiction. Fantasy. The universe. And related subjects. The Reactor newsletter is the best way to catch up on the world of science fiction, fantasy, pop culture, and more! Kochin is

In [7]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime

# Create directory for output
os.makedirs("scraped_data", exist_ok=True)

def extract_date(soup):
    """Extract article date from meta tags."""
    date_tags = ["article:published_time", "date", "publish-date", "pubdate"]
    for tag in date_tags:
        date_meta = soup.find("meta", {"property": tag}) or soup.find("meta", {"name": tag})
        if date_meta and date_meta.get("content"):
            return date_meta["content"]
    return datetime.today().strftime("%Y-%m-%d")  # Default to today's date

def clean_text(text):
    """Cleans text by removing unnecessary spaces and line breaks."""
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()

def scrape_and_save(category, urls, all_data):
    for url in urls:
        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            title = soup.title.string.strip() if soup.title else "Untitled Article"
            date = extract_date(soup)
            paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
            content = clean_text(" ".join(paragraphs))

            # Append data to list
            all_data.append([category, title, date, url, content])

        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")

# Collect data
all_data = []
for category, urls in categories.items():
    scrape_and_save(category, urls, all_data)

# Convert to DataFrame
df = pd.DataFrame(all_data, columns=["Category", "Title", "Date", "URL", "Content"])

# Save to CSV
csv_path = os.path.join("scraped_data", "scraped_articles.csv")
df.to_csv(csv_path, index=False, encoding="utf-8")

print(f"Data saved successfully to {csv_path}")


Error scraping https://www.sfsite.com: 404 Client Error: Not Found for url: https://www.sfsite.com/
Error scraping https://www.mysteryscenemag.com: HTTPSConnectionPool(host='www.mysteryscenemag.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))
Error scraping https://www.britannica.com/art/classic-literature: 404 Client Error: Not Found for url: https://www.britannica.com/art/classic-literature
Error scraping https://www.epicreads.com: 403 Client Error: Forbidden for url: https://www.epicreads.com/
Error scraping https://www.bookbub.com/blog/tag/historical-fiction: 403 Client Error: Forbidden for url: https://www.bookbub.com/blog/tag/historical-fiction
Error scraping https://www.cbr.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error scraping https://www.pastemagazine.com/comics: 403 Cli

In [9]:
data = pd.read_csv('/content/scraped_data/scraped_articles.csv')
data.head()
data.shape

(36, 5)

In [10]:
data.dropna(inplace=True)
data.shape

(33, 5)

In [11]:
data.head()

Unnamed: 0,Category,Title,Date,URL,Content
0,Fiction,Goodreads | Meet your next favorite book,2025-03-09,https://www.goodreads.com,You’re in the right place. Tell us what titles...
1,Fiction,Homepage - Reactor,2025-03-09,https://www.tor.com,Advertisement Science fiction. Fantasy. The un...
2,Fiction,Literary Hub,2025-03-09,https://lithub.com,"Featuring Brad Johnson, Emily Temple, James Fo..."
3,Non-Fiction,"Books: Book Reviews, Book News, and Author Int...",2025-03-09,https://www.npr.org/sections/books,"November 25, 2024 •Books We Love returns with ..."
5,Non-Fiction,"Culture: TV, Movies, Music, Art, and Theatre N...",2025-03-09,https://www.newyorker.com/books,Sections More ©2025Condé Nast. All rights rese...


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

# Load your dataset (ensure this is defined)
# data = pd.read_csv("scraped_articles.csv")  # Uncomment if reading from a CSV

# Preprocess the text data
def preprocess_text(text):
    """Convert text to lowercase, remove punctuation, and split into words."""
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    return text.split()

data['Processed_Content'] = data['Content'].apply(preprocess_text)

# Train a Word2Vec model
sentences = data['Processed_Content'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save trained embeddings (optional)
word2vec_model.save("word2vec_model.bin")

# Generate document embeddings using trained Word2Vec embeddings
def document_embedding(words, model):
    """Get average Word2Vec embedding for a document."""
    embeddings = [model.wv.get_vector(word) for word in words if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)  # Default zero vector if no words found

# Convert text to embeddings
data['Content_Embedding'] = data['Processed_Content'].apply(lambda x: document_embedding(x, word2vec_model))

# Convert list of embeddings into a DataFrame
X = pd.DataFrame(list(data['Content_Embedding']))
y = data['Category']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


                                  precision    recall  f1-score   support

                Children's Books       0.00      0.00      0.00       1.0
              Classic Literature       0.00      0.00      0.00       1.0
                         Fiction       0.00      0.00      0.00       0.0
              Historical Fiction       0.00      0.00      0.00       1.0
                 Horror & Gothic       0.00      0.00      0.00       0.0
            Libraries & Archives       0.00      0.00      0.00       1.0
              Mystery & Thriller       0.00      0.00      0.00       1.0
                          Poetry       0.00      0.00      0.00       0.0
       Science Fiction & Fantasy       0.00      0.00      0.00       0.0
Self-Help & Personal Development       0.00      0.00      0.00       1.0
                Young Adult (YA)       0.00      0.00      0.00       1.0

                        accuracy                           0.00       7.0
                       macro avg    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.0


In [15]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

In [20]:
data = data.dropna(subset=['Content', 'Category'])

# Load Hugging Face embedding model
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings
data['Content_Embedding'] = data['Content'].apply(lambda x: embed_model.encode(x, convert_to_tensor=True))

In [25]:
# Encode categories
label_encoder = LabelEncoder()
data['Category_Label'] = label_encoder.fit_transform(data['Category'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    data['Content_Embedding'].tolist(), data['Category_Label'], test_size=0.2, random_state=42
)

# Convert lists to tensors
X_train, X_test = torch.stack(X_train), torch.stack(X_test)

# Convert y_train and y_test from pandas Series to NumPy arrays first, then to tensors
y_train, y_test = torch.tensor(y_train.to_numpy(), dtype=torch.long), torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Create a PyTorch dataset
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [30]:
# Define a simple Neural Network
class TextClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.softmax(x)

# Model setup
num_classes = len(label_encoder.classes_)
model = TextClassifier(input_dim=X_train.shape[1], num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

# Evaluate the model
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Accuracy: {correct / total:.4f}")

Epoch [1/500], Loss: 2.7089
Epoch [2/500], Loss: 2.7086
Epoch [3/500], Loss: 2.7084
Epoch [4/500], Loss: 2.7081
Epoch [5/500], Loss: 2.7078
Epoch [6/500], Loss: 2.7074
Epoch [7/500], Loss: 2.7069
Epoch [8/500], Loss: 2.7064
Epoch [9/500], Loss: 2.7057
Epoch [10/500], Loss: 2.7050
Epoch [11/500], Loss: 2.7040
Epoch [12/500], Loss: 2.7027
Epoch [13/500], Loss: 2.7011
Epoch [14/500], Loss: 2.6990
Epoch [15/500], Loss: 2.6962
Epoch [16/500], Loss: 2.6926
Epoch [17/500], Loss: 2.6878
Epoch [18/500], Loss: 2.6818
Epoch [19/500], Loss: 2.6740
Epoch [20/500], Loss: 2.6639
Epoch [21/500], Loss: 2.6511
Epoch [22/500], Loss: 2.6348
Epoch [23/500], Loss: 2.6149
Epoch [24/500], Loss: 2.5922
Epoch [25/500], Loss: 2.5689
Epoch [26/500], Loss: 2.5467
Epoch [27/500], Loss: 2.5255
Epoch [28/500], Loss: 2.5032
Epoch [29/500], Loss: 2.4785
Epoch [30/500], Loss: 2.4506
Epoch [31/500], Loss: 2.4236
Epoch [32/500], Loss: 2.4038
Epoch [33/500], Loss: 2.3910
Epoch [34/500], Loss: 2.3812
Epoch [35/500], Loss: 2