In [127]:
#import libraries
# General libraries
import json
import re

# Data handling
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Natural Language Processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Scrape the initial set of websites used for training by executing the
# scraper notebook inside this kernel.
# NOTE(review): presumably this run (re)generates
# ../scrapeContent/processed_websites_content.json, which is loaded further
# down in this notebook -- confirm against the scraper notebook.
%run "../scrapeContent/scrapeWebsitesToTrain.ipynb"

In [271]:
# Load the scraped training data.
# The encoding is given explicitly: without it, open() falls back to the
# platform default (e.g. cp1252 on Windows), which can fail or mis-decode
# non-ASCII characters in the scraped page content. JSON files are UTF-8.
with open("../scrapeContent/processed_websites_content.json", "r", encoding="utf-8") as json_file:
    training_data_json = json.load(json_file)

In [272]:
# Shared text-preprocessing resources.

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words("english"))

# Porter stemmer instance reused for every token.
stemmer = PorterStemmer()

# Keys of each scraped record that hold free text.
text_columns = [
    "head_title",
    "description",
    "title",
    "alt_images_texts",
    "p",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
]

def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words, then Porter-stem each
    remaining token.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing remains).
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)
    # Remove stop words. The original comprehension copied every token
    # unchanged, so the module-level `stop_words` set was never applied.
    filtered_tokens = [word for word in tokens if word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    preprocessed_text = " ".join(stemmed_tokens)

    return preprocessed_text

In [281]:
# Build the training DataFrame: one row per successfully scraped website.
columns = ["website", "content", "category_name"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
records = []

for website, data in training_data_json.items():
    # Only pages whose recorded status code is 200 are used for training.
    if data.get("status_code", "") != 200:
        continue

    # Preprocess each text field into a local list. The result is NOT written
    # back into training_data_json, so re-running this cell does not
    # preprocess (and re-stem) text that was already preprocessed.
    # Missing or empty (None / "") fields contribute an empty string.
    cleaned_fields = [
        preprocess_text(data[prop]) if data.get(prop) else ""
        for prop in text_columns
    ]

    records.append({
        "website": website,
        "content": " ".join(cleaned_fields),
        "category_name": data.get("category_name"),
    })

# Passing `columns` keeps the expected columns even when no record qualified.
dataFrame = pd.DataFrame(records, columns=columns)



In [286]:
# Clean the training rows.
# De-duplicate on the page content rather than on the label: dropping
# duplicates by "category_name" keeps a single website per category, which
# leaves one sample per class and makes every test-set label unseen during
# training. Rows without a category are dropped because they cannot be used
# as supervised examples.
df = (
    dataFrame
    .dropna(subset=["category_name"])
    .drop_duplicates(subset="content")
)
dataFrame = df

In [None]:
# Display every row of the training frame (head() called with the row count).
n = dataFrame.shape[0]
dataFrame.head(n)

In [None]:
# Step 3: Train-Test Split
# Split the raw text first so the vectorizer never sees the test documents;
# fitting TF-IDF on the full corpus leaks test-set vocabulary and IDF weights
# into the training features and inflates the measured accuracy.
texts_train, texts_test, y_train, y_test = train_test_split(
    dataFrame["content"],
    dataFrame["category_name"],
    test_size=0.2,
    random_state=42,
)

# Step 4: TF-IDF Vectorization (fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))  # You can adjust the number of features
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Full-corpus matrix kept under its original name for any later cell that
# uses it; it is produced with the vectorizer fitted on the training split.
X_tfidf = tfidf_vectorizer.transform(dataFrame["content"])
# Report the shape instead of printing the sparse matrix entries.
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

# Step 5: Model Selection (Random Forest as an example)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 6: Model Training
model.fit(X_train, y_train)

# Step 7: Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")