In [127]:
#import libraries
# General libraries
import json
import re

# Data handling
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Natural Language Processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [233]:
#scrape initial websites to use for training
%run "../scrapeContent/scrapeWebsitesToTrain.ipynb"

loading 24/24


In [271]:
#grab the trainig data
with open("../scrapeContent/processed_websites_content.json", "r") as jsonobj:
    training_data_json = json.load(jsonobj)

In [272]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
text_columns = ["head_title", "description", "title", "alt_images_texts", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens]
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    preprocessed_text = " ".join(stemmed_tokens)
    
    return preprocessed_text

In [281]:
#create dataframe
columns = ["website", "content", "category_name" ]

dataFrame = pd.DataFrame(columns=columns)

for website, data in training_data_json.items():
    status_code = data.get("status_code","")

    if status_code == 200:
        for prop in text_columns:
            if prop in data and prop in text_columns:
                    data[prop] = preprocess_text(data[prop])
                

        combined_content = " ".join([
            data.get("head_title", ""),
            data.get("description", ""),
            data.get("title", ""),
            data.get("alt_images_texts", ""),
            data.get("p", ""),
            data.get("h1", ""),
            data.get("h2", ""),
            data.get("h3", ""),
            data.get("h4", ""),
            data.get("h5", ""),
            data.get("h6", "")
        ])

        new_row = {
            "website": website,
            "content": combined_content,
            "category_name": data.get("category_name"),
        }
        new_row_df = pd.DataFrame([new_row])

        dataFrame = pd.concat([dataFrame, new_row_df], ignore_index=True)



In [286]:
df = dataFrame
df = df.drop_duplicates(subset="category_name")
dataFrame = df

In [287]:
n = len(dataFrame)
dataFrame.head(n)  

Unnamed: 0,website,content,category_name
0,https://lithub.com,literari hub becom a lit hub support emal of...,Arts & Entertainment
3,https://www.rockauto.com,rockauto rockauto ship auto part and bodi part...,Automotive
6,https://www.bloomberg.com/europe,bloomberg are you a robot bloomberg to conti...,Business
9,https://www.careercontessa.com,career contessa career advic job search site f...,Careers
12,https://www.bookwidgets.com/blog/2016/10/15-ed...,educ blog everi teacher should know about book...,Education
15,https://nurtureandthriveblog.com,nurtur and thrive dr ashley soderlund develop ...,Family & Parenting
18,https://artofhealthyliving.com,the art of healthi live health fit diet wellb ...,Health & Fitness
21,https://iamafoodblog.com,i am a food blog celebr the awesom of food cel...,Food & Drink


In [304]:
# Step 3: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))  # You can adjust the number of features
X_tfidf = tfidf_vectorizer.fit_transform(dataFrame["content"])

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, dataFrame["category_name"], test_size=0.2, random_state=42)

print(X_tfidf)
# Step 5: Model Selection (Random Forest as an example)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 6: Model Training
model.fit(X_train, y_train)

# Step 7: Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

  (0, 453)	0.0013812963133014993
  (0, 835)	0.0015604826145351675
  (0, 430)	0.0017797889231650653
  (0, 52)	0.007383049617740825
  (0, 852)	0.002062523929383376
  (0, 54)	0.0015604826145351675
  (0, 671)	0.01722711577472859
  (0, 651)	0.019688132313975534
  (0, 650)	0.2707118193171636
  (0, 574)	0.02707118193171636
  (0, 573)	0.295321984709633
  (0, 669)	0.02707118193171636
  (0, 670)	0.4675931424569189
  (0, 692)	0.02707118193171636
  (0, 693)	0.3248541831805963
  (0, 353)	0.03937626462795107
  (0, 623)	0.004125047858766752
  (0, 593)	0.0015604826145351675
  (0, 989)	0.0017797889231650653
  (0, 619)	0.012305082696234709
  (0, 106)	0.004125047858766752
  (0, 830)	0.0015604826145351675
  (0, 767)	0.007383049617740825
  (0, 525)	0.009844066156987767
  (0, 115)	0.012305082696234709
  :	:
  (7, 417)	0.0161538024388166
  (7, 634)	0.003540831097485038
  (7, 861)	0.034476709865043344
  (7, 919)	0.0027904838466834454
  (7, 635)	0.014163324389940153
  (7, 995)	0.01784033674424803
  (7, 105)	0.