In [6]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the CSV below can be read.
drive.mount('/content/drive')

# Labelled dataset: one row per website, text columns plus one column per category.
file_data = '/content/drive/MyDrive/sample_11_categories_1.csv'
df = pd.read_csv(file_data)

# Merge the five text fields into a single space-separated string per row.
# Missing values are replaced with '' first so that ' '.join does not hit NaN.
# NOTE(review): ' '.join raises TypeError on non-string cells — assumes all five
# columns are read as strings; confirm against the CSV.
df['combined_text'] = df[['text', 'html_title', 'h1', 'h2', 'p']].fillna('').agg(' '.join, axis=1)
# The 11 category columns used as the multi-label target.
# Their order defines the order of the per-label outputs further down.
label_columns = ['Software company', 'Marketing agency', 'Legal services', 'Advertising agency',
                 'Restaurant', 'Solar energy company', 'Travel agency', 'E-commerce service',
                 'Real estate agency', 'Life insurance agency', 'College']
# presumably each label column holds 0/1 indicators — TODO confirm against the CSV
y = df[label_columns]


def clean_text(text):
    """Normalise a raw string for vectorisation.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is deleted (not replaced, so "e-mail" becomes
    "email"), and runs of whitespace are collapsed to a single space.

    Args:
        text: The input string.

    Returns:
        The normalised string with no leading or trailing whitespace.
    """
    lowered = text.lower()
    # Deleting (rather than spacing out) punctuation glues neighbouring
    # fragments together; whitespace itself is kept for the next step.
    alnum_and_space = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_and_space)
    return single_spaced.strip()

# Normalise the combined text of every row with clean_text.
df['clean_text'] = df['combined_text'].apply(clean_text)

X = df['clean_text']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The vectorizer is fitted on the training split only and then merely applied
# to the test split, so no test-set vocabulary/IDF statistics leak into training.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# One forest fitted against all label columns at once (multi-output target).
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train_tfidf, y_train)

# With a multi-output target, predict_proba yields a sequence with one
# (n_samples, n_classes) array per label column rather than a single array.
y_probs_list = model.predict_proba(X_test_tfidf)

# Keep column 1 of each per-label array and stack them into (n_samples, n_labels).
# NOTE(review): column 1 is taken to be the probability of class 1; this assumes
# every label column contains both classes in y_train (otherwise there is no
# column 1) — TODO confirm.
y_probs = np.vstack([probs[:, 1] for probs in y_probs_list]).T

# Decision cut-off used for the evaluation below (stricter than the 0.5 default
# of apply_threshold).
threshold = 0.7
def apply_threshold(probs, threshold=0.5):
    """Turn probabilities into 0/1 decisions.

    Args:
        probs: Array-like of probabilities (any shape).
        threshold: Cut-off; values greater than or equal to it map to 1.

    Returns:
        An integer NumPy array of the same shape as ``probs`` holding 1 where
        the probability reaches the threshold and 0 elsewhere.
    """
    scores = np.array(probs)
    # Boolean mask first, then cast so callers get plain 0/1 integers.
    reaches_cutoff = scores >= threshold
    return reaches_cutoff.astype(int)

# Binarise the held-out probabilities with the module-level threshold.
y_pred = apply_threshold(y_probs, threshold)

# average=None returns one value per label column instead of a single aggregate;
# zero_division=0 reports 0 for a metric whose denominator is zero.
precisions, recalls, fscores, supports = precision_recall_fscore_support(y_test, y_pred, average=None, zero_division=0)

# Fixed-width table: one row per label, metrics shown as percentages.
print(f"{'Label':25} {'Precision':10} {'Recall':10} {'F1-Score':10} {'Support':7}")
for label, precision, recall, f1, support in zip(y.columns, precisions, recalls, fscores, supports):
    print(f"{label:25} {precision*100:9.2f}% {recall*100:9.2f}% {f1*100:9.2f}% {support:7}")







import requests
from bs4 import BeautifulSoup
import numpy as np

def predict_website_category(url, vectorizer, model, threshold=0.5, labels=None):
    """Download a web page and print the model's confidence for each category.

    The page text is assembled the same way as the training column
    ``combined_text`` (full text, title, h1, h2 and p contents joined by
    spaces), normalised with ``clean_text``, vectorised and scored.

    Args:
        url: Address of the page to classify.
        vectorizer: Fitted text vectorizer exposing ``transform``.
        model: Fitted multi-output classifier whose ``predict_proba`` returns
            one probability array per label.
        threshold: Probability at or above which a category is marked as
            predicted. Defaults to 0.5.
        labels: Optional sequence of category names, in the same order as the
            label columns the model was trained on. When omitted, the
            notebook-level ``y.columns`` is used, so existing calls keep
            working; pass it explicitly when the model was reloaded from disk
            and ``y`` is not defined.

    Returns:
        None. Results are printed. If the page cannot be fetched, an error
        message is printed and the function returns early.

    Raises:
        ValueError: If the number of label names does not match the number of
            per-label outputs produced by the model.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Best-effort helper for interactive use: report and bail out instead
        # of raising into the notebook.
        print(f"Error fetching the URL: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    text = soup.get_text(separator=' ', strip=True)
    # `soup.title.string` is None when <title> is empty or contains nested
    # markup, which would inject the literal word "None" into the model input.
    # get_text() always returns a string.
    title = soup.title.get_text() if soup.title else ''
    h1 = ' '.join(tag.get_text() for tag in soup.find_all('h1'))
    h2 = ' '.join(tag.get_text() for tag in soup.find_all('h2'))
    p = ' '.join(tag.get_text() for tag in soup.find_all('p'))

    # Same field order as the training-time `combined_text` column.
    combined = f"{text} {title} {h1} {h2} {p}"
    cleaned = clean_text(combined)

    vectorized = vectorizer.transform([cleaned])

    # One array per label; column 1 is taken as the positive-class probability.
    probs_list = model.predict_proba(vectorized)
    probs = np.vstack([probs[:, 1] for probs in probs_list]).T

    preds = apply_threshold(probs, threshold)

    # Fall back to the notebook global only when no label names were supplied.
    label_names = list(y.columns if labels is None else labels)
    if len(label_names) != probs.shape[1]:
        # zip() would silently truncate and mislabel the scores otherwise.
        raise ValueError(
            f"Expected {probs.shape[1]} label names, got {len(label_names)}"
        )

    results = list(zip(label_names, probs[0], preds[0]))

    # Highest confidence first.
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)

    print(f"\n🔍 Prediction for URL: {url}")
    print("Predicted Categories and Confidence:")

    found = False
    for label, prob, pred in results_sorted:
        print(f"- {label}: {prob:.3f} {'✅' if pred == 1 else ''}")
        if pred == 1:
            found = True

    if not found:
        print(" No category matched confidently.")




import joblib

# Persist the fitted classifier and the fitted vectorizer to Google Drive.
# The label names/order are not saved here, so anyone reloading these files
# must supply the same `label_columns` list to interpret the model outputs.
joblib.dump(model, '/content/drive/MyDrive/website_category_model.pkl')

joblib.dump(vectorizer, '/content/drive/MyDrive/tfidf_vectorizer.pkl')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Label                     Precision  Recall     F1-Score   Support
Software company             100.00%     29.63%     45.71%     135
Marketing agency              98.41%     48.44%     64.92%     128
Legal services               100.00%     10.00%     18.18%      60
Advertising agency            97.73%     44.33%     60.99%      97
Restaurant                    79.41%     45.00%     57.45%      60
Solar energy company         100.00%     63.49%     77.67%      63
Travel agency                100.00%     36.36%     53.33%      55
E-commerce service           100.00%     27.27%     42.86%      77
Real estate agency            95.45%     34.43%     50.60%      61
Life insurance agency        100.00%     42.37%     59.52%      59
College                      100.00%     66.67%     80.00%      54


['/content/drive/MyDrive/tfidf_vectorizer.pkl']

In [7]:
# NOTE(review): no threshold is passed, so the function's default of 0.5 applies
# here, whereas the metrics table above was computed with threshold = 0.7.
# Pass threshold=threshold if the two are meant to agree.
predict_website_category("https://vyosim.com/", vectorizer, model)


🔍 Prediction for URL: https://vyosim.com/
Predicted Categories and Confidence:
- Software company: 0.590 ✅
- Marketing agency: 0.250 
- E-commerce service: 0.240 
- Legal services: 0.120 
- Advertising agency: 0.100 
- College: 0.050 
- Solar energy company: 0.040 
- Real estate agency: 0.040 
- Restaurant: 0.030 
- Life insurance agency: 0.030 
- Travel agency: 0.020 
