### Basic URL Classifier

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
import requests
from urllib.parse import urlparse
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("current-output-homepage_with_coded_hard_soft.csv")
df = df.loc[df['hard_soft'].isin([0, 1])]
print(df.shape)
df.head(n = 5)

(295, 16)


Unnamed: 0,date,time,src,order,url,link_text,homepage_keywords,path,title,text,top_image,authors,summary,keywords,sample,hard_soft
0,20161107,50620,hpmg,64,http://www.huffingtonpost.com/entry/stay-with-me-taiwan_us_581d04fde4b0d9ce6fbc2e33,Gay Man Finds Strife Bitterness In This Haunting Short Film,breaking|daughter|clinton|woman|post|country|opinion|seeing|huffington|grow|lead|hillary|beyonc√©,,,,,,,,1.0,0.0
1,20161105,200432,usat,3,http://www.usatoday.com/story/news/politics/onpolitics/2016/11/04/what-time-do-polls-open-and-cl...,On Politics What time do polls open and close in each state on Election Day?,shipping|neiman|marcus|latest|today|thats|items|pricey|world|website|dish|usa|shell|sold,,,,,,,,1.0,1.0
2,20161108,210601,wsj,25,http://www.wsj.com/articles/toyota-profit-falls-on-strong-yen-weak-u-s-sales-1478586115,Toyota Looks to Electric Cars as Stronger Yen Dents Profits,breaking|financial|went|journal|presidential|polls|republican|races|street|world|business|trump|...,,,,,,,,1.0,1.0
3,20161106,40438,google,210,http://www.nj.com/essex/index.ssf/2016/11/7_stabbed_at_newark_home_official_says.html,3 killed several wounded in Newark stabbing,month|todaywith|season|google|playoff|work|safely|regular|lot|contenders|usa,,,,,,,,1.0,1.0
4,20161106,40438,google,267,http://www.standard.co.uk/news/crime/million-mask-march-2016-chaotic-scenes-as-anonymous-protest...,Million Mask March 2016 Chaotic scenes as Anonymous protest descends into aggression,month|todaywith|season|google|playoff|work|safely|regular|lot|contenders|usa,,,,,,,,1.0,1.0


In [3]:
df['hard_soft'].value_counts()

hard_soft
1.0    176
0.0    119
Name: count, dtype: int64

In [4]:
def extract_and_tokenize_url(url):
    if isinstance(url, str):
        parsed_url = urlparse(url)
        path = parsed_url.path
        tokens = re.split(r'[/\-]', path)
        tokens = [token.lower() for token in tokens if token]
        return ' '.join(tokens)
    else:
        return ''

In [5]:
df["tokenized_url"] = df["url"].apply(extract_and_tokenize_url)

In [6]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df["tokenized_url"])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["hard_soft"], test_size=0.2, random_state=42)

In [8]:
# Initialize and train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73
