### Import data

In [1]:
import pandas as pd

In [2]:
# Load the feed rows export into a DataFrame and preview the first rows.
# NOTE(review): the previewed titles show garbled dash characters, which looks
# like text that was decoded with the wrong encoding somewhere upstream --
# confirm whether the CSV itself is affected before relying on the raw titles.
feed_csv = "data/feeds_rows_20251219.csv"
df = pd.read_csv(feed_csv)
df.head()

Unnamed: 0,id,title,date,source,content,inserted_at,topic,tone,category
0,0053b5e3-7d21-4dda-82ef-60af64a61bf7,Stop crawling my HTML â€“ use the API,2025-12-14T18:44:38.000Z,Hacker News,Stop crawling my HTML â€“ use the API,2025-12-16 02:48:13.503551+00,Other,negative,Opinion
1,006dc8e3-865a-4505-a891-2d4b28fe5f85,Go Proposal: Secret Mode,2025-12-09T21:10:52.000Z,Hacker News,Go Proposal: Secret Mode,2025-12-16 02:47:38.512492+00,Other,neutral,Other
2,00857629-93ec-482d-b610-94f4f7956e7c,2025 Word of the Year: Slop,2025-12-15T11:25:10.000Z,Hacker News,2025 Word of the Year: Slop,2025-12-16 02:46:49.768566+00,Other,neutral,Other
3,0139c06e-2684-49d5-a033-84e074f03390,CM0 â€“ A new Raspberry Pi you can't buy,2025-12-12T15:19:19.000Z,Hacker News,CM0 â€“ A new Raspberry Pi you can't buy,2025-12-16 02:47:28.333476+00,Other,neutral,Product Launch
4,01803083-0187-49f9-987b-c83a194c3182,Modern SID chip substitutes [video],2025-12-14T03:17:11.000Z,Hacker News,Modern SID chip substitutes [video],2025-12-20 00:55:15.810099+00,Other,neutral,Other


### Explore data

In [3]:
# Row/column count first, then the column labels, one per print line.
print(df.shape, df.columns, sep="\n")

(1070, 9)
Index(['id', 'title', 'date', 'source', 'content', 'inserted_at', 'topic',
       'tone', 'category'],
      dtype='object')


In [4]:
# Share of each tone label across the whole dataset, rounded for display.
tone_share = df["tone"].value_counts(normalize=True)
tone_share.round(2)

tone
neutral     0.63
positive    0.21
negative    0.15
Name: proportion, dtype: float64

### Split into training and test datasets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# tone proportions roughly equal in both splits; the fixed seed makes the
# split repeatable.
titles = df["title"]    # using title only
tones = df["tone"]

X_train, X_test, y_train, y_test = train_test_split(
    titles, tones, test_size=0.2, random_state=42, stratify=tones
)

In [6]:
# Compare label proportions of the two splits side by side to confirm the
# stratified split preserved the class balance.
train_dist, test_dist = (
    labels.value_counts(normalize=True).rename(split_name)
    for split_name, labels in (("train", y_train), ("test", y_test))
)

pd.concat([train_dist, test_dist], axis=1).round(2)

Unnamed: 0_level_0,train,test
tone,Unnamed: 1_level_1,Unnamed: 2_level_1
neutral,0.63,0.64
positive,0.21,0.21
negative,0.15,0.15


### Build model pipeline (fitted later, inside the MLflow runs)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Baseline pipeline: TF-IDF over unigrams and bigrams (vocabulary capped at
# 5000 terms) feeding a class-weighted logistic regression. The pipeline is
# only constructed here; it is not fitted in this cell.
text_features = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
tone_classifier = LogisticRegression(class_weight="balanced", max_iter=1000)

model = Pipeline(steps=[
    ("tfidf", text_features),
    ("clf", tone_classifier),
])

### Training Data Preview

In [8]:
# Preview ten training rows with their labels.
# The rows are sampled once and the labels are looked up by index, so every
# title is paired with its own tone by construction. Sampling X_train and
# y_train separately only lines up while both calls share the same seed and
# the two series have the same length and order.
preview_titles = X_train.sample(10, random_state=1)

pd.DataFrame({
    "title": preview_titles.values,
    "tone": y_train.loc[preview_titles.index].values
})

Unnamed: 0,title,tone
0,Why Rust for Embedded Systems? (and Why I'm Te...,positive
1,Defrag.exfat Is Inefficient and Dangerous,negative
2,Six Big Bets,neutral
3,Filecoin slides 5% alongside major decline in ...,negative
4,The future of AI: What will your life look lik...,neutral
5,Ford is starting a battery storage business to...,positive
6,A kernel bug froze my machine: Debugging an as...,neutral
7,"AI: A Dedicated Fact-Failing Machine, Or, yet ...",negative
8,Cloudflare error page generator,neutral
9,Detailed balance in large language model-drive...,neutral


### MLflow

In [9]:
import mlflow
import mlflow.sklearn
import joblib
from sklearn.metrics import accuracy_score, f1_score

mlflow.set_experiment("tone_model")

# MLflow Scenario #1: fit and track whichever pipeline `model` currently refers to

with mlflow.start_run():

    # params -- read from the pipeline that is fitted below, so the tracked
    # values always describe the trained model. Hand-typed values here would
    # silently disagree with the pipeline whenever the two are edited
    # separately. To try other settings (e.g. max_features=100,
    # ngram_range=(1, 1)), rebuild `model` with them before starting the run.
    tfidf_step = model.named_steps["tfidf"]
    clf_step = model.named_steps["clf"]

    mlflow.log_param("model", "logistic_regression_pipeline")
    mlflow.log_param("max_features", tfidf_step.max_features)
    mlflow.log_param("ngram_range", tfidf_step.ngram_range)
    mlflow.log_param("class_weight", clf_step.class_weight)
    mlflow.log_param("max_iter", clf_step.max_iter)

    # train
    model.fit(X_train, y_train)

    # evaluate on the held-out split
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_weighted", f1)

    # save pipeline to the working directory and attach the file to the run
    joblib.dump(model, "tone_model.pkl")
    mlflow.log_artifact("tone_model.pkl")

    # optional: also log pipeline as native MLflow model
    mlflow.sklearn.log_model(model, "model")

print("MLflow run complete:", acc, f1)

2025/12/19 17:02:44 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/19 17:02:44 INFO mlflow.store.db.utils: Updating database tables
2025/12/19 17:02:44 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/19 17:02:44 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/19 17:02:44 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/19 17:02:44 INFO alembic.runtime.migration: Will assume non-transactional DDL.


MLflow run complete: 0.6542056074766355 0.6493936046823552


In [10]:
# MLflow Scenario #2: a deliberately constrained pipeline tracked as "weak_model".
# One dict holds the settings, and both the logged params and the pipeline
# are built from it.
weak_params = {
    "ngram_range": (1,1),
    "max_features": 200,
    "C": 0.01,
    "stop_words": "english",
}

with mlflow.start_run(run_name="weak_model"):
    for param_name, param_value in weak_params.items():
        mlflow.log_param(param_name, param_value)

    # Replaces the notebook-level `model`; later cells use this pipeline.
    weak_tfidf = TfidfVectorizer(
        max_features=weak_params["max_features"],
        ngram_range=weak_params["ngram_range"],
        stop_words=weak_params["stop_words"],
    )
    weak_clf = LogisticRegression(
        C=weak_params["C"],
        max_iter=1000,
        class_weight="balanced",
    )
    model = Pipeline([("tfidf", weak_tfidf), ("clf", weak_clf)])

    # Fit on the training split, then score the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    for metric_name, metric_value in (("accuracy", acc), ("f1_weighted", f1)):
        mlflow.log_metric(metric_name, metric_value)

    mlflow.sklearn.log_model(model, "model")

print("Weak model:", acc, f1)



Weak model: 0.6074766355140186 0.6035615054306643


### Test Model

In [11]:
# Spot-check ten held-out titles: predicted tone next to the true label.
# Labels are looked up through the sampled rows' index so they stay aligned.
sample_df = X_test.sample(10, random_state=1)
sample_predictions = model.predict(sample_df)
sample_actuals = y_test.loc[sample_df.index]

pd.DataFrame({
    "text": sample_df.values,
    "prediction": sample_predictions,
    "actual": sample_actuals.values
})

Unnamed: 0,text,prediction,actual
0,Our emotional pain became a product,neutral,negative
1,Launch HN: BrowserBook (YC F24) â€“ IDE for de...,positive,positive
2,Ford Considers Scrapping Electric Version of F...,neutral,negative
3,How to think about durable execution,neutral,neutral
4,Wall Street Ruined the Roomba and Then Blamed ...,neutral,negative
5,Zoom brings its AI assistant to the web with a...,negative,positive
6,The appropriate amount of effort is zero,neutral,neutral
7,Show HN: Tiny VM sandbox in C with apps in Rus...,positive,neutral
8,"Nasdaq, home of Coinbase, Strategy stocks, see...",neutral,neutral
9,Why Twilio Segment moved from microservices ba...,neutral,neutral


### Evaluate model

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out split with whichever pipeline `model` currently refers to,
# then print the per-class report followed by the confusion matrix.
y_pred = model.predict(X_test)
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.24      0.30      0.27        33
     neutral       0.73      0.77      0.75       136
    positive       0.52      0.33      0.41        45

    accuracy                           0.61       214
   macro avg       0.50      0.47      0.48       214
weighted avg       0.61      0.61      0.60       214

[[ 10  20   3]
 [ 20 105  11]
 [ 11  19  15]]


In [13]:
# Inspect one held-out row: its title, the predicted tone and the true tone.
position = 100      # positional row within the test split; change to whichever row you want

# Double brackets keep a one-row Series, so predict() receives a collection
# of titles rather than a bare string.
sample = X_test.iloc[[position]]
print(sample)
pred = model.predict(sample)
print(f"Predicted tone: {pred[0]}")
print(f"Actual tone:    {y_test.iloc[position]}")
print(f"Title: {sample.values[0]}")

606    Building an efficient hash table in Java
Name: title, dtype: object
Predicted tone: neutral
Actual tone:    neutral
Title: Building an efficient hash table in Java


### More testing

In [15]:
# Run a few hand-written phrases through the current tone model and print
# each predicted label (upper-cased, left-aligned in a 10-character column)
# next to its input text.
examples = ["I hate this", "I love this", "Not bad"]

predictions = model.predict(examples)

for position, text in enumerate(examples):
    tone = predictions[position]
    print(f"{tone.upper():<10} | {text}")


NEUTRAL    | I hate this
NEUTRAL    | I love this
NEUTRAL    | Not bad
