### Import data

In [1]:
import pandas as pd

In [2]:
# Load the enriched feed rows from CSV (path is relative to the notebook's
# working directory) and show the first few rows as a sanity check.
DATA_PATH = "data/feeds_rows_enriched.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,title,date,source,content,inserted_at,topic,tone,category
0,000e4d98-f740-4b7e-956a-a0691ee7976d,Introduction to the concept of likelihood and ...,2025-10-23T22:52:15.000Z,Hacker News,"This article, titled 'Introduction to the conc...",2025-10-29 03:21:07.813314+00,Other,neutral,Research
1,0018a889-d44f-48ad-ae60-82b39e436b70,"Plumbing vs. Internet, Revisited",2025-11-02T04:59:39.000Z,Hacker News,"This article, titled 'Plumbing vs. Internet, R...",2025-11-04 03:50:48.426329+00,Other,neutral,Other
2,00286a34-7800-4206-95a4-97bd22b408aa,U.S. Senate Democrats Assure Crypto CEOs They'...,"Wed, 22 Oct 2025 18:19:10 +0000",CoinDesk,Several top crypto executives met with senator...,2025-10-23 00:46:23.6802+00,Blockchain,neutral,Regulation
3,002a1cfb-3f83-4510-85d6-c95b269fe7d8,TigerBeetle and Synadia pledge $512k to the Zi...,2025-10-25T13:54:33.000Z,Hacker News,https:www.synadia.comblogsynadia-tigerbeetle-z...,2025-10-26 20:56:42.697888+00,Other,positive,Funding
4,00396635-3296-4217-b61a-812bdc0d91b1,Image Dithering: Eleven Algorithms and Source ...,2025-10-24T19:38:44.000Z,Hacker News,"This article, titled 'Image Dithering: Eleven ...",2025-10-29 03:17:29.052085+00,Other,neutral,Research


### Explore data

In [3]:
# Structural overview: the (rows, columns) tuple, then the column index,
# each on its own line.
print(df.shape, df.columns, sep="\n")

(4520, 9)
Index(['id', 'title', 'date', 'source', 'content', 'inserted_at', 'topic',
       'tone', 'category'],
      dtype='object')


In [4]:
# Relative frequency of each tone label, rounded to two decimals for display.
tone_share = df["tone"].value_counts(normalize=True)
tone_share.round(2)

tone
neutral     0.66
positive    0.20
negative    0.14
Name: proportion, dtype: float64

### Split into training and test datasets

In [5]:
from sklearn.model_selection import train_test_split

# Features are the article titles only; labels are the tone annotations.
titles = df["title"]
tones = df["tone"]

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# tone proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    titles,
    tones,
    test_size=0.2,
    stratify=tones,
    random_state=42,
)

In [6]:
# Per-split label proportions, each Series named after its split so the
# names become the column headers of the comparison table.
train_dist, test_dist = (
    labels.value_counts(normalize=True).rename(split_name)
    for split_name, labels in (("train", y_train), ("test", y_test))
)

# Side-by-side view to confirm that stratification preserved the class balance.
pd.concat([train_dist, test_dist], axis=1).round(2)

Unnamed: 0_level_0,train,test
tone,Unnamed: 1_level_1,Unnamed: 2_level_1
neutral,0.66,0.66
positive,0.2,0.2
negative,0.14,0.14


### Build model pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Text features: TF-IDF over unigrams and bigrams, capped at 5000 features.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Classifier: logistic regression with balanced class weights and a raised
# iteration cap.
clf_step = LogisticRegression(class_weight="balanced", max_iter=1000)

# The pipeline is only assembled here; it is not fitted in this cell.
model = Pipeline(steps=[
    ("tfidf", tfidf_step),
    ("clf", clf_step),
])

### Training Data Preview

In [None]:
# Preview ten random training rows. The titles are sampled once and the
# labels are then looked up by index for those exact rows, so the two columns
# are aligned by construction rather than relying on two separate .sample()
# calls with the same seed happening to pick the same positions.
preview_titles = X_train.sample(10, random_state=1)

pd.DataFrame({
    "title": preview_titles.values,
    "tone": y_train.loc[preview_titles.index].values
})

### MLflow: train, evaluate, and log the run

In [11]:
import mlflow
import mlflow.sklearn
import joblib
from sklearn.metrics import accuracy_score, f1_score

mlflow.set_experiment("tone_model")

# Read the hyperparameters back from the pipeline's own steps so the values
# logged to MLflow cannot drift from the ones the model was built with.
tfidf_step = model.named_steps["tfidf"]
clf_step = model.named_steps["clf"]

with mlflow.start_run():

    # params
    mlflow.log_params({
        "model": "logistic_regression_pipeline",
        "max_features": tfidf_step.max_features,
        "ngram_range": tfidf_step.ngram_range,
        "class_weight": clf_step.class_weight,
        "max_iter": clf_step.max_iter,
    })

    # train
    model.fit(X_train, y_train)

    # evaluate on the held-out test split
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_weighted", f1)

    # save pipeline: written to the working directory, then attached to the run
    joblib.dump(model, "tone_model.pkl")
    mlflow.log_artifact("tone_model.pkl")

    # optional: also log pipeline as native MLflow model
    mlflow.sklearn.log_model(model, "model")

# acc and f1 are assigned inside the run block and remain in scope here.
print("MLflow run complete:", acc, f1)

2025/12/11 18:48:14 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/11 18:48:14 INFO mlflow.store.db.utils: Updating database tables
2025/12/11 18:48:14 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/11 18:48:14 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/11 18:48:14 INFO alembic.runtime.migration: Running 

MLflow run complete: 0.8152654867256637 0.8185343244348041


### Test Model

In [13]:
# Ten random test titles (fixed seed), their predicted tone, and the true
# tone looked up by the sampled rows' index.
sample_df = X_test.sample(n=10, random_state=1)
sample_pred = model.predict(sample_df)
sample_actual = y_test.loc[sample_df.index]

pd.DataFrame({
    "text": sample_df.values,
    "prediction": sample_pred,
    "actual": sample_actual.values,
})

Unnamed: 0,text,prediction,actual
0,The Smol Training Playbook: The Secrets to Bui...,neutral,neutral
1,Sphere Computer – The Innovative 1970s Compute...,neutral,neutral
2,ProEnergy repurposes jet engines to power data...,neutral,neutral
3,"Harder, Better, Faster, Stronger Version of Ub...",neutral,neutral
4,Intel shares jump on improved revenue as turna...,positive,positive
5,AdapTive-LeArning Speculator System (ATLAS): F...,neutral,neutral
6,Improving PixelMelt's Kindle Web Deobfuscator,neutral,neutral
7,Doing well in your courses: Andrej's advice fo...,positive,positive
8,RF Shielding History: When the FCC Cracked Dow...,neutral,neutral
9,Corrosion,neutral,neutral


### Evaluate model

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# Re-predict on the full test split so this cell does not depend on y_pred
# left over from an earlier cell.
y_pred = model.predict(X_test)

# Per-class precision/recall/F1, then the raw confusion matrix.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report, matrix, sep="\n")

              precision    recall  f1-score   support

    negative       0.66      0.73      0.69       124
     neutral       0.90      0.83      0.87       596
    positive       0.70      0.81      0.75       184

    accuracy                           0.82       904
   macro avg       0.75      0.79      0.77       904
weighted avg       0.83      0.82      0.82       904

[[ 91  24   9]
 [ 43 497  56]
 [  4  31 149]]


In [15]:
# Position (not index label) of the test row to inspect; change this one
# value to look at a different row.
sample_pos = 100

# Double brackets keep the selection as a one-element Series, which is the
# iterable-of-strings input the pipeline's predict() is given elsewhere.
sample = X_test.iloc[[sample_pos]]
print(sample)
pred = model.predict(sample)
print(f"Predicted tone: {pred[0]}")
print(f"Actual tone:    {y_test.iloc[sample_pos]}")
print(f"Title: {sample.values[0]}")

2726    The Rise and Fall of Urbit
Name: title, dtype: object
Predicted tone: negative
Actual tone:    negative
Title: The Rise and Fall of Urbit


### Test model on custom examples

In [16]:
# Sanity-check the tone model on a few hand-written headlines.
examples = [
    "Deepagent: A powerful desktop AI assistant.",
    "The Rise and Fall of Urbit",
    "Excellent product launch with strong reviews!"
]

predictions = model.predict(examples)

# One line per headline: the upper-cased label, left-aligned in a
# 10-character column, followed by the headline text.
for idx, headline in enumerate(examples):
    print(f"{predictions[idx].upper():<10} | {headline}")


POSITIVE   | Deepagent: A powerful desktop AI assistant.
NEGATIVE   | The Rise and Fall of Urbit
POSITIVE   | Excellent product launch with strong reviews!
