In [1]:
import pandas as pd

In [3]:
# Unstructured sensor logs, one reading per line in the form
# "<timestamp> | SM:<soil moisture> | TEMP:<temperature>".

logs = [
  "2025-09-03 10:02:12 | SM:23.4 | TEMP:17.8",
  "2025-09-03 10:07:12 | SM:23.1 | TEMP:18.0",
  "2025-09-03 10:12:15 | SM:22.9 | TEMP:18.3"
]

# Parse every log line into a [timestamp, soil_moisture, temperature] record.
data = []
for log in logs:
    # Fields are separated by "|"; strip the padding around each field once.
    parts = [part.strip() for part in log.split("|")]
    # The timestamp itself contains ":" so it is kept whole, never split on ":".
    timestamp = parts[0]
    # "SM:23.4" -> ["SM", "23.4"] -> 23.4 (split on the first ":" only)
    sm = float(parts[1].split(":", 1)[1])
    # Convert the temperature to float as well, so the column is numeric
    # rather than a column of strings.
    temp = float(parts[2].split(":", 1)[1])
    data.append([timestamp, sm, temp])

# Build the DataFrame from the list of records in one go.
# NOTE: the column name 'timstamp' (sic) is kept unchanged so that any code
# selecting this column by name keeps working.
mydf = pd.DataFrame(data, columns=['timstamp', 'soil_moisture', 'temperature'])
mydf.head()



Unnamed: 0,timstamp,soil_moisture,temperature
0,2025-09-03 10:02:12,23.4,17.8
1,2025-09-03 10:07:12,23.1,18.0
2,2025-09-03 10:12:15,22.9,18.3


In [6]:
# Short free-text example posts; this list is the corpus that the TF-IDF
# vectorizer and the clustering step operate on.
posts = [
    "Just tried plant-based chicken nuggets, surprisingly good!",
    "Energy drinks keep me alive during finals, but I know they’re unhealthy 😅",
    "Fermented foods like kimchi and kefir are amazing for gut health!",
    "Too much junk food on campus… we need better healthy snack options.",
    "Protein shakes are my go-to after gym sessions."
]

In [7]:
# scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF concept:
# - Term Frequency (TF): how often a word appears in a document.
# - Inverse Document Frequency (IDF): reduces the weight of very common words that appear in many documents (like food, good, day).

In [8]:
# Build a TF-IDF vectorizer that drops English stop words, then learn the
# vocabulary from the posts and encode them as a sparse document-term matrix.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(posts)

# Densify the matrix for display: one row per post, one column per term.
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=X.toarray(), columns=feature_names)
df.head()

Unnamed: 0,alive,amazing,based,better,campus,chicken,drinks,energy,fermented,finals,...,nuggets,options,plant,protein,sessions,shakes,snack,surprisingly,tried,unhealthy
0,0.0,0.0,0.353553,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,...,0.353553,0.0,0.353553,0.0,0.0,0.0,0.0,0.353553,0.353553,0.0
1,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.0,0.408248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248
2,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.353553,0.353553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.353553,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0


In [9]:
# Add up each column of the TF-IDF frame (one total per term), then rank the
# terms from the largest total to the smallest.
column_totals = df.sum()
word_counts = column_totals.sort_values(ascending=False)

# Show the ten highest-scoring terms.
word_counts.head(10)

shakes       0.500000
sessions     0.500000
protein      0.500000
gym          0.500000
alive        0.408248
finals       0.408248
know         0.408248
unhealthy    0.408248
drinks       0.408248
energy       0.408248
dtype: float64

In [11]:
from sklearn.cluster import KMeans

# Partition the TF-IDF vectors into two groups; the fixed random_state keeps
# the initialisation repeatable between runs.
kmeans = KMeans(random_state=42, n_clusters=2)
clusters = kmeans.fit_predict(X)

# Report the assigned cluster label next to each original post.
for label, post in zip(clusters, posts):
    print(f"Cluster{label}: {post}")

Cluster0: Just tried plant-based chicken nuggets, surprisingly good!
Cluster0: Energy drinks keep me alive during finals, but I know they’re unhealthy 😅
Cluster0: Fermented foods like kimchi and kefir are amazing for gut health!
Cluster0: Too much junk food on campus… we need better healthy snack options.
Cluster1: Protein shakes are my go-to after gym sessions.
