# 08 — Intro to Machine Learning (Scikit-learn)

## Learning goals
- Understand train/test split
- Train a basic regression model
- Evaluate the model with MAE (mean absolute error) and R² (coefficient of determination)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Toy data: hours studied (1-10) paired with the exam score obtained
df = pd.DataFrame(
    {
        "hours": list(range(1, 11)),
        "score": [50, 55, 60, 63, 68, 72, 78, 84, 88, 93],
    }
)

# Double brackets keep the feature matrix 2-D; the target is a 1-D Series
X, y = df[["hours"]], df["score"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows only
pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
print("R²:", r2_score(y_test, pred))

MAE: 0.7508250825082546
R²: 0.9945611117397111


In [2]:
# Make a prediction
hours = [[7.5]]
# The model was fitted on a DataFrame with a named "hours" column. Passing the
# bare nested list makes recent scikit-learn versions warn that X has no valid
# feature names, so wrap the query in a frame that reuses the training columns.
hours_frame = pd.DataFrame(hours, columns=X.columns)
predicted_score = model.predict(hours_frame)[0]
print(f"Predicted score for 7.5 study hours: {predicted_score:.2f}")

Predicted score for 7.5 study hours: 80.81




In [None]:
# Practice task:
# Try a different test_size and compare MAE/R².


def evaluate_test_size(features, target, test_size, random_state=42):
    """Train a fresh LinearRegression on one train/test split and score it.

    Everything created here (split, estimator, predictions) stays local, so the
    notebook-level `model`, `X_train`, `X_test`, `y_train`, `y_test` and `pred`
    from the first cell are not overwritten and earlier cells stay re-runnable.

    Parameters
    ----------
    features : feature matrix passed to train_test_split (here: X)
    target : target values passed to train_test_split (here: y)
    test_size : fraction of rows held out for evaluation
    random_state : seed for the split; defaults to 42

    Returns
    -------
    tuple
        (mae, r2) computed on the held-out rows.
    """
    feat_train, feat_test, target_train, target_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    estimator = LinearRegression().fit(feat_train, target_train)
    target_pred = estimator.predict(feat_test)
    return (
        mean_absolute_error(target_test, target_pred),
        r2_score(target_test, target_pred),
    )


test_sizes = [0.2, 0.3, 0.4, 0.5]
# One (test_size, mae, r2) tuple per split, in the order of test_sizes
results = [(ts, *evaluate_test_size(X, y, ts)) for ts in test_sizes]

print('Results for different test_size values:')
for ts, mae, r2 in results:
    print(f'test_size={ts}: MAE={mae:.3f}, R²={r2:.3f}')

# Note: with only 10 rows each test set holds 2-5 samples, so these metrics are
# noisy — use cross-validation for more robust estimates