# 🤖 Module 5.1: Machine Learning Fundamentals

**Time:** 5 hours | **Difficulty:** 🔴 Advanced

## Learning Objectives
- ✅ Supervised learning concepts
- ✅ Regression and classification
- ✅ Model evaluation
- ✅ Feature engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

## 1. Generate Financial Dataset

In [None]:
# Seed NumPy's global RNG so every run reproduces the same synthetic dataset
np.random.seed(42)
n_samples = 500

# (low, high) bounds of the uniform distribution behind each simulated stock
# feature; the insertion order is also the order in which the draws are made
feature_bounds = {
    'PE_Ratio': (5, 50),
    'Debt_Ratio': (0, 1),
    'ROE': (-0.1, 0.4),
    'Revenue_Growth': (-0.2, 0.5),
    'Market_Cap_B': (1, 500),
}
data = pd.DataFrame({
    column: np.random.uniform(low, high, n_samples)
    for column, (low, high) in feature_bounds.items()
})

# Regression target: a linear blend of four of the features (Market_Cap_B is
# deliberately left out) plus Gaussian noise with standard deviation 0.05
expected_return = 0.1 - 0.002 * data['PE_Ratio']
expected_return = expected_return - 0.15 * data['Debt_Ratio']
expected_return = expected_return + 0.3 * data['ROE']
expected_return = expected_return + 0.2 * data['Revenue_Growth']
data['Future_Return'] = expected_return + np.random.normal(0, 0.05, n_samples)

# Classification target: 1 when the future return is above 5%, otherwise 0
data['Buy_Signal'] = data['Future_Return'].gt(0.05).astype(int)

print(data.head())
print(f"\nBuy signals: {data['Buy_Signal'].sum()} / {len(data)}")

## 2. Linear Regression for Return Prediction

In [None]:
# Prepare data: five fundamentals as predictors, the simulated return as target
features = ['PE_Ratio', 'Debt_Ratio', 'ROE', 'Revenue_Growth', 'Market_Cap_B']
X = data[features]
y = data['Future_Return']

# Train-test split: hold out 20% of the rows; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows, so no test-set statistics reach the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
reg_model = LinearRegression()
reg_model.fit(X_train_scaled, y_train)

# Predictions on the held-out rows
y_pred = reg_model.predict(X_test_scaled)

# Evaluation: RMSE is in the same units as Future_Return; score() reports the
# coefficient of determination on the test split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {reg_model.score(X_test_scaled, y_test):.4f}")

# Feature importance: because the inputs were standardized, the coefficient
# magnitudes are comparable, so rank the features by absolute coefficient
importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': reg_model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)
print(f"\nFeature Importance:\n{importance}")

## 3. Classification for Buy Signals

In [None]:
# Classification target: the binary Buy_Signal column
y_class = data['Buy_Signal']

# Split with test_size=0.2 and random_state=42
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_class, test_size=0.2, random_state=42
)

# Re-fit the existing scaler on the classification training rows, then apply
# those training statistics to both partitions
X_train_c_scaled = scaler.fit(X_train_c).transform(X_train_c)
X_test_c_scaled = scaler.transform(X_test_c)

# Random forest with 100 trees; fit() returns the estimator, so construction
# and training are chained
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_c_scaled, y_train_c
)

# Predicted labels for the held-out rows
y_pred_c = clf.predict(X_test_c_scaled)

# Report overall accuracy followed by the per-class report
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_c):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test_c, y_pred_c)}")

## 4. Feature Importance Visualization

In [None]:
# Pair every feature name with the importance reported by the fitted forest,
# sorted ascending by importance
feat_importance = pd.DataFrame(
    {'Feature': features, 'Importance': clf.feature_importances_}
).sort_values(by='Importance')

# Draw the horizontal bar chart through the figure/axes objects
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(
    feat_importance['Feature'],
    feat_importance['Importance'],
    color='steelblue',
)
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
fig.tight_layout()
plt.show()

## 5. Model Comparison

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, each trained and scored on the same scaled split.
# The tree-based models get a fixed random_state so the accuracy table is
# identical on every run; left unseeded, the Random Forest and Decision Tree
# rows (and therefore the ranking) can change between executions.
# NOTE: SVC is imported above but is not part of this comparison.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
}

# Fit each model on the training partition and record its test-set accuracy
results = []
for name, model in models.items():
    model.fit(X_train_c_scaled, y_train_c)
    accuracy = model.score(X_test_c_scaled, y_test_c)
    results.append({'Model': name, 'Accuracy': accuracy})

# Best-performing model first; the index is dropped to keep the table compact
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)
print(results_df.to_string(index=False))

## 📝 Exercise: Cross-Validation
Implement k-fold cross-validation to get a more robust model evaluation.

In [None]:
# YOUR CODE HERE
# from sklearn.model_selection import cross_val_score


---
**Next:** Module 5.2 - Time Series Forecasting →