# 🐟 Tanzania Fish Price Prediction
## Machine Learning Project
**Objective:** Predict fish prices (in TZS) based on fish characteristics, market conditions, and region using Linear Regression and Decision Tree models.

In [None]:
# ============================================================
# 1. IMPORT LIBRARIES
# ============================================================
# Data handling, plotting, and modelling dependencies for the whole
# notebook, followed by global plotting/warning configuration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation notices — consider
# narrowing the filter if unexpected behaviour needs debugging.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Apply one matplotlib style sheet to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a
# recent matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
print("✅ Libraries imported successfully!")

## 2. Load & Explore the Dataset

In [None]:
# ============================================================
# 2. LOAD DATASET
# ============================================================
# The CSV path is relative, so it is resolved against the current
# working directory of the running kernel/process.
df = pd.read_csv('tanzania_fish_prices.csv')

# Report the (rows, columns) shape, then preview the first rows.
print("📊 Dataset Shape:", df.shape)
print("\n📋 First 5 rows:")
# Bare trailing expression: shown as the cell result when run in a notebook.
df.head()

In [None]:
# Schema overview: df.info() writes its report (columns, non-null counts,
# dtypes) directly to stdout, so no print() wrapper is needed.
print("📌 Dataset Info:")
df.info()

In [None]:
# Descriptive statistics of the loaded frame.
# describe() is left as the bare trailing expression so that its result is
# rendered as the cell output when this runs in a notebook.
print("📈 Statistical Summary:")
df.describe()

In [None]:
# Per-column count of missing entries, followed by a one-line verdict
# for the table as a whole.
print("🔍 Missing Values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

if missing_per_column.sum() == 0:
    print("\n✅ No missing values!")
else:
    print("⚠️ Missing values found!")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# ============================================================
# 3. EXPLORATORY DATA ANALYSIS
# ============================================================
# Draws a 2x3 dashboard describing how price_tzs relates to species,
# season, market type, weight and freshness, then saves it to
# 'eda_plots.png'.

fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# Price distribution
axes[0,0].hist(df['price_tzs'], bins=30, color='#2196F3', edgecolor='white', alpha=0.85)
axes[0,0].set_title('Distribution of Fish Prices (TZS)')
axes[0,0].set_xlabel('Price (TZS)')
axes[0,0].set_ylabel('Frequency')

# Average price by species (most expensive first)
avg_species = df.groupby('species')['price_tzs'].mean().sort_values(ascending=False)
axes[0,1].bar(avg_species.index, avg_species.values, color=['#4CAF50','#2196F3','#FF9800','#E91E63','#9C27B0'])
axes[0,1].set_title('Average Price by Fish Species')
axes[0,1].set_xlabel('Species')
axes[0,1].set_ylabel('Avg Price (TZS)')
axes[0,1].tick_params(axis='x', rotation=15)

# Average price by season
avg_season = df.groupby('season')['price_tzs'].mean()
axes[0,2].bar(avg_season.index, avg_season.values, color=['#03A9F4','#FF5722'])
axes[0,2].set_title('Average Price by Season')
axes[0,2].set_xlabel('Season')
axes[0,2].set_ylabel('Avg Price (TZS)')

# Price by market type
df.boxplot(column='price_tzs', by='market_type', ax=axes[1,0],
           boxprops=dict(color='#2196F3'), medianprops=dict(color='red'))
# The grouped boxplot writes its own axes title, so ours is set afterwards.
axes[1,0].set_title('Price Distribution by Market Type')
axes[1,0].set_xlabel('Market Type')
axes[1,0].set_ylabel('Price (TZS)')
plt.sca(axes[1,0])
plt.xticks(rotation=10)

# Price vs Weight
axes[1,1].scatter(df['weight_kg'], df['price_tzs'], alpha=0.4, color='#9C27B0', s=20)
axes[1,1].set_title('Price vs Fish Weight')
axes[1,1].set_xlabel('Weight (kg)')
axes[1,1].set_ylabel('Price (TZS)')

# Price vs Freshness
axes[1,2].scatter(df['freshness_days'], df['price_tzs'], alpha=0.4, color='#FF5722', s=20)
axes[1,2].set_title('Price vs Freshness (Days Old)')
axes[1,2].set_xlabel('Days Since Catch')
axes[1,2].set_ylabel('Price (TZS)')

# pandas' DataFrame.boxplot(by=...) replaces the figure-level title with
# "Boxplot grouped by market_type". The dashboard title is therefore applied
# only after every panel has been drawn, so it is what ends up in the PNG.
fig.suptitle('Tanzania Fish Price Dataset — Exploratory Data Analysis', fontsize=15, fontweight='bold')

plt.tight_layout()
plt.savefig('eda_plots.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ EDA plots saved!")

In [None]:
# Correlation heatmap (numeric features)
# Pairwise correlations between the numeric predictors and the target,
# rendered as an annotated heatmap and written to 'correlation_heatmap.png'.
fig, ax = plt.subplots(figsize=(8, 6))

numeric_cols = ['weight_kg', 'freshness_days', 'distance_to_market_km', 'quantity_kg', 'price_tzs']
numeric_df = df[numeric_cols]
corr = numeric_df.corr()

heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'ax': ax,
    'linewidths': 0.5,
    'square': True,
    'cbar_kws': {'shrink': 0.8},
}
sns.heatmap(corr, **heatmap_options)

ax.set_title('Correlation Matrix of Numeric Features', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Correlation heatmap saved!")

## 4. Data Preprocessing

In [None]:
# ============================================================
# 4. DATA PREPROCESSING
# ============================================================
# Replaces each categorical column with integer codes on a copy of the raw
# frame and keeps one fitted encoder per column in `encoders`, so the same
# mapping can be reapplied (or inverted) later.

df_processed = df.copy()

# Encode categorical variables
cat_cols = ['species', 'region', 'season', 'market_type', 'quality_grade']

# NOTE(review): integer codes impose an arbitrary order on nominal
# categories, which a linear model will treat as numeric magnitude —
# consider one-hot encoding if linear-model accuracy matters.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df_processed[col] = encoder.fit_transform(df_processed[col])
    encoders[col] = encoder

    # A label's code is its position in `classes_`, so the mapping can be
    # read off directly (plain ints keep the printed dict readable).
    mapping = {label: code for code, label in enumerate(encoder.classes_)}
    print(f"✅ Encoded '{col}': {mapping}")

print("\n📋 Processed dataset (first 5 rows):")
df_processed.head()

In [None]:
# Define features and target
# Everything except the price column is a predictor; the price column is
# the regression target.
target_col = 'price_tzs'
y = df_processed[target_col]
X = df_processed.drop(columns=target_col)

n_rows, n_features = X.shape
print("Features (X):", list(X.columns))
print("Target (y): price_tzs")
print(f"\nDataset size: {n_rows} rows × {n_features} features")

# Split into train and test sets (80/20)
# A fixed seed keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\n✅ Training set: {len(X_train)} samples")
print(f"✅ Testing set:  {len(X_test)} samples")

# Feature scaling (for Linear Regression)
# Statistics are learned from the training rows only and then applied to
# both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\n✅ Features scaled successfully!")

## 5. Model Training & Evaluation

In [None]:
# ============================================================
# 5A. LINEAR REGRESSION MODEL
# ============================================================
# Fits ordinary least squares on the standardized training features and
# reports hold-out error plus 5-fold cross-validated R².

lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test rows
y_pred_lr = lr_model.predict(X_test_scaled)

# Metrics
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)
lr_cv_scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='r2')
lr_cv = lr_cv_scores.mean()

# Boxed report
lr_rule = "=" * 45
lr_report = [
    lr_rule,
    "  📊 LINEAR REGRESSION RESULTS",
    lr_rule,
    f"  MAE  : {lr_mae:>12,.2f} TZS",
    f"  RMSE : {lr_rmse:>12,.2f} TZS",
    f"  R²   : {lr_r2:>12.4f}",
    f"  CV R²: {lr_cv:>12.4f}",
    lr_rule,
]
for line in lr_report:
    print(line)

In [None]:
# ============================================================
# 5B. DECISION TREE MODEL
# ============================================================
# Fits a depth-limited regression tree on the unscaled training features
# and reports hold-out error plus 5-fold cross-validated R².

dt_model = DecisionTreeRegressor(max_depth=6, random_state=42).fit(X_train, y_train)

# Predictions on the held-out test rows
y_pred_dt = dt_model.predict(X_test)

# Metrics
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = np.sqrt(dt_mse)
dt_r2 = r2_score(y_test, y_pred_dt)
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='r2')
dt_cv = dt_cv_scores.mean()

# Boxed report
dt_rule = "=" * 45
dt_report = [
    dt_rule,
    "  🌳 DECISION TREE RESULTS",
    dt_rule,
    f"  MAE  : {dt_mae:>12,.2f} TZS",
    f"  RMSE : {dt_rmse:>12,.2f} TZS",
    f"  R²   : {dt_r2:>12.4f}",
    f"  CV R²: {dt_cv:>12.4f}",
    dt_rule,
]
for line in dt_report:
    print(line)

## 6. Model Comparison & Visualization

In [None]:
# ============================================================
# 6. MODEL COMPARISON VISUALIZATIONS
# ============================================================
# Draws a 2x2 figure comparing both models (metric bars, actual-vs-predicted
# scatter per model, residual histograms) and saves it to
# 'model_comparison.png'.

fig, axes = plt.subplots(2, 2, figsize=(15, 11))
fig.suptitle('Model Evaluation: Linear Regression vs Decision Tree', fontsize=15, fontweight='bold')

# --- Plot 1: Metrics Comparison ---
metrics = ['MAE', 'RMSE', 'R²', 'CV R²']

# MAE and RMSE are expressed in TZS while R² is unitless; plotted raw, the
# R² bars would be invisible next to the errors. Dividing the errors by the
# mean actual test price puts all four metrics on a comparable scale, which
# is what the panel title promises. Fall back to 1.0 for a zero mean.
price_scale = abs(y_test.mean()) or 1.0
lr_vals = [lr_mae / price_scale, lr_rmse / price_scale, lr_r2, lr_cv]
dt_vals = [dt_mae / price_scale, dt_rmse / price_scale, dt_r2, dt_cv]

x = np.arange(len(metrics))
width = 0.35
axes[0,0].bar(x - width/2, lr_vals, width, label='Linear Regression', color='#2196F3', alpha=0.85)
axes[0,0].bar(x + width/2, dt_vals, width, label='Decision Tree', color='#4CAF50', alpha=0.85)
axes[0,0].set_title('Metrics Comparison (MAE, RMSE normalized)')
axes[0,0].set_xticks(x)
axes[0,0].set_xticklabels(metrics)
axes[0,0].legend()
axes[0,0].set_ylabel('Value')

# Shared limits for the "perfect fit" diagonal: cover the actual values and
# BOTH models' predictions so the line spans every point in either scatter.
lim = [
    min(y_test.min(), y_pred_lr.min(), y_pred_dt.min()),
    max(y_test.max(), y_pred_lr.max(), y_pred_dt.max()),
]

# --- Plot 2: Actual vs Predicted (LR) ---
axes[0,1].scatter(y_test, y_pred_lr, alpha=0.5, color='#2196F3', s=25, label='Predictions')
axes[0,1].plot(lim, lim, 'r--', lw=2, label='Perfect Fit')
axes[0,1].set_title(f'Linear Regression: Actual vs Predicted\nR² = {lr_r2:.4f}')
axes[0,1].set_xlabel('Actual Price (TZS)')
axes[0,1].set_ylabel('Predicted Price (TZS)')
axes[0,1].legend()

# --- Plot 3: Actual vs Predicted (DT) ---
axes[1,0].scatter(y_test, y_pred_dt, alpha=0.5, color='#4CAF50', s=25, label='Predictions')
axes[1,0].plot(lim, lim, 'r--', lw=2, label='Perfect Fit')
axes[1,0].set_title(f'Decision Tree: Actual vs Predicted\nR² = {dt_r2:.4f}')
axes[1,0].set_xlabel('Actual Price (TZS)')
axes[1,0].set_ylabel('Predicted Price (TZS)')
axes[1,0].legend()

# --- Plot 4: Residuals ---
# Residual = actual - predicted, so positive values are under-predictions.
lr_resid = y_test - y_pred_lr
dt_resid = y_test - y_pred_dt
axes[1,1].hist(lr_resid, bins=25, alpha=0.6, color='#2196F3', label='LR Residuals')
axes[1,1].hist(dt_resid, bins=25, alpha=0.6, color='#4CAF50', label='DT Residuals')
axes[1,1].axvline(0, color='red', linestyle='--')
axes[1,1].set_title('Residuals Distribution')
axes[1,1].set_xlabel('Residual (TZS)')
axes[1,1].legend()

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Model comparison plots saved!")

In [None]:
# Feature importance (Decision Tree)
# Horizontal bar chart of the fitted tree's importances, sorted ascending
# so the largest bar is drawn last; the top-scoring bar(s) are highlighted.
importances = pd.Series(dt_model.feature_importances_, index=X.columns)
feat_imp = importances.sort_values(ascending=True)

top_score = feat_imp.max()
colors = np.where(feat_imp.values == top_score, '#FF9800', '#2196F3').tolist()

fig, ax = plt.subplots(figsize=(8, 5))
feat_imp.plot.barh(ax=ax, color=colors)
ax.set_title('Feature Importance — Decision Tree Model', fontsize=13, fontweight='bold')
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Feature importance plot saved!")

## 7. Select Best Model & Save

In [None]:
# ============================================================
# 7. SELECT BEST MODEL & SAVE
# ============================================================
# Prints a side-by-side metric table, keeps the model with the higher
# test-set R², and persists it together with the scaler and the encoders.

heavy_rule = "=" * 50
summary_rows = [
    "\n" + heavy_rule,
    "       🏆 MODEL SELECTION SUMMARY",
    heavy_rule,
    f"{'Metric':<15} {'Linear Reg':>15} {'Decision Tree':>15}",
    "-" * 50,
    f"{'MAE (TZS)':<15} {lr_mae:>15,.2f} {dt_mae:>15,.2f}",
    f"{'RMSE (TZS)':<15} {lr_rmse:>15,.2f} {dt_rmse:>15,.2f}",
    f"{'R² Score':<15} {lr_r2:>15.4f} {dt_r2:>15.4f}",
    f"{'CV R² Score':<15} {lr_cv:>15.4f} {dt_cv:>15.4f}",
    heavy_rule,
]
for row in summary_rows:
    print(row)

# The comparison is strict, so a tie resolves to Linear Regression.
if dt_r2 > lr_r2:
    best_model_name, best_model = "Decision Tree", dt_model
else:
    best_model_name, best_model = "Linear Regression", lr_model
best_r2 = max(dt_r2, lr_r2)

print(f"\n✅ Best Model: {best_model_name} (R² = {best_r2:.4f})")

# Save the best model
# NOTE(review): the linear model was fitted on scaled features and the tree
# on unscaled ones; model.pkl alone does not record which input it expects.
artifacts = {
    'model.pkl': best_model,
    'scaler.pkl': scaler,
    'encoders.pkl': encoders,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print("\n💾 Saved: model.pkl, scaler.pkl, encoders.pkl")

In [None]:
# Verify saved model works
# Reloads the persisted model and predicts the first five held-out rows so
# the printed values can be compared with the actual prices.
loaded_model = joblib.load('model.pkl')
sample_features = X_test[:5]

# The linear model was fitted on standardized features (X_train_scaled),
# whereas the decision tree was fitted on the unscaled frame. Give the
# reloaded model the representation it was trained on, using the persisted
# scaler, so the check is valid whichever model was selected.
if isinstance(loaded_model, LinearRegression):
    sample_features = joblib.load('scaler.pkl').transform(sample_features)

test_pred = loaded_model.predict(sample_features)
print("✅ Model loaded & verified!")
print("\nSample predictions (TZS):")
for sample_no, (pred, actual) in enumerate(zip(test_pred, y_test.values[:5]), start=1):
    print(f"  Sample {sample_no}: Predicted = {pred:>10,.2f} | Actual = {actual:>10,.2f}")

## ✅ Summary

| Item | Details |
|------|---------|
| **Dataset** | 500 records of Tanzania fish prices |
| **Features** | Species, Region, Season, Market Type, Quality, Weight, Freshness, Distance, Quantity |
| **Target** | Fish Price (TZS) |
| **Models** | Linear Regression & Decision Tree |
| **Best Model** | Decision Tree (higher test-set R² than Linear Regression) |
| **Saved Files** | `model.pkl`, `scaler.pkl`, `encoders.pkl` |