# 🚌 Public Transport Demand Predictor
**Course:** Machine Learning | **Task:** TEST2 | **Context:** Tanzania Public Transit

This notebook covers synthetic data generation, exploratory analysis, preprocessing, model training (Linear Regression and Decision Tree), evaluation, visualization, and saving the best model.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib, warnings
# Suppress every warning category for the rest of the session so library
# notices do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Confirmation that the import cell ran to completion.
print('✅ Libraries imported!')

## 1. Dataset Generation

In [None]:
# Seed NumPy's global RNG so every run regenerates the same synthetic dataset.
np.random.seed(42)
n = 800

# Category vocabularies sampled for each simulated observation.
cities = [
    "Dar es Salaam", "Dodoma", "Arusha", "Mwanza", "Mbeya",
    "Morogoro", "Tanga", "Zanzibar", "Iringa", "Tabora",
]
routes = [
    "City Centre - Airport", "Ubungo - Kariakoo", "Mbezi - Posta", "Arusha - Moshi",
    "Dodoma - Kondoa", "Mwanza - Musoma", "Tanga - Korogwe", "Mbeya - Songea",
    "Iringa - Morogoro", "Tabora - Shinyanga",
]
transport = ["Daladala", "Bus", "BRT", "Taxi", "Tuk-tuk", "Ferry"]
day_types = ["Weekday", "Weekend", "Public Holiday"]
time_slots = [
    "Early Morning (5-7am)", "Morning Peak (7-9am)", "Midday (10am-12pm)",
    "Afternoon (1-3pm)", "Evening Peak (4-7pm)", "Night (8-10pm)",
]
weather = ["Sunny", "Cloudy", "Rainy", "Heavy Rain"]
seasons = ["Dry Season", "Short Rains", "Long Rains"]

# Categorical draws. The order of the random calls matters for reproducibility.
city_c = np.random.choice(cities, n)
route_c = np.random.choice(routes, n)
trans_c = np.random.choice(transport, n)
day_c = np.random.choice(day_types, n, p=[0.71, 0.20, 0.09])
time_c = np.random.choice(time_slots, n)
weath_c = np.random.choice(weather, n, p=[0.50, 0.25, 0.15, 0.10])
seas_c = np.random.choice(seasons, n)

# Numeric draws (same call order as the categorical draws above requires).
dist_km = np.round(np.random.uniform(2, 120, n), 1)
fare_tzs = np.round(np.random.uniform(300, 5000, n), -1)   # rounded to the nearest 10
vehicles = np.random.randint(1, 40, n)                     # upper bound is exclusive: 1..39
pop_den = np.round(np.random.uniform(500, 12000, n), 0)
wait_min = np.round(np.random.uniform(2, 60, n), 1)
temp_c = np.round(np.random.uniform(18, 36, n), 1)
nr_mkt = np.random.choice([0, 1], n, p=[0.4, 0.6])
nr_sch = np.random.choice([0, 1], n, p=[0.5, 0.5])


def _effect(labels, table, default):
    """Return the additive demand effect per label; labels absent from *table* get *default*."""
    return np.select([labels == key for key in table], list(table.values()), default=default)


# Additive contribution of each categorical driver to hourly demand.
day_effect = _effect(day_c, {'Public Holiday': 80, 'Weekend': 30}, 50)
time_effect = _effect(
    time_c,
    {
        'Morning Peak (7-9am)': 200,
        'Evening Peak (4-7pm)': 180,
        'Early Morning (5-7am)': 60,
        'Midday (10am-12pm)': 80,
        'Afternoon (1-3pm)': 70,
    },
    30,
)
weather_effect = _effect(weath_c, {'Heavy Rain': -60, 'Rainy': -30}, 0)
mode_effect = _effect(trans_c, {'BRT': 120, 'Bus': 80, 'Daladala': 60, 'Ferry': 100}, 20)
season_effect = _effect(seas_c, {'Long Rains': -20, 'Short Rains': -10}, 10)

# Integer baseline first, then the continuous terms and Gaussian noise.
base = 120 + day_effect + time_effect + weather_effect + mode_effect
noise = np.random.normal(0, 25, n)
demand = (
    base
    + pop_den*0.01 - fare_tzs*0.02 - wait_min*1.5 + vehicles*3
    + nr_mkt*40 + nr_sch*30
    + season_effect
    + noise
)
# Keep demand within 10..700 passengers/hour and store it as whole numbers.
demand = np.round(np.clip(demand, 10, 700), 0).astype(int)

df = pd.DataFrame({
    'City': city_c,
    'Route': route_c,
    'Transport_Type': trans_c,
    'Day_Type': day_c,
    'Time_Slot': time_c,
    'Weather': weath_c,
    'Season': seas_c,
    'Route_Distance_km': dist_km,
    'Fare_TZS': fare_tzs,
    'Available_Vehicles': vehicles,
    'Population_Density': pop_den,
    'Avg_Wait_Min': wait_min,
    'Temp_Celsius': temp_c,
    'Near_Market': nr_mkt,
    'Near_School': nr_sch,
    'Passengers_Per_Hour': demand,
})
df.to_csv('public_transport_dataset.csv', index=False)
print(f'Dataset: {df.shape[0]} rows x {df.shape[1]} cols')
df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Descriptive statistics for the numeric columns.
summary = df.describe()
print(summary)

# Total number of missing cells across the whole frame.
missing_total = df.isnull().sum().sum()
print('Missing:', missing_total)

In [None]:
# 2x3 dashboard of hourly demand against its main drivers, saved to
# eda_transport.png and shown inline.
demand_col = 'Passengers_Per_Hour'
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Public Transport Demand Analysis — Tanzania',
             fontsize=16, fontweight='bold', color='#0369a1')

# (0,0) Distribution of the target.
axes[0, 0].hist(df[demand_col], bins=30, color='#0369a1', alpha=0.8, edgecolor='white')
axes[0, 0].set_title('Demand Distribution')
axes[0, 0].set_xlabel('Passengers/hr')

# (0,1) Mean demand per transport type, smallest to largest.
df.groupby('Transport_Type')[demand_col].mean().sort_values().plot(
    kind='barh', ax=axes[0, 1], color='#0284c7')
axes[0, 1].set_title('Avg Demand by Transport Type')

# (0,2) Mean demand per time slot, smallest to largest.
df.groupby('Time_Slot')[demand_col].mean().sort_values().plot(
    kind='barh', ax=axes[0, 2], color='#38bdf8')
axes[0, 2].set_title('Avg Demand by Time Slot')

# (1,0) Mean demand per day type.
df.groupby('Day_Type')[demand_col].mean().plot(
    kind='bar', ax=axes[1, 0], color=['#0369a1', '#0284c7', '#38bdf8'])
axes[1, 0].set_title('Avg Demand by Day Type')
axes[1, 0].tick_params(axis='x', rotation=15)

# (1,1) Fare against demand.
axes[1, 1].scatter(df['Fare_TZS'], df[demand_col], alpha=0.3, color='#0369a1', s=15)
axes[1, 1].set_title('Fare vs Demand')
axes[1, 1].set_xlabel('Fare (TZS)')

# (1,2) Mean demand per weather condition.
# groupby() sorts labels alphabetically, so a positional colour list would pair
# colours with the wrong conditions. Colours are keyed by label instead, and
# the bars are ordered from fair to severe weather.
weather_colors = {'Sunny': '#f59e0b', 'Cloudy': '#64748b', 'Rainy': '#3b82f6', 'Heavy Rain': '#1d4ed8'}
weather_mean = df.groupby('Weather')[demand_col].mean()
weather_order = [w for w in weather_colors if w in weather_mean.index]
# Any label without a dedicated colour is kept at the end with a neutral grey.
weather_order += [w for w in weather_mean.index if w not in weather_colors]
weather_mean.loc[weather_order].plot(
    kind='bar', ax=axes[1, 2],
    color=[weather_colors.get(w, '#94a3b8') for w in weather_order])
axes[1, 2].set_title('Avg Demand by Weather')
axes[1, 2].tick_params(axis='x', rotation=15)

plt.tight_layout()
plt.savefig('eda_transport.png', dpi=150, bbox_inches='tight')
plt.show()
print('EDA plots saved.')

## 3. Preprocessing

In [None]:
# Column groups: categoricals are label-encoded, numerics are standardised.
cat_cols = [
    'City', 'Route', 'Transport_Type', 'Day_Type',
    'Time_Slot', 'Weather', 'Season',
]
num_cols = [
    'Route_Distance_km', 'Fare_TZS', 'Available_Vehicles', 'Population_Density',
    'Avg_Wait_Min', 'Temp_Celsius', 'Near_Market', 'Near_School',
]

# Integer-code each categorical column on a copy of the data and keep the
# fitted encoder per column so the same mapping can be reapplied later.
df_enc = df.copy()
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_enc[col] = le.fit_transform(df_enc[col])
    encoders[col] = le

# Separate features from the target and hold out 20% for testing.
y = df_enc['Passengers_Per_Hour']
X = df_enc.drop(columns='Passengers_Per_Hour')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
# The unscaled frames are left untouched; scaled copies get the `_s` suffix.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train[num_cols])
test_scaled = scaler.transform(X_test[num_cols])

X_train_s = X_train.copy()
X_train_s[num_cols] = train_scaled
X_test_s = X_test.copy()
X_test_s[num_cols] = test_scaled

print(f'Train: {X_train.shape[0]} | Test: {X_test.shape[0]}')

## 4. Model Training

In [None]:
# Linear regression is trained and evaluated on the standardised features.
lr = LinearRegression()
lr.fit(X_train_s, y_train)
lr_p = lr.predict(X_test_s)

# The decision tree uses the unscaled features; depth and split size are capped.
dt = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=8,
    random_state=42,
)
dt.fit(X_train, y_train)
dt_p = dt.predict(X_test)

print('Both models trained!')

## 5. Evaluation

In [None]:
def ev(name,yt,yp):
    """Build one metrics row for a model.

    Args:
        name: Label stored under the 'Model' key.
        yt: True target values.
        yp: Predicted target values.

    Returns:
        Dict with keys 'Model', 'MAE', 'RMSE' (both rounded to 2 decimals)
        and 'R2' (rounded to 4 decimals).
    """
    mae = mean_absolute_error(yt, yp)
    rmse = np.sqrt(mean_squared_error(yt, yp))
    r2 = r2_score(yt, yp)
    return dict(Model=name, MAE=round(mae, 2), RMSE=round(rmse, 2), R2=round(r2, 4))
# Score both models on the held-out split and tabulate the metrics.
metric_rows = [
    ev(label, y_test, preds)
    for label, preds in (('Linear Regression', lr_p), ('Decision Tree', dt_p))
]
lr_m, dt_m = metric_rows
results = pd.DataFrame(metric_rows)
print(results.to_string(index=False))

# The winner is the model with the highest R2 (first row wins a tie).
top_row = results['R2'].idxmax()
best = results.at[top_row, 'Model']
print(f'\nBest: {best}')

## 6. Visualizations

In [None]:
# Three-panel model comparison saved to model_eval_transport.png:
# error metrics, R² scores, and actual-vs-predicted for the best model.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Model Evaluation — Transport Demand Predictor', fontsize=15, fontweight='bold')

# Panel 1: grouped bars of MAE and RMSE, one bar per model in each group.
x = np.arange(2)
w = 0.35
axes[0].bar(x - w/2, [lr_m['MAE'], lr_m['RMSE']], w, label='Linear Regression', color='#0369a1')
axes[0].bar(x + w/2, [dt_m['MAE'], dt_m['RMSE']], w, label='Decision Tree', color='#38bdf8')
axes[0].set_xticks(x)
axes[0].set_xticklabels(['MAE', 'RMSE'])
axes[0].set_title('Error Metrics')
axes[0].legend()

# Panel 2: R² per model with the value printed above each bar.
r2_scores = [lr_m['R2'], dt_m['R2']]
axes[1].bar(['Linear Regression', 'Decision Tree'], r2_scores, color=['#0369a1', '#38bdf8'])
# R² can be negative for a model worse than predicting the mean; extend the
# lower limit in that case so the bar is not cut off. Unchanged when both >= 0.
axes[1].set_ylim(min(0, min(r2_scores) - 0.05), 1)
axes[1].set_title('R² Score')
for i, v in enumerate(r2_scores):
    axes[1].text(i, v + 0.01, str(v), ha='center', fontweight='bold')

# Panel 3: actual vs predicted for the winning model, with the y = x reference line.
bp = lr_p if best == 'Linear Regression' else dt_p
bt = y_test
axes[2].scatter(bt, bp, alpha=0.4, color='#0284c7', s=20)
mn, mx = bt.min(), bt.max()
axes[2].plot([mn, mx], [mn, mx], 'r--', lw=2, label='Perfect')
axes[2].set_title(f'Actual vs Predicted ({best})')
axes[2].set_xlabel('Actual')
axes[2].set_ylabel('Predicted')
axes[2].legend()

plt.tight_layout()
plt.savefig('model_eval_transport.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Save Best Model

In [None]:
# Persist the winning estimator together with the fitted preprocessing
# objects and metadata, so a consumer can rebuild the feature matrix.
best_model = lr if best == 'Linear Regression' else dt
bundle = {
    'model': best_model,
    'encoders': encoders,              # fitted LabelEncoder per categorical column
    'scaler': scaler,                  # StandardScaler fitted on the training rows
    'feature_cols': list(X.columns),   # column order the model was trained on
    'cat_cols': cat_cols,
    'num_cols': num_cols,
    'best_name': best,
    'lr_metrics': lr_m,
    'dt_metrics': dt_m,
    # Only the linear model was trained on scaled inputs, so the scaler
    # should be applied at inference only when it is the saved model.
    'use_scaler': (best == 'Linear Regression'),
}
joblib.dump(bundle, 'transport_model.pkl')
print(f'Saved transport_model.pkl ({best}) ✓')