In [11]:
import pandas as pd
import numpy as np
import sklearn.preprocessing

data = {
    'timestamp': pd.to_datetime([
        '2025-07-30 08:30:00', '2025-07-30 09:15:00', '2025-07-30 09:45:00',
        '2025-07-30 12:00:00', '2025-07-30 13:30:00', '2025-07-30 17:00:00',
        '2025-07-30 18:30:00', '2025-07-30 19:00:00', '2025-07-30 21:00:00',
        '2025-07-30 22:30:00'
    ]),
    'cab_type': ['Mini', 'Sedan', 'Mini', 'Sedan', 'Mini', 'SUV', 'Sedan', 'Mini', 'Sedan', 'Mini'],
    'distance_km': [5.5, 12.1, 3.2, 8.0, 4.5, 6.7, 15.3, 4.1, 9.8, 7.6],
    'temperature_celsius': [28.5, 29.0, 29.2, 32.0, 31.5, 29.8, 27.5, np.nan, 26.0, 25.5],
    'weather_condition': ['Clear', 'Clear', 'Clear', 'Clear', 'Clear', 'Rainy', 'Rainy', 'Rainy', 'Clear', 'Foggy'],
    'demand_multiplier': [1.0, 1.5, 1.2, 1.0, 1.0, 2.0, 2.5, 2.2, 1.2, 1.8]
}

df = pd.DataFrame(data)
df



Unnamed: 0,timestamp,cab_type,distance_km,temperature_celsius,weather_condition,demand_multiplier
0,2025-07-30 08:30:00,Mini,5.5,28.5,Clear,1.0
1,2025-07-30 09:15:00,Sedan,12.1,29.0,Clear,1.5
2,2025-07-30 09:45:00,Mini,3.2,29.2,Clear,1.2
3,2025-07-30 12:00:00,Sedan,8.0,32.0,Clear,1.0
4,2025-07-30 13:30:00,Mini,4.5,31.5,Clear,1.0
5,2025-07-30 17:00:00,SUV,6.7,29.8,Rainy,2.0
6,2025-07-30 18:30:00,Sedan,15.3,27.5,Rainy,2.5
7,2025-07-30 19:00:00,Mini,4.1,,Rainy,2.2
8,2025-07-30 21:00:00,Sedan,9.8,26.0,Clear,1.2
9,2025-07-30 22:30:00,Mini,7.6,25.5,Foggy,1.8


In [12]:
#finding missing values
print("---Missing values in original data")
print(df.isnull().sum())


---Missing values in original data
timestamp              0
cab_type               0
distance_km            0
temperature_celsius    1
weather_condition      0
demand_multiplier      0
dtype: int64


In [13]:
# Artificially add some missing values to the 'temperature_celsius' column for practice
# df.loc[df.sample(frac=0.05, random_state=42).index, 'temperature_celsius'] = np.nan

print("\n--- Missing Values After Artificial Removal ---")
print(df.isnull().sum())


--- Missing Values After Artificial Removal ---
timestamp              0
cab_type               0
distance_km            0
temperature_celsius    1
weather_condition      0
demand_multiplier      0
dtype: int64


In [14]:
median_temp = df['temperature_celsius'].median()
print(f"\nThe median temperature to fill with is: {median_temp:.2f}")

# 2. Fill the missing values (NaNs) with the median
# inplace=True modifies the DataFrame directly
df['temperature_celsius'].fillna(median_temp,inplace=True)
print(df.isnull().sum())

print("\n--- Missing Values After Imputation ---")
print(df.isnull().sum())


The median temperature to fill with is: 29.00
timestamp              0
cab_type               0
distance_km            0
temperature_celsius    0
weather_condition      0
demand_multiplier      0
dtype: int64

--- Missing Values After Imputation ---
timestamp              0
cab_type               0
distance_km            0
temperature_celsius    0
weather_condition      0
demand_multiplier      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['temperature_celsius'].fillna(median_temp,inplace=True)


In [15]:
#implementing with pd_getdummies
print("--- DataFrame Before Encoding (First 5 Rows) ---")
print(df.head())


--- DataFrame Before Encoding (First 5 Rows) ---
            timestamp cab_type  distance_km  temperature_celsius  \
0 2025-07-30 08:30:00     Mini          5.5                 28.5   
1 2025-07-30 09:15:00    Sedan         12.1                 29.0   
2 2025-07-30 09:45:00     Mini          3.2                 29.2   
3 2025-07-30 12:00:00    Sedan          8.0                 32.0   
4 2025-07-30 13:30:00     Mini          4.5                 31.5   

  weather_condition  demand_multiplier  
0             Clear                1.0  
1             Clear                1.5  
2             Clear                1.2  
3             Clear                1.0  
4             Clear                1.0  


In [16]:
df_encoded = pd.get_dummies(df , columns=['cab_type','weather_condition'], drop_first=True)

#see the result
print("\n--- Data frame after encoding ----")
print(df_encoded.head())


--- Data frame after encoding ----
            timestamp  distance_km  temperature_celsius  demand_multiplier  \
0 2025-07-30 08:30:00          5.5                 28.5                1.0   
1 2025-07-30 09:15:00         12.1                 29.0                1.5   
2 2025-07-30 09:45:00          3.2                 29.2                1.2   
3 2025-07-30 12:00:00          8.0                 32.0                1.0   
4 2025-07-30 13:30:00          4.5                 31.5                1.0   

   cab_type_SUV  cab_type_Sedan  weather_condition_Foggy  \
0         False           False                    False   
1         False            True                    False   
2         False           False                    False   
3         False            True                    False   
4         False           False                    False   

   weather_condition_Rainy  
0                    False  
1                    False  
2                    False  
3                 

In [17]:
print("\nNew columns created:", df_encoded.columns)


New columns created: Index(['timestamp', 'distance_km', 'temperature_celsius', 'demand_multiplier',
       'cab_type_SUV', 'cab_type_Sedan', 'weather_condition_Foggy',
       'weather_condition_Rainy'],
      dtype='object')


In [18]:
#day 13 
df_encoded = pd.get_dummies(df, columns=['cab_type', 'weather_condition'], drop_first = True)
print("Data type of 'timestamp' BEFORE conversion:", df_encoded['timestamp'].dtype)

#Convert the column to datetime objects using pd.to_datetime
df_encoded['timestamp'] = pd.to_datetime(df_encoded['timestamp'])

print("Data type of 'timestamp' AFTER conversion:", df_encoded['timestamp'].dtype)


Data type of 'timestamp' BEFORE conversion: datetime64[ns]
Data type of 'timestamp' AFTER conversion: datetime64[ns]


In [19]:
#extracting new fatures with .dt from date and time column
df_encoded['hour_of_day'] = df_encoded['timestamp'].dt.dayofweek
df_encoded['day_of_week'] = df_encoded['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
df_encoded['is_weekend'] = (df_encoded['timestamp'].dt.dayofweek >= 5).astype(int)  # 1 if True, 0 if False

df_final = df_encoded.drop('timestamp', axis=1)

#final column
print("\n--- DataFrame with New Time-Based Features ---")
print(df_final[['hour_of_day', 'day_of_week', 'is_weekend']].head())

print("\n--- All Final Columns ---")
print(df_final.columns)





--- DataFrame with New Time-Based Features ---
   hour_of_day  day_of_week  is_weekend
0            2            2           0
1            2            2           0
2            2            2           0
3            2            2           0
4            2            2           0

--- All Final Columns ---
Index(['distance_km', 'temperature_celsius', 'demand_multiplier',
       'cab_type_SUV', 'cab_type_Sedan', 'weather_condition_Foggy',
       'weather_condition_Rainy', 'hour_of_day', 'day_of_week', 'is_weekend'],
      dtype='object')


In [20]:
# Feature scaling puts all features on a level playing field, ensuring that the model judges them based on their predictive power, not their arbitrary scale.
#now separate features and target 

X = df_final.drop('demand_multiplier', axis=1)
y = df_final['demand_multiplier']

print("---Shape of our features(X)---")
print(X.shape)

print("\n--- Shape of our Target (y) ---")
print(y.shape)


---Shape of our features(X)---
(10, 9)

--- Shape of our Target (y) ---
(10,)


In [23]:
from sklearn.preprocessing import StandardScaler

#initializing the scaler
scaler = StandardScaler()

#fit the scaler to the features and transform them
#.fit_transform() calculates the mean/std and applies the transformation in one step
X_scaled = scaler.fit_transform(X)

#the output is a numpy array , so lets convert it dataframe to view it
X_scaled_df = pd.DataFrame(X_scaled , columns=X.columns)

print("\n---Scaled Features---")
print(X_scaled_df.head())

#lets verufy the scaling
print("\n---Statistical Summary of Scaled Features---")
print(X_scaled_df.describe())


---Scaled Features---
   distance_km  temperature_celsius  cab_type_SUV  cab_type_Sedan  \
0    -0.602504            -0.151369     -0.333333       -0.816497   
1     1.221591             0.100912     -0.333333        1.224745   
2    -1.238174             0.201825     -0.333333       -0.816497   
3     0.088441             1.614597     -0.333333        1.224745   
4    -0.878882             1.362317     -0.333333       -0.816497   

   weather_condition_Foggy  weather_condition_Rainy  hour_of_day  day_of_week  \
0                -0.333333                -0.654654          0.0          0.0   
1                -0.333333                -0.654654          0.0          0.0   
2                -0.333333                -0.654654          0.0          0.0   
3                -0.333333                -0.654654          0.0          0.0   
4                -0.333333                -0.654654          0.0          0.0   

   is_weekend  
0         0.0  
1         0.0  
2         0.0  
3         0

In [24]:
from sklearn.model_selection import train_test_split

#performing the split
X_train , X_test , y_train , y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42
)

#lets check the shapes of our new datasets to verify the split
print("---Shapes of Split Data---")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

---Shapes of Split Data---
X_train shape: (8, 9)
X_test shape: (2, 9)
y_train shape: (8,)
y_test shape: (2,)
