In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.6.1


In [3]:
import pickle
import pandas as pd

In [2]:
!python -V

Python 3.9.21


In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle

# Load the March 2023 yellow-taxi trips and drop rows missing any field used below
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
df = df.dropna(subset=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'PULocationID', 'DOLocationID'])

# Trip duration in minutes
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep trips of 1-60 minutes. The .copy() makes the filtered frame independent,
# so the column assignments below do not write into a slice of the original
# (the pattern that raises pandas' SettingWithCopyWarning).
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

# Binary target: 1 when the trip lasted more than 10 minutes, else 0
df['long_trip'] = (df['duration'] > 10).astype(int)

# Sample data to reduce memory use during dev
df = df.sample(n=100_000, random_state=1)

features = ['PULocationID', 'DOLocationID', 'trip_distance']

# Only the location IDs are categorical. trip_distance stays numeric so that
# DictVectorizer passes it through as one continuous column instead of
# one-hot encoding every distinct distance string; this also matches how the
# later prediction cell feeds trip_distance (as a number) to dv.transform.
location_features = ['PULocationID', 'DOLocationID']
df[location_features] = df[location_features].astype(str)

X_dict = df[features].to_dict(orient='records')
y = df['long_trip']

# Use sparse encoding
dv = DictVectorizer(sparse=True)
X = dv.fit_transform(X_dict)

# Train model
model = LogisticRegression()
model.fit(X, y)

# Save the fitted vectorizer and model together as one (dv, model) tuple
with open('model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

print("✅ Model saved with sparse features")


✅ Model saved with sparse features


In [8]:
# Reload the (dv, model) tuple that was pickled to model.bin.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load a model.bin that this notebook (or another trusted source) produced.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [9]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip parquet file and return the rows prepared for prediction.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and converts the
    columns listed in ``categorical`` to strings, with missing IDs mapped
    to ``'-1'``.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60
    within_bounds = trips.duration.between(1, 60)
    trips = trips[within_bounds].copy()

    # Missing IDs become -1 before the int -> str conversion
    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips


In [10]:
# Load the March 2023 yellow-taxi file through read_data, which adds the
# duration column, keeps 1-60 minute trips and stringifies the location IDs.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')


In [None]:
# Turn the two location-ID columns into one dict per row, encode them with the
# already-fitted vectorizer (dv), and run the model on the result.
# NOTE(review): relies on dv and model already being in the kernel; which ones
# are in scope depends on the cells executed before this one - confirm.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)


In [12]:
# Drop rows missing any of the key fields, recompute duration in minutes and
# keep trips of 1-60 minutes.
# NOTE(review): if df came from read_data, the duration column and the 1-60
# minute filter were already applied there, so only the dropna changes
# anything - confirm this cell is still needed.
df = df.dropna(subset=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'PULocationID', 'DOLocationID'])
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]


In [13]:
# Assumes a fitted vectorizer (dv), a trained model (model) and the `features`
# list already exist in the kernel.
# NOTE(review): which dv / model / features are in scope depends on which
# cells ran last - confirm before relying on this cell.
X_dict = df[features].to_dict(orient='records')
X = dv.transform(X_dict)

# Run the model on the vectorized rows.
# NOTE(review): if `model` is the LogisticRegression reloaded from model.bin,
# these are long_trip class labels (0/1), not durations in minutes, despite
# the variable name - confirm.
predicted_durations = model.predict(X)


In [6]:
import mlflow
# Select "Lab5" as the active MLflow experiment; as the last expression in the
# cell, the returned Experiment object is what gets displayed as cell output.
mlflow.set_experiment("Lab5")


Traceback (most recent call last):
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 329, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 427, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1373, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1366, in _read_helper
    result = read_yaml(root, file_name)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/utils/file_utils.py", line 310, in read_yaml
    raise MissingConfigExcepti

<Experiment: artifact_location='file:///home/anilm/mlops-zoomcamp/mlruns/374418875240430509', creation_time=1746583596441, experiment_id='374418875240430509', last_update_time=1746583596441, lifecycle_stage='active', name='Lab5', tags={}>

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Step 1: Load the data (March 2023 yellow-taxi trips)
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

# Step 2: Clean and filter
# Duration in minutes; keep trips of 1-60 minutes. The .copy() makes the
# filtered frame independent so the column assignments in Step 3 do not write
# into a slice of the original (the SettingWithCopyWarning pattern).
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

# Step 3: Define features and target
# Location IDs are cast to str so DictVectorizer one-hot encodes them
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)

features = ['PULocationID', 'DOLocationID']
X_dict = df[features].to_dict(orient='records')
y = df['duration']

# Step 4: Split the data (80% train / 20% validation, fixed seed)
X_train_dict, X_val_dict, y_train, y_val = train_test_split(X_dict, y, test_size=0.2, random_state=42)

# Step 5: Vectorize features
# Fit the vocabulary on the training rows only, then reuse it for validation
dv = DictVectorizer()
X_train = dv.fit_transform(X_train_dict)
X_val = dv.transform(X_val_dict)

# Step 6: Train the regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 7: Predict on validation set and calculate std deviation
y_pred = model.predict(X_val)
std_dev = np.std(y_pred)

print(f"✅ Standard Deviation of Predicted Durations: {std_dev:.2f}")



✅ Standard Deviation of Predicted Durations: 6.76


In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import pyarrow as pa

# -----------------------------
# Load March data (validation)
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60
# Keep trips of 1-60 minutes. The .copy() makes the filtered frame independent
# so the column assignments below do not write into a slice of the original
# (the SettingWithCopyWarning pattern).
df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)].copy()
df_val['PULocationID'] = df_val['PULocationID'].astype(str)
df_val['DOLocationID'] = df_val['DOLocationID'].astype(str)

# -----------------------------
# Vectorization
features = ['PULocationID', 'DOLocationID']

dv = DictVectorizer()
val_dicts = df_val[features].to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)
y_val = df_val['duration']

# -----------------------------
# Train model
# NOTE(review): the model is fitted on the same March rows it predicts below,
# so the saved predictions are in-sample - confirm that is intended.
model = LinearRegression()
model.fit(X_val, y_val)

# -----------------------------
# Predict and evaluate
y_pred = model.predict(X_val)

# -----------------------------
# Create the results DataFrame
# ride_id is "2023/03_<original row index>", built with one vectorized string
# concatenation instead of a per-row Python loop.
df_result = pd.DataFrame({
    'ride_id': '2023/03_' + df_val.index.astype(str),
    'predicted_duration': y_pred
})

# -----------------------------
# Save as Parquet (uncompressed, without the DataFrame index)
output_file = 'predicted_durations_march_2023.parquet'
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

# -----------------------------
# Check the file size
import os
file_size = os.path.getsize(output_file) / (1024 * 1024)  # In MB
print(f"File size: {file_size:.2f} MB")


2025/05/07 02:09:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0e7cadcfcb59422bab6073915514cc43', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


File size: 65.46 MB


In [4]:
import argparse
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sys

# Step 1: Define command-line arguments
def parse_args():
    """Return the ``(year, month)`` pair the script should process.

    When running inside a Jupyter kernel (``ipykernel`` is loaded) there is no
    meaningful command line, so fixed defaults ``(2023, 4)`` are returned.
    Otherwise two positional integers, ``year`` and ``month``, are parsed
    from the command line.

    Returns
    -------
    tuple[int, int]
        ``(year, month)``.

    Raises
    ------
    SystemExit
        Raised by argparse when an argument is missing, is not an integer,
        or the month is outside 1-12.
    """
    if 'ipykernel' in sys.modules:  # Jupyter mode
        year = 2023
        month = 4
        return year, month

    parser = argparse.ArgumentParser(description="Predict trip duration for a given year and month.")
    parser.add_argument('year', type=int, help='Year of the dataset (e.g., 2023)')
    # Reject impossible months up front instead of letting them surface later
    # as a failed download of a non-existent file.
    parser.add_argument(
        'month',
        type=int,
        choices=range(1, 13),
        metavar='month',
        help='Month of the dataset (1-12)',
    )
    args = parser.parse_args()
    return args.year, args.month

# Step 2: Load and process the data
def load_data(year, month):
    """Download one month of yellow-taxi trips and prepare it for modelling.

    Builds the parquet URL from ``year`` and ``month``, adds a ``duration``
    column (drop-off minus pick-up, in minutes), keeps trips of 1-60 minutes
    and casts the pickup/dropoff location IDs to strings.

    Parameters
    ----------
    year : int
        Four-digit year used in the file name.
    month : int
        Month number used (zero-padded) in the file name.

    Returns
    -------
    pandas.DataFrame
        The prepared trips, or an empty DataFrame if loading or processing
        raised (the error is printed rather than re-raised, so callers are
        expected to check ``df.empty``).
    """
    file_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
    try:
        df = pd.read_parquet(file_url)
        df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
        # .copy() makes the filtered frame independent so the assignments
        # below do not write into a slice (the SettingWithCopyWarning pattern).
        df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
        df['PULocationID'] = df['PULocationID'].astype(str)
        df['DOLocationID'] = df['DOLocationID'].astype(str)
        return df
    except Exception as e:
        # Deliberate best effort: report the problem and hand back an empty frame.
        print(f"❌ Error loading data for {month}/{year}: {e}")
        return pd.DataFrame()

# Step 3: Train model and evaluate on test set
def train_and_evaluate(df):
    """Fit a linear regression on the location IDs and return the mean prediction.

    The pickup/dropoff location columns are one-hot encoded with a
    DictVectorizer fitted on all rows, the encoded data is split 80/20 with a
    fixed seed, a LinearRegression is fitted on the 80% part, and the mean of
    its predictions on the held-out 20% is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and ``duration``.

    Returns
    -------
    The mean of the predicted durations on the held-out rows.
    """
    location_columns = ['PULocationID', 'DOLocationID']
    vectorizer = DictVectorizer()
    encoded = vectorizer.fit_transform(df[location_columns].to_dict(orient='records'))
    target = df['duration']

    # The held-out targets are not needed: only the predictions are summarized.
    fit_rows, holdout_rows, fit_target, _ = train_test_split(
        encoded, target, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(fit_rows, fit_target)

    holdout_predictions = regressor.predict(holdout_rows)
    return np.mean(holdout_predictions)

# Main function
def main():
    """Resolve the year/month, load that month's trips and report the mean prediction.

    Prints a failure message and returns early when no data could be loaded;
    otherwise prints the mean predicted duration for the requested month.
    """
    year, month = parse_args()
    trips = load_data(year, month)

    if not trips.empty:
        average_minutes = train_and_evaluate(trips)
        print(f"✅ Mean Predicted Duration for {month:02d}/{year}: {average_minutes:.2f} minutes")
    else:
        print("❌ No data available for prediction.")

# Run the pipeline only when this code is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()


✅ Mean Predicted Duration for 04/2023: 15.27 minutes


In [2]:
import mlflow

# Ask MLflow for its experiments and print the returned list
# (plain print, so each entry appears as its Experiment repr).
experiments = mlflow.search_experiments()
print(experiments)


Traceback (most recent call last):
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 329, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 427, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1373, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1366, in _read_helper
    result = read_yaml(root, file_name)
  File "/home/anilm/anaconda3/envs/exp-lab2-env/lib/python3.9/site-packages/mlflow/utils/file_utils.py", line 310, in read_yaml
    raise MissingConfigExcepti

[<Experiment: artifact_location='file:///home/anilm/mlops-zoomcamp/mlruns/934363259442561227', creation_time=1746583642272, experiment_id='934363259442561227', last_update_time=1746583642272, lifecycle_stage='active', name='My_Lab5', tags={}>, <Experiment: artifact_location='file:///home/anilm/mlops-zoomcamp/mlruns/374418875240430509', creation_time=1746583596441, experiment_id='374418875240430509', last_update_time=1746583596441, lifecycle_stage='active', name='Lab5', tags={}>]
