In [3]:
import pandas as pd

import pandas as pd
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report



In [4]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Material_Id,Vendor_Id,Po_Date,Actual_Delivery_Date,Lead_Time,Standard_Lead_Time,Quantity,Price,Unnamed: 8
0,M4,V3,1/1/2000,1/6/2000,4,4,800,500,
1,M2,V5,1/3/2000,1/16/2000,13,14,1700,750,
2,M3,V3,1/3/2000,1/5/2000,2,3,400,590,
3,M3,V5,1/4/2000,1/10/2000,6,6,500,450,
4,M4,V3,1/7/2000,1/11/2000,4,4,1100,440,
...,...,...,...,...,...,...,...,...,...
620,M5,V5,2/21/2002,3/5/2002,12,12,5000,300,
621,M5,V4,2/26/2002,3/7/2002,9,9,2500,350,
622,M5,V5,3/1/2002,3/12/2002,11,12,4000,300,
623,M5,V5,3/2/2002,3/14/2002,12,12,5000,300,


In [7]:
df = df.drop(columns=['Unnamed: 8'])
df.head()


Unnamed: 0,Material_Id,Vendor_Id,Po_Date,Actual_Delivery_Date,Lead_Time,Standard_Lead_Time,Quantity,Price
0,M4,V3,1/1/2000,1/6/2000,4,4,800,500
1,M2,V5,1/3/2000,1/16/2000,13,14,1700,750
2,M3,V3,1/3/2000,1/5/2000,2,3,400,590
3,M3,V5,1/4/2000,1/10/2000,6,6,500,450
4,M4,V3,1/7/2000,1/11/2000,4,4,1100,440


In [9]:
# Convert dates to datetime
df['Po_Date'] = pd.to_datetime(df['Po_Date'])
df['Actual_Delivery_Date'] = pd.to_datetime(df['Actual_Delivery_Date'])

# Extract useful features from dates
df['Po_Year'] = df['Po_Date'].dt.year
df['Po_Month'] = df['Po_Date'].dt.month
df['Po_Day'] = df['Po_Date'].dt.day


""" Year, Month, Day: Extracting features like year, month, and day can be useful for:
Trend Analysis: Analyzing patterns or trends over time (e.g., monthly or yearly trends).
Seasonality: Identifying seasonal patterns or cyclical behavior (e.g., sales might be higher in certain months).
Grouping and Aggregation: Grouping data by year, month, or day for summary statistics or aggregations."""

' Year, Month, Day: Extracting features like year, month, and day can be useful for:\nTrend Analysis: Analyzing patterns or trends over time (e.g., monthly or yearly trends).\nSeasonality: Identifying seasonal patterns or cyclical behavior (e.g., sales might be higher in certain months).\nGrouping and Aggregation: Grouping data by year, month, or day for summary statistics or aggregations.'

In [11]:
# Calculate actual lead time if not already available
if 'Lead_Time' not in df.columns:
    df['Lead_Time'] = (df['Actual_Delivery_Date'] - df['Po_Date']).dt.days

# Define features and target for lead time prediction
features = ['Material_Id', 'Vendor_Id', 'Po_Year', 'Po_Month', 'Po_Day', 'Quantity', 'Price']
target = 'Lead_Time'
df.head()

Unnamed: 0,Material_Id,Vendor_Id,Po_Date,Actual_Delivery_Date,Lead_Time,Standard_Lead_Time,Quantity,Price,Po_Year,Po_Month,Po_Day
0,M4,V3,2000-01-01,2000-01-06,4,4,800,500,2000,1,1
1,M2,V5,2000-01-03,2000-01-16,13,14,1700,750,2000,1,3
2,M3,V3,2000-01-03,2000-01-05,2,3,400,590,2000,1,3
3,M3,V5,2000-01-04,2000-01-10,6,6,500,450,2000,1,4
4,M4,V3,2000-01-07,2000-01-11,4,4,1100,440,2000,1,7


In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
X = df[features]
y = df[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Quantity', 'Price']), # Standardizes features by removing the mean and scaling to unit variance.
        ('cat', OneHotEncoder(), ['Material_Id', 'Vendor_Id', 'Po_Year', 'Po_Month', 'Po_Day']) #Encodes categorical features as a one-hot numeric array.
    ])

# Create a pipeline for preprocessing and model
pipeline = Pipeline(steps=[   # Chains together multiple steps, including preprocessing and model fitting.
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', learning_rate='constant', eta0=0.01))  #Fits a Stochastic Gradient Descent (SGD) model
    #to the data. SGDRegressor with loss='squared_error' 
])

# Fit the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")   #Linear regression model that uses stochastic gradient descent for optimization.


from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")



Mean Squared Error: 3.3894370215684626
R² Score: 0.5390778636901026


In [37]:
df['Best_Vendor'] = (
    (df.groupby('Vendor_Id')['Lead_Time'].transform('mean') <= df['Lead_Time']) & 
    (df.groupby('Vendor_Id')['Price'].transform('mean') <= df['Price'])
)

df['Best_Vendor'] = df['Best_Vendor'].astype(int)

# Define features and target for classification
X = df[features]
y = df['Best_Vendor']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for preprocessing and classification
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log', learning_rate='constant', eta0=0.01))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.984
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        84
           1       0.95      1.00      0.98        41

    accuracy                           0.98       125
   macro avg       0.98      0.99      0.98       125
weighted avg       0.98      0.98      0.98       125





In [15]:
# Function to add new data and predict
def predict_lead_time(new_data, pipeline):
    """
    Predict lead times for new data and find the best vendor.
    
    Parameters:
    - new_data (DataFrame): New data to predict.
    - pipe line (Pipeline): Trained pipeline for predictions.
    
    Returns:
       - DataFrame with predictions and best vendor information.
    """
    # Preprocess new data
    new_data_preprocessed = pipeline.named_steps['preprocessor'].transform(new_data)
    
    # Predict lead times
    predictions = pipeline.named_steps['regressor'].predict(new_data_preprocessed)     
    
    # Add predictions to new_data DataFrame
    new_data['Predicted_Lead_Time'] = predictions
    # Find the vendor with the least lead time
    best_vendor = new_data.groupby('Vendor_Id')['Predicted_Lead_Time'].mean().idxmin()
    
    return new_data, best_vendor

In [17]:
import joblib
joblib.dump(pipeline, 'lead_time_predictor.pkl')


['lead_time_predictor.pkl']

In [23]:
# Load the saved model
pipeline = joblib.load('lead_time_predictor.pkl')

# Load new data
new_data = pd.read_csv('book1.csv')
new_data

Unnamed: 0,Material_Id,Vendor_Id,Po_Date,Actual_Delivery_Date,Lead_Time,Standard_Lead_Time,Quantity,Price
0,M4,V3,01-01-2000,01-06-2000,4,4,800,500
1,M2,V5,01-03-2000,1/16/2000,13,14,1700,750
2,M3,V3,01-03-2000,01-05-2000,2,3,400,590
3,M3,V5,01-04-2000,01-10-2000,6,6,700,450
4,M4,V3,01-07-2000,01-11-2000,4,4,1100,440
5,M1,V4,01-08-2000,1/15/2000,7,8,300,400
6,M1,V1,01-09-2000,1/14/2000,5,6,300,790
7,M4,V3,01-09-2000,1/14/2000,4,4,800,460
8,M4,V4,01-09-2000,1/17/2000,7,7,1500,800
9,M2,V1,01-11-2000,1/17/2000,6,5,500,700


In [25]:
# Convert dates to datetime
new_data['Po_Date'] = pd.to_datetime(df['Po_Date'])
new_data['Actual_Delivery_Date'] = pd.to_datetime(df['Actual_Delivery_Date'])

# Extract useful features from dates
new_data['Po_Year'] = new_data['Po_Date'].dt.year
new_data['Po_Month'] = new_data['Po_Date'].dt.month
new_data['Po_Day'] = new_data['Po_Date'].dt.day

In [27]:
# Calculate actual lead time if not already available
if 'Lead_Time' not in df.columns:
   new_data['Lead_Time'] = (new_data['Actual_Delivery_Date'] - new_data['Po_Date']).dt.days

# Define features and target for lead time prediction
features = ['Material_Id', 'Vendor_Id', 'Po_Year', 'Po_Month', 'Po_Day', 'Quantity', 'Price']
target = 'Lead_Time'
df.head()

Unnamed: 0,Material_Id,Vendor_Id,Po_Date,Actual_Delivery_Date,Lead_Time,Standard_Lead_Time,Quantity,Price,Po_Year,Po_Month,Po_Day
0,M4,V3,2000-01-01,2000-01-06,4,4,800,500,2000,1,1
1,M2,V5,2000-01-03,2000-01-16,13,14,1700,750,2000,1,3
2,M3,V3,2000-01-03,2000-01-05,2,3,400,590,2000,1,3
3,M3,V5,2000-01-04,2000-01-10,6,6,500,450,2000,1,4
4,M4,V3,2000-01-07,2000-01-11,4,4,1100,440,2000,1,7


In [153]:
# Preprocess new data similarly
X_new = new_data[features]
y_new = new_data[target]

In [155]:
# Fit model with new data
pipeline.fit(X_new, y_new)  # Depending on the size of the new data, you might use partial_fit instead of fit

In [157]:
# importances = pipeline.named_steps['regressor'].coef_  '''This accesses the model component of the pipeline, which in your case is a SGDRegressor
# Larger absolute values of coefficients indicate higher importance.'''
# feature_names = pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out() '''#This attribute contains a list of transformers 
# used in the ColumnTransformer This selects the second transformer in the list of transformers, which is likely the one handling categorical features (often an OneHotEncoder)'''
# feature_importances = dict(zip(feature_names, importances))

In [159]:
# Predict lead times using the pipeline
predicted_lead_times = pipeline.predict(X_new)

# Add predictions to the new data
new_data['Predicted_Lead_Time'] = predicted_lead_times


In [161]:
 

# Extract relevant information for printing
best_vendor_summary = best_vendor[['Material_Id', 'Vendor_Id', 'Po_Date', 'Predicted_Lead_Time']]

# Print the results
print("Best suitable vendor and lead time based on new data:")
print(best_vendor_summary)


Best suitable vendor and lead time based on new data:
   Material_Id Vendor_Id    Po_Date  Predicted_Lead_Time
5           M1        V4 2000-01-08             6.926996
6           M1        V1 2000-01-09             5.181289
11          M1        V3 2000-01-12             6.834419
1           M2        V5 2000-01-03            12.635267
9           M2        V1 2000-01-11             5.877616
2           M3        V3 2000-01-03             2.432450
3           M3        V5 2000-01-04             6.054431
0           M4        V3 2000-01-01             3.759502
4           M4        V3 2000-01-07             4.167816
7           M4        V3 2000-01-09             3.901491
10          M4        V4 2000-01-11             7.505006


In [21]:
pip install mlflow

Collecting mlflow
  Using cached mlflow-2.16.2-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.2 (from mlflow)
  Using cached mlflow_skinny-2.16.2-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.0-py3-none-any.whl.metadata (4.2 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.16.2->mlflow)
  Using cached cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.16.2->mlflow)
  Using cached databricks_sdk-0.33.0-py3-none-any.whl.metadata (37 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.16.2->mlflow)
  Using cached opentelemetr

In [29]:
import mlflow

In [45]:
# # Assuming you have your true labels and predictions
# y_true = y_test  # True labels from the test set
# y_pred = pipeline.predict(X_test)  # Predictions from your model

# # Log the classification metrics
# report = classification_report(y_true, y_pred, output_dict=True)

# # Now you can access the metrics
# report_dict = report

# # Set the experiment name
# mlflow.set_experiment("First Experiment")
# mlflow.set_tracking_uri("http://127.0.0.1:5000/")

# # Start a new MLflow run
# with mlflow.start_run():
#     # Log metrics
#     mlflow.log_metrics({
#         'accuracy': report_dict['accuracy'],
#         'recall': report_dict['recall'],
#         'f1_score_macro': report_dict['macro avg']['f1-score']
#     })
    
#     # Log the model
#     mlflow.sklearn.log_model(pipeline, "SGDClassifier")


2024/09/30 12:27:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-goat-155 at: http://127.0.0.1:5000/#/experiments/449815078131393551/runs/3c16cdba77034fbf9e8b3bb455a5c2af.
2024/09/30 12:27:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/449815078131393551.


KeyError: 'recall'

In [48]:
# Log the classification metrics
report = classification_report(y_true, y_pred, output_dict=True)

# Now you can access the metrics
report_dict = report

# Set the experiment name
mlflow.set_experiment("First Experiment")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

# Start a new MLflow run
with mlflow.start_run():
    # Log metrics for the positive class (1)
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'recall': report_dict['1']['recall'],  # Access recall for class '1'
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    
    # Log the model
    mlflow.sklearn.log_model(pipeline, "SGDClassifier")


2024/09/30 12:29:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run dazzling-sponge-554 at: http://127.0.0.1:5000/#/experiments/449815078131393551/runs/29304529ce054512b2c022cb765e5b64.
2024/09/30 12:29:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/449815078131393551.


In [50]:
# Prompt for run ID
run_id = input("Enter the run ID: ")

# Register the model
model_name = "SGDClassifier"
model_uri = f"runs:/{run_id}/{model_name}"

# Register the model with MLflow
result = mlflow.register_model(model_uri, model_name)

print(f"Model registered: {result.name}, Version: {result.version}")


Enter the run ID:  29304529ce054512b2c022cb765e5b64


Successfully registered model 'SGDClassifier'.
2024/09/30 12:53:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SGDClassifier, version 1


Model registered: SGDClassifier, Version: 1


Created version '1' of model 'SGDClassifier'.
