In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
import pandas as pd
import numpy as np
import pickle as pkl

In [18]:
training_data = pd.read_csv("../../data/PreProcessingfinaldata.csv")

training_data.head()

Unnamed: 0.1,Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr
0,0,2023-01-01,13516527.77,101.830513,94.148539,Percentage Discount,1152.82,810.68,1122.12,707.48,11230.47,6214.43,6723.33,11311.42,703.13,3.047174,4.333516,2.471559,2.008197,2.116972
1,1,2023-01-08,95081753.02,101.830513,94.148539,Percentage Discount,7472.53,6973.85,7179.02,6834.33,11380.75,11069.6,9505.5,14004.01,6562.0,2.636847,2.732868,2.930657,3.354279,3.570124
2,2,2023-01-15,94804406.04,101.830513,94.148539,Percentage Discount,7204.29,7383.5,7185.08,6963.4,10270.3,8861.17,7836.04,14442.24,7318.19,2.679349,3.136116,2.879586,2.938546,3.776793
3,3,2023-01-22,94833974.28,101.830513,94.148539,Percentage Discount,7726.84,6522.2,7710.31,7479.41,8335.56,11601.91,6663.51,11917.89,7745.75,3.015955,3.836348,2.983655,2.858832,2.823088
4,4,2023-01-29,94806994.45,101.830513,94.148539,Percentage Discount,6987.22,6969.68,7094.25,7294.12,9575.21,7488.18,12158.0,6753.84,7044.42,2.744554,3.511152,2.338256,2.403631,3.01939


In [19]:
# Select features for scaling and encoding
numerical_features = ['sales_amount', 'base_price', 'final_price',
                      'facebook_spend', 'google ads_spend', 'influencer marketing_spend',
                      'instagram_spend', 'ooh_spend', 'print_spend', 'radio_spend',
                      'tv_spend', 'youtube_spend', 'facebook_ctr', 'google ads_ctr',
                      'influencer marketing_ctr', 'instagram_ctr', 'youtube_ctr']

categorical_features = ['promotion_type']

In [20]:
# Save the fitted scaler and encoder after training  
def save_preprocessing_objects(scaler, encoder, scaler_path="scaler.pkl", encoder_path="encoder.pkl"):  
    pkl.dump(scaler, open(scaler_path, "wb"))  
    pkl.dump(encoder, open(encoder_path, "wb"))  

In [21]:
# Load the fitted scaler and encoder for inference  
def load_preprocessing_objects(scaler_path="scaler.pkl", encoder_path="encoder.pkl"):  
    scaler = pkl.load(open(scaler_path, "rb"))  
    encoder = pkl.load(open(encoder_path, "rb"))  
    return scaler, encoder  

In [22]:
# Save the training schema
def save_training_schema(data, schema_path="schema.pkl"):
    schema = {
        "columns": list(data.columns),
        "dtypes": {col: str(data[col].dtype) for col in data.columns},
    }
    pkl.dump(schema, open(schema_path, "wb"))


# Load the training schema
def load_training_schema(schema_path="schema.pkl"):
    return pkl.load(open(schema_path, "rb"))

In [23]:
def pre_processing(data):
    # Step 1: Scale numerical features
    scaler = StandardScaler()
    scaled_numerical_data = scaler.fit_transform(data[numerical_features])

    # Convert scaled numerical data to a DataFrame
    scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_features)

    # Step 2: Encode categorical features
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_categorical_data = encoder.fit_transform(data[categorical_features])

    # Get column names for encoded categorical features
    encoded_categorical_columns = encoder.get_feature_names_out(categorical_features)

    # Convert encoded categorical data to a DataFrame
    encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_categorical_columns)

    # Step 3: Combine scaled numerical data and encoded categorical data
    processed_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)
    save_preprocessing_objects(scaler, encoder)
    return processed_df

processed_df = pre_processing(training_data)

# Display the processed DataFrame
processed_df.head()

Unnamed: 0,sales_amount,base_price,final_price,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr,promotion_type_Buy One Get One Free,promotion_type_Percentage Discount
0,-7.20824,-1.888843,-1.079989,-6.138594,-6.036483,-6.258867,-6.41476,0.452694,-1.329475,-0.967204,0.269455,-6.2631,0.408029,2.567653,-1.354243,-2.366897,-2.237695,0.0,1.0
1,0.156877,-1.888843,-1.079989,0.671809,0.104967,0.342237,-0.124517,0.511463,0.407651,0.102746,1.192669,-0.319184,-0.548539,-0.479803,-0.315938,0.70132,1.459348,0.0,1.0
2,0.131833,-1.888843,-1.079989,0.382741,0.513173,0.348841,0.007995,0.077207,-0.382501,-0.539284,1.342926,0.447983,-0.449456,0.287937,-0.431442,-0.246289,1.985146,0.0,1.0
3,0.134503,-1.888843,-1.079989,0.945865,-0.345091,0.921263,0.537766,-0.6794,0.598106,-0.990209,0.477397,0.881749,0.335249,1.621101,-0.196076,-0.427986,-0.441227,0.0,1.0
4,0.132067,-1.888843,-1.079989,0.148817,0.100812,0.24985,0.347535,-0.194618,-0.873741,1.122828,-1.293211,0.170239,-0.29745,1.001963,-1.655725,-1.465558,0.058197,0.0,1.0


In [24]:
# Check if there are any zero or negative values in the numerical columns
print((processed_df[numerical_features] <= 0).sum())

sales_amount                   1
base_price                    27
final_price                   27
facebook_spend                24
google ads_spend              19
influencer marketing_spend    22
instagram_spend               19
ooh_spend                     28
print_spend                   28
radio_spend                   31
tv_spend                      24
youtube_spend                 24
facebook_ctr                  24
google ads_ctr                27
influencer marketing_ctr      31
instagram_ctr                 29
youtube_ctr                   27
dtype: int64


In [25]:
# Replace zero or negative values with a small positive value (e.g., 1e-6)
processed_df[numerical_features] = processed_df[numerical_features].applymap(lambda x: max(x, 1e-6))

# Apply log transformation to the target variable (sales_amount) and features (X)
target_column = 'sales_amount'
X = processed_df.drop(columns=[target_column])
y = processed_df[target_column]

# Apply log transformation (log-log regression)
X_log = np.log1p(X)  # log(1 + x) to handle zero and negative values
y_log = np.log1p(y)  # log(1 + y) to handle zero and negative values

In [26]:
save_training_schema(processed_df)

In [27]:
# Initialize Lasso model with a regularization parameter (alpha)
lasso_model = Lasso(alpha=0.1)  

# Train the model on log-transformed features and target
lasso_model.fit(X_log, y_log)

# Get the coefficients of the Lasso model
lasso_coefficients = pd.DataFrame({
    'Feature': X_log.columns,
    'Coefficient': lasso_model.coef_
}).sort_values(by='Coefficient', ascending=False)

# Logging the model score 
print("r_squared", lasso_model.score(X_log, y_log))

# Logging coefficients 
for feature, coef in zip(lasso_coefficients['Feature'], lasso_coefficients['Coefficient']):
    print(f"coef_{feature}", coef)  

# Save the coefficients DataFrame as a CSV and log it as an artifact
coefficients_file_path = "./tmp/lasso_coefficients.csv"
lasso_coefficients.to_csv(coefficients_file_path, index=False)  # Saving DataFrame to CSV file

# Log the CSV file as an artifact 
print(coefficients_file_path)  # Log the CSV file as an artifact

# Logging the model 
print("model_type", "lasso") 

with open("lassomodel_trained.pkl", "wb") as f:
    pkl.dump(lasso_model, f)

r_squared 0.0
coef_base_price 0.0
coef_final_price 0.0
coef_promotion_type_Buy One Get One Free 0.0
coef_youtube_ctr 0.0
coef_instagram_ctr 0.0
coef_influencer marketing_ctr 0.0
coef_google ads_ctr -0.0
coef_facebook_ctr -0.0
coef_youtube_spend -0.0
coef_tv_spend -0.0
coef_radio_spend 0.0
coef_print_spend 0.0
coef_ooh_spend 0.0
coef_instagram_spend 0.0
coef_influencer marketing_spend 0.0
coef_google ads_spend 0.0
coef_facebook_spend -0.0
coef_promotion_type_Percentage Discount -0.0
./tmp/lasso_coefficients.csv
model_type lasso


In [28]:
infer_df = pd.read_csv("./databricks/input1.csv")
model = pkl.load(open("./lassomodel.pkl", "rb"))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
# Validate and enforce schema on inference data  
def enforce_schema(inference_data, schema):  
    # Ensure all required columns are present  
    for col in schema["columns"]:  
        if col not in inference_data.columns:  
            if schema["dtypes"][col] == "object":  
                inference_data[col] = "missing"  # Fill missing categorical columns  
            else:  
                inference_data[col] = np.nan  # Fill missing numerical columns  
      
    # Ensure datatypes match the training schema  
    for col in schema["columns"]:  
        inference_data[col] = inference_data[col].astype(schema["dtypes"][col])  
      
    # Ensure column order matches the training schema  
    inference_data = inference_data[schema["columns"]]  
      
    return inference_data

In [54]:
# Preprocessing function for inference  
def pre_processing_inference(data, numerical_features, categorical_features, scaler, encoder, schema):  
    # Step 0: Handle missing values  
    # For numerical features, you can fill missing values with mean/median/etc.  
    data[numerical_features] = data[numerical_features].fillna(0)  
      
    # For categorical features, fill missing values with a placeholder (e.g., "missing")  
    data[categorical_features] = data[categorical_features].fillna("missing")  
      
    # Step 1: Scale numerical features using the pre-fitted scaler  
    scaled_numerical_data = scaler.transform(data[numerical_features])  
    # Convert scaled numerical data to a DataFrame  
    scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_features)  
      
    # Step 2: Encode categorical features using the pre-fitted encoder  
    encoded_categorical_data = encoder.transform(data[categorical_features])  
    # Get column names for encoded categorical features  
    encoded_categorical_columns = encoder.get_feature_names_out(categorical_features)  
    # Convert encoded categorical data to a DataFrame  
    encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_categorical_columns)  
      
    # Step 3: Combine scaled numerical data and encoded categorical data  
    processed_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)  
    
    processed_df = processed_df.drop("sales_amount", axis=1).replace(np.NaN, 0)
    
    # Enforce schema on inference data  
    data = enforce_schema(data, schema) 
      
    return processed_df
  
# Example usage during inference  
# Load the pre-fitted scaler and encoder  
scaler, encoder = load_preprocessing_objects()  
schema = load_training_schema()
# Perform preprocessing for inference  
processed_inference_df = pre_processing_inference(infer_df, numerical_features, categorical_features, scaler, encoder, schema)  
  
# Display the processed DataFrame for inference
print(processed_inference_df.head())  

   base_price  final_price  facebook_spend  google ads_spend  \
0   -1.888843  -507.850096       -8.623255         -6.036483   
1   -1.888843  -507.850096       -8.623255         -6.036483   

   influencer marketing_spend  instagram_spend  ooh_spend  print_spend  \
0                   -6.258867         -6.41476   0.452694    -1.329475   
1                   -6.258867         -6.41476   0.452694    -1.329475   

   radio_spend  tv_spend  youtube_spend  facebook_ctr  google ads_ctr  \
0    -0.967204  0.269455        -6.2631      0.408029        1.932676   
1    -0.967204  0.269455        -6.2631      0.408029        1.932676   

   influencer marketing_ctr  instagram_ctr  youtube_ctr  \
0                 -1.354243      -6.944316    -2.237695   
1                 -1.354243      -6.944316    -2.237695   

   promotion_type_Buy One Get One Free  promotion_type_Percentage Discount  
0                                  0.0                                 1.0  
1                               

In [52]:
lasso_model.predict(processed_inference_df.loc[0].to_frame().T)

array([0.1270629])

In [38]:
training_data.columns.to_list(), training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  53 non-null     int64  
 1   week                        53 non-null     object 
 2   sales_amount                53 non-null     float64
 3   base_price                  53 non-null     float64
 4   final_price                 53 non-null     float64
 5   promotion_type              53 non-null     object 
 6   facebook_spend              53 non-null     float64
 7   google ads_spend            53 non-null     float64
 8   influencer marketing_spend  53 non-null     float64
 9   instagram_spend             53 non-null     float64
 10  ooh_spend                   53 non-null     float64
 11  print_spend                 53 non-null     float64
 12  radio_spend                 53 non-null     float64
 13  tv_spend                    53 non-nu

(['Unnamed: 0',
  'week',
  'sales_amount',
  'base_price',
  'final_price',
  'promotion_type',
  'facebook_spend',
  'google ads_spend',
  'influencer marketing_spend',
  'instagram_spend',
  'ooh_spend',
  'print_spend',
  'radio_spend',
  'tv_spend',
  'youtube_spend',
  'facebook_ctr',
  'google ads_ctr',
  'influencer marketing_ctr',
  'instagram_ctr',
  'youtube_ctr'],
 None)

In [66]:
infer_df

Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,...,youtube_ctr,extra_field,sales_region,city__name,city__population,undefined_field,invalid_format_field,percentage_discount,promotion_type_Buy One Get One Free,promotion_type_Percentage Discount
0,01-01-2023,13516527.77,101.830513,0.0,Percentage Discount,-1152.82,810.68,1122.12,707.48,11230.47,...,2.116972,InvalidField,North,CityName,100000.0,99999.0,01-01-2023,120.0,,
1,,13516527.77,101.830513,0.0,missing,-1152.82,810.68,1122.12,707.48,11230.47,...,2.116972,,East,,,,,,,


In [None]:
from typing import Optional
from pydantic import BaseModel, ValidationError
import pandas as pd


# Define the schema for your dataset using Pydantic
class DatasetSchema(BaseModel):
    week: Optional[str]  # Allow missing or NaN values for string fields
    sales_amount: Optional[float]  # Allow missing or NaN values for numerical fields
    base_price: Optional[float]
    final_price: Optional[float]
    promotion_type: Optional[str]
    facebook_spend: Optional[float]
    google_ads_spend: Optional[float]
    influencer_marketing_spend: Optional[float]
    instagram_spend: Optional[float]
    ooh_spend: Optional[float]
    print_spend: Optional[float]
    radio_spend: Optional[float]
    tv_spend: Optional[float]
    youtube_spend: Optional[float]
    facebook_ctr: Optional[float]
    google_ads_ctr: Optional[float]
    influencer_marketing_ctr: Optional[float]
    instagram_ctr: Optional[float]
    youtube_ctr: Optional[float]


# Function to validate a single record
def validate_record(record: dict) -> dict:
    try:
        validated_record = DatasetSchema(**record).dict()  # Validate the record
        return validated_record
    except ValidationError as e:
        print(f"Validation error: {e}")
        return None  # Return None for invalid records


# Function to validate an entire DataFrame
def validate_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Replace NaN values with None to make them compatible with Pydantic
    df = df.where(pd.notnull(df), None)
    print(df)
    validated_records = []
    for index, row in df.iterrows():
        validated_record = validate_record(row.to_dict())  # Validate each record
        if validated_record:
            validated_records.append(validated_record)  # Append valid records
    return pd.DataFrame(validated_records)


validate_dataframe(infer_df)

         week  sales_amount  base_price  final_price       promotion_type  \
0  01-01-2023   13516527.77  101.830513          0.0  Percentage Discount   
1        None   13516527.77  101.830513          0.0              missing   

   facebook_spend  google ads_spend  influencer marketing_spend  \
0        -1152.82            810.68                     1122.12   
1        -1152.82            810.68                     1122.12   

   instagram_spend  ooh_spend  ...  youtube_ctr   extra_field  sales_region  \
0           707.48   11230.47  ...     2.116972  InvalidField         North   
1           707.48   11230.47  ...     2.116972          None          East   

   city__name  city__population  undefined_field  invalid_format_field  \
0    CityName          100000.0          99999.0            01-01-2023   
1        None               NaN              NaN                  None   

   percentage_discount  promotion_type_Buy One Get One Free  \
0                120.0                    