In [2]:
import pandas as pd
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2024.12.0-py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.12.0.* (from s3fs)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading s3fs-2024.12.0-py3-none-any.whl (30 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
Installing collected packages: fsspec, s3fs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.6.0
    Uninstalling fsspec-2023.6.0:
      Successfully uninstalled fsspec-2023.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-ai 2.28.3 requires faiss-cpu!=1.8.0.post0,<2.0.0,>=1.8.0, which is not installed.
jupyter-scheduler 2.9.0 requires fsspec==2023.6.0, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.12.0 s3fs-2024.12.0


In [4]:
def preprocess_data(file_path):
    """Load the store CSV and turn it into a model-ready feature table.

    The file is read as semicolon-delimited, UTF-8 text in which a comma is
    the decimal mark. The ``firstorder``, ``lastorder`` and ``created``
    columns are parsed as ``day.month.two-digit-year`` dates; values that do
    not match become missing. Every row that contains a missing value in any
    column is then discarded.

    Two day-count features are added (``first_last_days_diff`` and
    ``created_first_days_diff``), the ``custid`` column and the three date
    columns are removed, and ``favday`` / ``city`` are one-hot encoded.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The transformed DataFrame, or ``None`` if reading or transforming
        failed (the error is printed rather than raised).
    """
    date_columns = ("firstorder", "lastorder", "created")

    # Stage 1: load the raw file; any failure here is reported and ends the run.
    try:
        raw = pd.read_csv(file_path, sep=";", decimal=",", encoding="utf-8")
    except Exception as read_error:
        print(f"Error reading CSV: {read_error}")
        return None

    # Stage 2: feature engineering; again, failures are printed, not raised.
    try:
        # Unparseable dates turn into NaT so the dropna below removes those rows.
        for name in date_columns:
            raw[name] = pd.to_datetime(raw[name], format="%d.%m.%y", errors="coerce")

        complete = raw.dropna()

        # Whole days from the first order to the last order.
        order_span = complete["lastorder"] - complete["firstorder"]
        # Whole days from the first order to the record's creation date.
        created_gap = complete["created"] - complete["firstorder"]

        features = complete.assign(
            first_last_days_diff=order_span.dt.days,
            created_first_days_diff=created_gap.dt.days,
        )

        # The identifier and the raw dates are not kept as features.
        features = features.drop(columns=["custid", "created", "firstorder", "lastorder"])

        # One indicator column per distinct favday / city value.
        return pd.get_dummies(features, prefix=["favday", "city"], columns=["favday", "city"])
    except Exception as prep_error:
        print(f"Error during preprocessing: {prep_error}")
        return None


# Run configuration.
# NOTE(review): model_name and env are not referenced in the cells visible here;
# presumably later cells consume them -- confirm before removing.
model_name = "churn_model"
env = "dev"

# S3 bucket that holds the input data.
default_bucket = "customer-chunk-genai-project"

# Load and preprocess the dataset straight from S3.
storedata = preprocess_data(f"s3://{default_bucket}/data/storedata_total.csv")

# preprocess_data reports a failure by printing the error and returning None.
# Stop here with a clear message instead of letting a later cell fail with an
# unrelated-looking AttributeError on None.
if storedata is None:
    raise RuntimeError(
        f"Preprocessing s3://{default_bucket}/data/storedata_total.csv failed; "
        "see the error printed above."
    )

In [5]:
# Preview the first rows of the preprocessed frame (head() shows five rows by default).
storedata.head()

Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,first_last_days_diff,...,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BLR,city_BOM,city_DEL,city_MAA
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,0,...,True,False,False,False,False,False,False,False,True,False
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,1024,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0.0,0.0,33.58,0.059908,0,0,0,217,...,False,False,False,False,False,True,False,False,True,False
3,0,0,0.0,0.0,54.96,0.0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
4,1,30,90.0,13.333333,111.91,0.00885,0,0,0,791,...,True,False,False,False,False,False,False,True,False,False
