In [None]:
# Environment setup: upgrade pip, then install the packages this notebook uses
# via shell commands. NOTE(review): no versions are pinned except urllib3.
!pip install --upgrade pip
!pip install fosforml numpy pandas matplotlib scikit-learn seaborn python-dateutil
# Remove the installed urllib3 and replace it with the pinned 1.26.15 release.
# NOTE(review): presumably pinned for compatibility with another dependency - confirm.
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15
# NOTE(review): fosforml was already requested in the install line above.
!pip install fosforml 
!pip install fosforio
!pip install refractio
!pip install refractml
# cloudpickle is installed and then upgraded to the latest release with -U.
!pip install cloudpickle
!pip install -U cloudpickle

In [None]:
# Additional analysis/modelling packages.
# NOTE(review): seaborn and scipy are each listed twice in this command, and no versions are pinned.
!pip install seaborn scipy xgboost pandas dice-ml tabulate numpy scikit-learn pandas-profiling plotly matplotlib scipy statsmodels seaborn pydantic-settings

In [None]:
from fosforml.model_manager.snowflakesession import get_session

# Obtain the session object that later cells use to run SQL and to register the model.
# NOTE(review): presumably a Snowflake session, judging by the module name - confirm.
my_session = get_session()

In [None]:
# Name of the source table to query.
# NOTE(review): `df` is rebound to a DataFrame in a later cell, so this cell
# must run before the query cell.
df = "ATTRITION_MASTER_TABLE"

In [None]:
# Build a select-everything statement for the table named in `df`,
# then hand it to the session's sql() method.
select_all_sql = "select * from {}".format(df)
sf_df = my_session.sql(select_all_sql)

In [None]:
import pandas as pd
# Convert the query result into a local object through its to_pandas() method.
pandas_df = sf_df.to_pandas()

In [None]:
# Display the loaded data for a quick visual check.
pandas_df

In [None]:
# Per-column count of missing values in the data as loaded.
print(pandas_df.isna().sum())

In [None]:
# Cleaned copy: discard every row that has at least one missing value.
Original_df = pandas_df.dropna(axis=0, how="any")

In [None]:
# Per-column count of missing values after the rows with gaps were dropped.
print(Original_df.isna().sum())

In [None]:
# Cleaned frame without the listed columns.
df = Original_df.drop(
    columns=[
        "USER_ID",
        "EMPLOYEE_ID",
        "JOB_STARTDATE",
        "JOB_ENDDATE",
        "SCHOOL_ENDDATE",
        "CHURN_OTHER",
        "PEOPLE_JOINED_BEFORE_AND_LEFT_IN_THIS_MONTH",
        "PEOPLE_JOINED_AND_NEVER_LEFT",
        "TOTAL_EMPLOYEE",
        "RETENTION",
        "SUM_OF_TENURE",
        "SUM_OF_AGE",
    ]
)

In [None]:
# Display the frame produced by the column drop above.
df

In [None]:
# Column groups used by the later feature-selection and preprocessing cells.

# Columns routed through the categorical branch of the preprocessor.
CATEGORICAL_COLUMNS = [
    "ROLE",
    "ETHNICITY",
    "ORGANIZATION_TYPE",
    "ORGANIZATION_OWNERSHIP",
    "COMPANY_NAME",
    "CITY",
    "STATE",
    "DISTANCE",
    "COUNTRY",
    "GENDER",
    "BUSINESS_TRAVEL",
    "ENVIRONMENT_SATISFACTION",
    "JOB_SATISFACTION",
    "MARITAL_STATUS",
    "OVER_TIME",
    "PERFORMANCE_RATING",
    "RELATIONSHIP_SATISFACTION",
    "WORK_LIFE_BALANCE",
    "CHURN_STATUS_TABLE",
    "DEGREE_CLEAN",
]

# Columns routed through the numerical branch of the preprocessor.
NUMERICAL_COLUMNS = [
    "SALARY",
    "SENIORITY",
    "TENURE_MONTHS",
    "MONTHS_AFTER_COLLEGE",
    "BIRTH_YEAR",
    "AGE",
    "OVERTIME_HOURS",
    "PERCENTAGE_SALARY_HIKE",
]

# Target column.
LABEL_COLUMNS = ["CHURN_VALUE_TABLE"]

# Columns outside both feature groups (same names as the earlier drop cell).
DROPPED_COLUMNS = [
    "USER_ID",
    "EMPLOYEE_ID",
    "JOB_STARTDATE",
    "JOB_ENDDATE",
    "SCHOOL_ENDDATE",
    "CHURN_OTHER",
    "PEOPLE_JOINED_BEFORE_AND_LEFT_IN_THIS_MONTH",
    "PEOPLE_JOINED_AND_NEVER_LEFT",
    "TOTAL_EMPLOYEE",
    "RETENTION",
    "SUM_OF_TENURE",
    "SUM_OF_AGE",
]

# Name reserved for the prediction output column.
OUTPUT_COLUMNS = ["PREDICTION"]

In [None]:
# Keep only the configured feature names that exist in the cleaned frame.
feature_columns = [
    col
    for col in CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
    if col in Original_df.columns
]
# Same filter for the label names (rebinds LABEL_COLUMNS).
LABEL_COLUMNS = list(filter(lambda col: col in Original_df.columns, LABEL_COLUMNS))

# Feature frame: the filtered features followed by DROPPED_COLUMNS.
# NOTE(review): DROPPED_COLUMNS is not filtered, so a name missing from the frame raises KeyError.
X = Original_df[feature_columns + DROPPED_COLUMNS]
# Label values flattened to a 1-D array.
y = Original_df[LABEL_COLUMNS].values.ravel()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and everything derived from it) reproducible across kernel restarts.
# NOTE(review): this splits `pandas_df` (the frame before dropna), not
# `Original_df` - confirm that rows with missing values are meant to be included.
x_train, x_test = train_test_split(pandas_df, test_size=0.2, random_state=42)

In [None]:
# Pull the target column out of the training split, then keep the remaining
# columns as the training features.
# NOTE: re-running this cell fails because the column is gone after the first run.
y_train = x_train['CHURN_VALUE_TABLE']
x_train = x_train.drop(columns=['CHURN_VALUE_TABLE'])
y_train

In [None]:
# Pull the target column out of the held-out split, then keep the remaining
# columns as the test features.
# NOTE: re-running this cell fails because the column is gone after the first run.
y_test = x_test['CHURN_VALUE_TABLE']
x_test = x_test.drop(columns=['CHURN_VALUE_TABLE'])
y_test

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from xgboost import XGBClassifier

# Categorical branch: fill gaps with the constant 'missing', then ordinal-encode;
# categories not seen during fit are encoded as -1.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Numerical branch: fill gaps with the column mean, then min-max scale with clipping.
numerical_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(clip=True),
)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, CATEGORICAL_COLUMNS),
        ('num', numerical_transformer, NUMERICAL_COLUMNS),
    ]
)

# Full model: preprocessing followed by an XGBoost classifier with default settings.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ]
)

# Fit on the training split and predict the held-out rows.
pipeline.fit(x_train, y_train)
result = pipeline.predict(x_test)

In [None]:
# Display the predictions computed for the held-out split.
result

In [None]:
from joblib import dump, load

# Serialize the fitted pipeline to a joblib file in the working directory.
filename = "Attrition.joblib"
dump(value=pipeline, filename=filename)

In [None]:
# Predictions for the held-out split, plus the predict_proba output for the same rows.
y_pred = pipeline.predict(x_test)
y_prob = pipeline.predict_proba(x_test)

In [None]:
from refractml import *

from refractml.constants import MLModelFlavours

In [None]:
@scoring_func
def score(model, request):
    """Score one record carried in a request payload.

    Args:
        model: Object with a ``predict`` method; it is called with a one-row
            pandas DataFrame built from the payload.
        request: Object whose ``json`` attribute is a mapping containing a
            ``"payload"`` entry. The payload is the string form of a dict of
            column values; an actual dict is accepted as well.

    Returns:
        str: The first element of ``model.predict(...)`` converted with ``str``.
    """
    payload_data = request.json["payload"]
    if isinstance(payload_data, dict):
        # Already structured: no parsing needed.
        data_json = payload_data
    else:
        # Parse the payload taken from the request. The previous code evaluated
        # an unrelated name `payload`, which only resolved when a notebook
        # global of that name happened to exist.
        # SECURITY: eval() executes arbitrary code found in the payload. It is
        # kept because the payload is the repr of a dict, which may contain
        # values that a literal-only parser cannot read. Do not expose this to
        # untrusted callers without replacing it with a safe parser.
        data_json = eval(payload_data)
    data = pd.DataFrame([data_json])
    prediction = str(model.predict(data)[0])
    return prediction

In [None]:
import requests
# NOTE(review): `datetime` is not referenced directly in this cell; presumably it
# is kept in scope so eval() inside score() can resolve datetime reprs - confirm.
import datetime
import pandas as pd

# Build a sample payload from one row of the held-out features. The split is
# named `x_test` in this notebook (`X_test` is never defined).
payload = str(x_test.iloc[123].to_dict())
# A bare requests.Request is used only as a holder for the `json` attribute
# that score() reads.
req = requests.Request()
req.json = {"payload": payload}

print(score(pipeline, req))

In [None]:
# Display the payload mapping attached to the sample request.
req.json

In [None]:
from fosforml import register_model

# Register the fitted pipeline together with the train/test splits and the
# test-set predictions. `model_obj` must be `pipeline`: no object named
# `model` is defined anywhere in this notebook.
register_model(
  model_obj=pipeline,
  session=my_session,
  x_train=x_train,
  y_train=y_train,
  x_test=x_test,
  y_test=y_test,
  y_pred=y_pred,
  source="Notebook",
  dataset_name="ATTRITION_MASTER_TABLE",
  dataset_source="Snowflake",
  name="attrition_master_dataset",
  description="This is a Model for the attrition analyzing",
  flavour="sklearn",
  model_type="classification",
  conda_dependencies=["scikit-learn==1.3.2"]
)

In [None]:
from fosforml import register_model

# Register the fitted pipeline under the name "attrition_master_table".
# The splits are named `x_train` / `x_test` in this notebook; the capitalised
# `X_train` / `X_test` used previously are never defined.
register_model(
  model_obj=pipeline,  # the fitted sklearn Pipeline
  session=my_session,
  x_train=x_train,  # training features (label column already removed)
  y_train=pd.Series(y_train),  # passed as a pandas Series
  x_test=x_test,  # held-out features matching y_test / y_pred
  y_test=y_test,
  y_pred=y_pred,
  source="Notebook",
  dataset_name="ATTRITION_MASTER_TABLE",
  dataset_source="Snowflake",
  name="attrition_master_table",
  description="This is a Model for the analyzing attrition",
  flavour="sklearn",
  model_type="classification",
  conda_dependencies=["scikit-learn==1.3.2"]
)