# DATA TRANSFORMATION using SNOWPARK DATAFRAME on FOSFOR

In [1]:
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd
import statsmodels
import matplotlib.pyplot as plt
import seaborn as sns

# create_temp_table warning suppression
import warnings; warnings.simplefilter('ignore')

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from fosforio import snowflake
from snowflake.snowpark.functions import col, lit, when, year, to_date


Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


In [4]:
from fosforml.model_manager.snowflakesession import get_session

# Obtain the session object from the Fosfor helper; the calls made on it and on
# the frames derived from it are Snowpark Session/DataFrame APIs.
my_session = get_session()

# Source table for every transformation in this notebook.
table_name = 'AUTO_INSURANCE_CLAIMS_DATA'

# Reference the table through the Snowpark API instead of hand-assembling a
# "select * from ..." string: all columns are still returned, but no SQL text
# is built with string formatting.
sf_df = my_session.table(table_name)


In [10]:
# 1. Keep only the rows whose INCIDENT_SEVERITY equals 'Major Damage' and whose POLICY_STATE equals 'CT'
is_major_damage = col("INCIDENT_SEVERITY") == lit("Major Damage")
is_ct_policy = col("POLICY_STATE") == lit("CT")
filtered_df = sf_df.filter(is_major_damage & is_ct_policy)


In [6]:

# 2. Narrow the filtered rows down to the columns used by the later transformation steps
analysis_columns = [
    "MONTHS_AS_CUSTOMER",
    "CUSTOMER_AGE",
    "POLICY_NUMBER",
    "POLICY_BIND_DATE",
    "POLICY_STATE",
    "POLICY_ANNUAL_PREMIUM",
    "INCIDENT_DATE",
    "INCIDENT_SEVERITY",
    "NUMBER_OF_VEHICLES_INVOLVED",
    "TOTAL_CLAIM_AMOUNT_PAID",
    "AUTO_MAKE",
    "AUTO_MODEL",
    "AUTO_YEAR",
]
selected_df = filtered_df.select(analysis_columns)

In [7]:

# 3. Parse the VARCHAR date columns into DATE values, overwriting each column under its own name.
# No format argument is passed to to_date, so parsing relies on Snowflake's default date-format handling.
transformed_df = (
    selected_df
    .with_column("INCIDENT_DATE", to_date(col("INCIDENT_DATE")))
    .with_column("POLICY_BIND_DATE", to_date(col("POLICY_BIND_DATE")))
)

In [8]:

# 4. Bucket each vehicle by AUTO_YEAR: "Old" when < 2000, "Mid" when >= 2000 and < 2010, "New" otherwise.
# NOTE(review): any row matching neither condition -- presumably including a NULL AUTO_YEAR -- is
# labelled "New" by the otherwise() branch; confirm that is the intended handling.
auto_year = col("AUTO_YEAR")
is_old = auto_year < 2000
is_mid = (auto_year >= 2000) & (auto_year < 2010)
vehicle_age_category = when(is_old, lit("Old")).when(is_mid, lit("Mid")).otherwise(lit("New"))
transformed_df = transformed_df.with_column("VEHICLE_AGE_CATEGORY", vehicle_age_category)

In [9]:
# 5. Add POLICY_ACTIVE_YEARS: the calendar year of INCIDENT_DATE minus the calendar year of POLICY_BIND_DATE.
# Only the year parts are subtracted, so month and day are ignored (e.g. a December bind date and a
# January incident one month later yield 1).
incident_year = year(col("INCIDENT_DATE"))
bind_year = year(col("POLICY_BIND_DATE"))
transformed_df = transformed_df.with_column("POLICY_ACTIVE_YEARS", incident_year - bind_year)
