In [None]:
import numpy as np
import pandas as pd


In [None]:
# Combined MQI tables written by mqi_data_import.ipynb.
model_info = pd.read_pickle("mqi_data_combined/model_info_combined.pkl")
model_state = pd.read_pickle("mqi_data_combined/model_state_combined.pkl")
model_actions = pd.read_pickle("mqi_data_combined/model_actions_combined.pkl")
model_cases = pd.read_pickle("mqi_data_combined/model_cases_combined.pkl")

# Extra model_info columns written by other_data_stuff.ipynb.
model_info_addedcols = pd.read_pickle("mqi_data_clean/model_info_clean_added_col_3.pkl")

In [None]:
# ***********START MODEL INFO *******************
# Print the columns, dtypes and non-null counts of model_info as loaded.
model_info.info()

In [None]:
def drop_modelinfo_deactivate_rows(model_info_df=None):
    """Find the model_info rows that were ingested after a now-inactive model was deactivated.

    The newest record of each model decides whether the model is currently active:

    case 1: newest record has active == True
        the model is treated as active for its whole history; none of its rows are returned.
    case 2: newest record has active == False (or null, which is treated as False)
        the model is currently deactivated or deleted. A save is required to deactivate or
        delete a model, so the newest ``lastsavetime`` is taken as the deactivation point and
        every row with ``ingestion_ts`` >= that point is returned for removal.

    NOTE: older dates compare as less than newer dates.

    Args:
        model_info_df (DataFrame, optional): frame to analyse; defaults to the notebook-level
            ``model_info``. It must already be sorted by ``ingestion_ts`` descending, because
            "first" per ``modelid`` is used as the current value. The frame is modified in
            place: nulls in "active" become False and a "current_active_value" column is added.

    Returns:
        list, dataframe: index labels of the rows to remove, and a frame indexed by ``modelid``
        whose "newest_lastsavedate" column holds the deactivation date of every currently
        inactive model.
    """
    frame = model_info if model_info_df is None else model_info_df

    # Unknown (null) state is treated as inactive so the model is evaluated under case 2.
    # Assign the result back instead of calling fillna(inplace=True) on the selected column:
    # the chained in-place form no longer updates the parent frame under pandas Copy-on-Write.
    frame["active"] = frame["active"].fillna(False)

    # Newest record per model (frame is sorted newest-first) gives the current active value.
    frame["current_active_value"] = frame.groupby("modelid")["active"].transform("first")

    # Only models that are not active now, restricted to the columns needed below.
    cur_set_false = frame.loc[
        frame["current_active_value"].ne(True),
        ["modelid", "lastsavetime", "ingestion_ts", "active"],
    ].copy()

    # Newest (nearest to now) save date of each inactive model, repeated on every one of its rows.
    cur_set_false["newest_lastsavedate"] = cur_set_false.groupby("modelid")[
        "lastsavetime"
    ].transform("max")

    # One row per inactive model; reused to apply the same cut-off to the other tables.
    cur_set_false_unique = cur_set_false.groupby("modelid")[["newest_lastsavedate"]].max()

    ingested_after_deactivation = (
        cur_set_false["ingestion_ts"] >= cur_set_false["newest_lastsavedate"]
    )
    return (
        cur_set_false.index[ingested_after_deactivation].tolist(),
        cur_set_false_unique,
    )


# Newest ingestion first, then renumber the rows so the index labels returned below are unique.
model_info = model_info.sort_values(by="ingestion_ts", ascending=False).reset_index(drop=True)

# Remove every row ingested at or after a currently-inactive model's last save date
# (a save is required to deactivate a model, so the last save marks the deactivation point).
print(model_info.shape)
index_to_drop, cur_deactivate_df = drop_modelinfo_deactivate_rows()
model_info = model_info.drop(index=index_to_drop)

# Whatever is left belongs to periods in which the model was active.
model_info['active'] = True

# Attach the extra position/site columns from model_info_addedcols.
# NOTE(review): the selected right-hand columns do not include "modelid", so this merge only
# works if "modelid" is an index level of model_info_addedcols; also, several right-hand rows
# per modelid would duplicate model_info rows - confirm both.
model_info = model_info.merge(
    model_info_addedcols[
        ["position_code", "position_description", "site", "position_class"]
    ],
    how="left",
    left_on="modelid",
    right_on="modelid",
)

print(model_info.shape)


In [None]:
# Count the missing values in each model_info column.
model_info.isna().sum()


In [None]:

def modinfo_clean_unbuiltmodels(model_info_df=None):
    """Return the index labels of model_info rows that have no ``lastbuildtime``.

    Rows without a last build time, compared against the model state table, look like bad
    models: most of their actual / expected values are 0. The belief is that they were added
    without data to backfill and were never fully trained, so they are removed.

    Args:
        model_info_df (DataFrame, optional): frame to inspect; defaults to the notebook-level
            ``model_info``. It is not modified.

    Returns:
        List: index labels of the rows whose ``lastbuildtime`` is null.
    """
    frame = model_info if model_info_df is None else model_info_df
    return frame.index[frame["lastbuildtime"].isna()].tolist()


# Rows missing a build time inherit the newest lastbuildtime recorded for the same model, if any.
model_info["lastbuildtime"] = model_info["lastbuildtime"].fillna(
    model_info.groupby("modelid")["lastbuildtime"].transform("max")
)

# Models still without a build time never had one: remove their rows.
print(model_info.shape)
model_info = model_info.drop(index=modinfo_clean_unbuiltmodels())
print(model_info.shape)
print(f"after drop null last build date: {model_info['lastbuildtime'].isna().sum()}")

# ***********END MODEL INFO*******************


In [None]:
# ***********START MODEL STATES *******************
# Print the columns, dtypes and non-null counts of model_state as loaded.
model_state.info()

In [None]:
# Count the missing values in each model_state column.
model_state.isna().sum()


In [None]:
# Keep only the model_state rows whose model ID is still present in the model info table.
print(model_state.shape)
# .copy() detaches the filtered frame, so the column assignments made on model_state in later
# cells do not trigger SettingWithCopyWarning.
model_state = model_state.loc[model_state["modelid"].isin(model_info["modelid"])].copy()
print(model_state.shape)


In [None]:
# Drop model_state rows recorded once a model was no longer active.
# cur_deactivate_df["newest_lastsavedate"] (built from the model_info table) is the date a model
# was deactivated; model IDs without an entry there map to a missing date.
deleted_date = model_state["modelid"].map(cur_deactivate_df["newest_lastsavedate"])

print(model_state.shape)
# The cut-off is one day BEFORE the deactivation date, because the tables in the data lake have
# slightly different ingestion times. Comparisons against a missing date are False, so models
# without a deactivation date keep all their rows.
recorded_after_deactivation = model_state["ingestion_ts"] >= (
    deleted_date - pd.Timedelta(1, unit='d')
)
# Filter with the boolean mask rather than drop(index=...): a label-based drop would also remove
# rows that merely share an index label with a flagged row if the index is not unique.
model_state = model_state.loc[~recorded_after_deactivation].copy()
print(model_state.shape)


In [None]:
# Cast activealerts to the pandas string dtype, then label empty / whitespace-only entries "noAlert".
# NOTE(review): entries that are missing (NA) rather than blank are left as NA by this replace.
model_state["activealerts"] = (
    model_state["activealerts"]
    .astype("string")
    .replace(r"^\s*$", "noAlert", regex=True)
)

# Sanity check: the number of distinct model IDs in state and info should be close.
print(
    f"Difference between model ID in model state vs model info: {len(model_state['modelid'].unique()) - len(model_info['modelid'].unique())}"
)

In [None]:
# Remove the model_info rows whose model ID has no model_state rows. On inspection these were
# single rows with all zeros for the actual and expected values.
print(model_info.shape)
model_info = model_info[
    model_info["modelid"].isin(model_state["modelid"].unique().tolist())
]
print(model_info.shape)
print(
    f"Difference between model ID in model state vs model info: {len(model_state['modelid'].unique()) - len(model_info['modelid'].unique())}"
)

# ***********END MODEL STATES *******************

In [None]:

# NOTE(review): `b` is not referenced anywhere else in the visible notebook; this looks like a
# leftover exploratory load of a raw data-lake file - confirm and remove the cell if unused.
b = pd.read_pickle("MQI_dataLake_raw/model_actions_compress_1-2.pkl")


In [None]:
# ***********START MODEL ACTIONS*******************
# Print the columns, dtypes and non-null counts of model_actions as loaded.
model_actions.info()


In [None]:
# Count the missing values in each model_actions column.
model_actions.isna().sum()


In [None]:
# Keep only the model_actions rows whose model ID is still present in the model info table.
print(model_actions.shape)
# .copy() detaches the filtered frame, so the column assignments made on model_actions in later
# cells do not trigger SettingWithCopyWarning.
model_actions = model_actions.loc[model_actions["modelid"].isin(model_info["modelid"])].copy()
print(model_actions.shape)


In [None]:
# Store action notes with the pandas string dtype; rows without a comment get an empty string.
model_actions["actionnote"] = model_actions["actionnote"].astype("string").fillna("")
model_actions.isna().sum()
# ***********END MODEL ACTIONS*******************

In [None]:
# DROP ANY ACTIONS TAKEN WHEN THE MODEL WAS NOT ACTIVE
# cur_deactivate_df["newest_lastsavedate"] (built from the model_info table) is the date a model
# was deactivated; model IDs without an entry there map to a missing date.
deleted_date = model_actions["modelid"].map(cur_deactivate_df["newest_lastsavedate"])

print(model_actions.shape)
# The cut-off is one day BEFORE the deactivation date, because the tables in the data lake have
# slightly different ingestion times. Comparisons against a missing date are False, so models
# without a deactivation date keep all their rows. Actions are dated by 'changedate'.
changed_after_deactivation = model_actions['changedate'] >= (
    deleted_date - pd.Timedelta(1, unit='d')
)
# Filter with the boolean mask rather than drop(index=...): a label-based drop would also remove
# rows that merely share an index label with a flagged row if the index is not unique.
model_actions = model_actions.loc[~changed_after_deactivation].copy()
print(model_actions.shape)

In [None]:
# ***********START MODEL CASES *******************
# Print the columns, dtypes and non-null counts of model_cases as loaded.
model_cases.info()

In [None]:
# Remove cases known to come from non Asset 360 models: those created by CHRISTOPHER M CHRISMAN
# after the cut-off below (1 month after his last comment).
print(model_cases.shape)
model_cases = model_cases.loc[
    ~(
        model_cases['casemgmt_created_by_name'].eq('CHRISTOPHER M CHRISMAN')
        & model_cases['casemgmt_date_created'].gt('2022-03-24T04:00:00.000')
    )
]
print(model_cases.shape)
# ***********END MODEL CASES *******************

In [None]:
#! This is not an efficient way to do this, but it only needs to run once; it could be improved if it ever has to run repeatedly.
'''
    UPDATE MODEL IDS FOR CHANGED MODEL TYPES
    It was found that a model's type may change over time (e.g. it starts as a fixed limit, then at some point changes to APR).
    As a consequence, a model score or other frequencies will not be accurately represented, because the model's age is incorrect.
    1333 models have changed types; for now remove these rows, but in future create a function to find the change date and add
    each type period as a new model.

'''
def check_status_change(model_info):
    """Count how many distinct model types each model ID has had.

    Args:
        model_info (DataFrame): frame with "modelid" and "modeltype" columns.

    Returns:
        dataframe, list: frame indexed by ``modelid`` with a "modeltype_count" column holding
        the number of distinct model types of that model, and the list of all model types
        ordered from most to least frequent (null types are not listed).
    """
    # Aggregate straight into a named one-column frame instead of filling an empty DataFrame.
    modelinfo_type = (
        model_info.groupby("modelid")["modeltype"].nunique().to_frame("modeltype_count")
    )

    # value_counts() orders the types by descending frequency.
    modeltype_lst = model_info["modeltype"].value_counts().index.tolist()

    return modelinfo_type, modeltype_lst


def get__dates_of_changes(modelinfo_type, modeltype_lst, model_info_df=None):
    """Get the first and last ingestion date of every model type, for models that changed type.

    Args:
        modelinfo_type (DataFrame): frame indexed by model ID with a "modeltype_count" column.
        modeltype_lst (list): every model type to look for.
        model_info_df (DataFrame, optional): model info records to scan; defaults to the
            notebook-level ``model_info``. It is not modified.

    Returns:
        dataframe: one row per model ID that has more than one model type, holding the first
        non-null value of each model info column plus, for every type ``T`` in
        ``modeltype_lst``, the columns "T_start" / "T_end" with the earliest / latest
        ``ingestion_ts`` at which the model had that type (null if it never had it).

    NOTE(review): start/end are the min/max over all rows of a type, so a model that switches
    back to an earlier type gets overlapping periods - confirm this cannot happen in the data.
    """
    source = model_info if model_info_df is None else model_info_df

    # Model IDs that have had more than one model type.
    multi_type_ids = modelinfo_type.loc[modelinfo_type['modeltype_count'] > 1].index.unique()

    # All records of those models; copied so the sort and the new columns never touch `source`.
    kept_cols = ['modelid', 'modelname', 'modeltype', 'lastbuildtime', 'lastsavetime', 'ingestion_ts']
    changed = source.loc[source['modelid'].isin(multi_type_ids), kept_cols].copy()
    changed = changed.sort_values(by="ingestion_ts", ascending=False)

    # For each type, stamp the rows of that type with the model's first/last ingestion of it.
    for modtype in modeltype_lst:
        is_type = changed['modeltype'] == modtype
        ts_per_model = changed.loc[is_type].groupby("modelid")["ingestion_ts"]
        changed.loc[is_type, f"{modtype}_start"] = ts_per_model.transform("min")
        changed.loc[is_type, f"{modtype}_end"] = ts_per_model.transform("max")

    # first() takes the first non-null value per column, collapsing each model to a single row
    # that carries the start/end of every type it has had.
    return changed.groupby("modelid", group_keys=False).first()


def create_new_modelid(modelinfo_type_updated, model_types=None):
    """Create one new model ID for every (model, model type) period.

    For each row and each model type, a new ID "<old id>_<type>" is produced when the row has
    a non-null "<type>_start"; types the model never had are skipped.

    Args:
        modelinfo_type_updated (DataFrame): frame indexed by the old model ID, with
            "<type>_start" and "<type>_end" columns for every model type.
        model_types (list, optional): model types to look for; defaults to the notebook-level
            ``modeltype_lst``.

    Returns:
        dataframe: indexed by the new model ID, with columns "modeltype_start",
        "modeltype_end" and "old_modid".
    """
    types = modeltype_lst if model_types is None else model_types

    # Collect plain records and build the frame once; concatenating inside the loop copies the
    # accumulated frame on every iteration.
    new_ids = []
    records = []
    for old_id, row in modelinfo_type_updated.iterrows():
        for modtype in types:
            start = row[f'{modtype}_start']
            if pd.isna(start):
                continue
            new_ids.append(f"{old_id}_{modtype}")
            records.append(
                {
                    'modeltype_start': start,
                    'modeltype_end': row[f'{modtype}_end'],
                    'old_modid': old_id,
                }
            )

    return pd.DataFrame(
        records,
        index=new_ids,
        columns=['modeltype_start', 'modeltype_end', 'old_modid'],
    )

def update_with_new_modelid(new_modid_toadd, df, time_offset=0):
    """Re-key rows of ``df`` from an old model ID to the new per-type model IDs.

    For every new model ID, the rows that still carry its old model ID and whose
    ``ingestion_ts`` lies inside that ID's period (both ends inclusive, widened by
    ``time_offset`` days on each side) get the new ID written into "modelid".

    Args:
        new_modid_toadd (DataFrame): indexed by the new model ID, with columns
            "modeltype_start", "modeltype_end" and "old_modid".
        df (DataFrame): table with "modelid" and "ingestion_ts" columns; modified in place.
        time_offset (int, optional): number of days by which each period is widened at both
            ends. Defaults to 0.

    Returns:
        dataframe: the same dataframe that was passed, with updated model IDs.

    NOTE(review): rows are matched on ``ingestion_ts`` only, and new IDs are applied in the row
    order of ``new_modid_toadd``, so with overlapping periods the earlier row wins - confirm.
    """
    slack = pd.Timedelta(time_offset, unit='d')
    for new_id, period in new_modid_toadd.iterrows():
        belongs_to_old_model = df['modelid'] == period['old_modid']
        inside_period = df['ingestion_ts'].between(
            period['modeltype_start'] - slack,
            period['modeltype_end'] + slack,
        )
        df.loc[belongs_to_old_model & inside_period, 'modelid'] = new_id
    return df


# Number of distinct model types per model, and the list of all model types.
modelinfo_type, modeltype_lst = check_status_change(model_info)
print(modelinfo_type['modeltype_count'].value_counts())

# Start and end date of every model type, for the models that changed type.
modelinfo_type_updated = get__dates_of_changes(modelinfo_type, modeltype_lst)

# New model IDs, one per model and type period.
new_modid_toadd = create_new_modelid(modelinfo_type_updated)

# Re-key model_info, model_state and model_actions to the new IDs. Each call works on a copy;
# the state and actions tables get a 1 day tolerance on either side of each period.
model_info = update_with_new_modelid(new_modid_toadd, model_info.copy(), time_offset=0)
model_state = update_with_new_modelid(new_modid_toadd, model_state.copy(), time_offset=1)
model_actions = update_with_new_modelid(new_modid_toadd, model_actions.copy(), time_offset=1)


In [None]:
# Drop from model_state and model_actions every model ID that is not in model_info,
# printing the number of distinct model IDs per table before and after.
print(
    f"model_info: {len(model_info['modelid'].unique())}, model ids in model States: {len(model_state['modelid'].unique())}, model ids in model actions: {len(model_actions['modelid'].unique())}"
)

model_state = model_state[
    model_state["modelid"].isin(model_info["modelid"].unique().tolist())
]
model_actions = model_actions[
    model_actions["modelid"].isin(model_info["modelid"].unique().tolist())
]
print(
    f"model_info: {len(model_info['modelid'].unique())}, model ids in model States: {len(model_state['modelid'].unique())}, model ids in model actions: {len(model_actions['modelid'].unique())}"
)

In [None]:
# A small number of model IDs (about 15) remain in model_info without any model_state rows;
# drop them so the tables line up.
print(model_info.shape)
model_info = model_info[
    model_info["modelid"].isin(model_state["modelid"].unique().tolist())
]
print(model_info.shape)
print(
    f"Difference between model ID in model state vs model info: {len(model_state['modelid'].unique()) - len(model_info['modelid'].unique())}"
)

In [None]:
# Persist the cleaned tables.
# NOTE(review): model_cases is written without the "_cpy" suffix used for the other three
# files - confirm this is intentional.
model_info.to_pickle("mqi_data_clean/model_info_clean_cpy.pkl")
model_state.to_pickle("mqi_data_clean/model_state_clean_cpy.pkl")
model_actions.to_pickle("mqi_data_clean/model_actions_clean_cpy.pkl")
model_cases.to_pickle("mqi_data_clean/model_cases_clean.pkl")
