In [1]:
import numpy as np
import pandas as pd
import random as r
import itertools as it


In [None]:

def generate_dataset(params=None, simulation_params=None):
    """Simulate one dataset containing a planted subgroup.

    Parameters
    ----------
    params : sequence
        Indexed positionally: ``params[0]`` is passed on as ``dist``,
        ``params[1]`` as ``sd`` and ``params[2]`` as the number of literals
        in the true description.
    simulation_params : dict
        Read keys here: ``'N'``, ``'ncovs'``, ``'p'`` and ``'trend_name'``
        (the whole dict is also forwarded to the helpers).

    Returns
    -------
    tuple
        ``(dataset_ordered, attributes, descriptives, true_description)``.
    """
    sim = simulation_params

    # Binary covariates plus a baseline outcome drawn from N(10, 1).
    covariates, cov_names = sample_covs(N=sim['N'], ncovs=sim['ncovs'], p=sim['p'])
    baseline = covariates.copy()
    baseline[sim['trend_name']] = np.random.normal(loc=10.0, scale=1.0, size=sim['N'])

    # Pick which covariates define the subgroup, then overwrite the outcome
    # of the covered rows.
    true_description = find_true_description(nlits=params[2], ncovs=sim['ncovs'])
    with_subgroup = generate_subgroup(
        df=baseline,
        true_description=true_description,
        dist=params[0],
        sd=params[1],
        simulation_params=sim,
    )

    dataset_ordered, attributes, descriptives = define_attributes(
        dataset=with_subgroup,
        cov_names=cov_names,
        simulation_params=sim,
    )
    return dataset_ordered, attributes, descriptives, true_description


In [None]:

def define_attributes(dataset=None, cov_names=None, simulation_params=None):
    """Attach time point and id columns and build the attribute metadata.

    A random time point in ``1..simulation_params['tp']`` is drawn for each of
    ``simulation_params['N']`` rows, the rows are sorted by time point, and an
    ``id`` column numbered ``0..len-1`` in sorted order is added.

    The input frame is left untouched: the ``tp`` column is added to a new
    frame instead of being written into the caller's ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have exactly ``simulation_params['N']`` rows, otherwise the
        column assignment raises.
    cov_names : list
        Stored as the binary attributes in ``descriptives``.
    simulation_params : dict
        Read keys: ``'tp'``, ``'N'`` and ``'trend_name'``.

    Returns
    -------
    tuple
        ``(data_sorted, attributes, descriptives)``.
    """
    # Same draw as before (population and k unchanged) so seeded runs of the
    # `random` module still yield the same time points.
    time_points = r.choices(list(np.arange(1, simulation_params['tp'] + 1)), k=simulation_params['N'])

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    data_sorted = dataset.assign(tp=time_points).sort_values(['tp'], ascending=[True])
    data_sorted['id'] = np.arange(len(data_sorted))

    descriptives = {'num_atts': [], 'bin_atts': cov_names,
                    'nom_atts': [], 'ord_atts': []}
    attributes = {'time_attribute': ['tp'], 'skip_attributes': [],
                  'id_attribute': ['id'], 'outcome_attribute': [simulation_params['trend_name']]}

    return data_sorted, attributes, descriptives


In [None]:

def generate_subgroup(df=None, true_description=None, dist=None, sd=None, simulation_params=None):
    """Overwrite the outcome of rows covered by ``true_description``.

    A row is covered when every column named in ``true_description`` equals
    the value stored for it. Covered rows get a fresh outcome drawn from a
    normal distribution with mean ``10.0 + dist`` and standard deviation
    ``sd``; all other rows keep their outcome. Works on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
    true_description : dict
        Maps column name to required value.
    dist : float
        Shift of the subgroup mean relative to 10.0.
    sd : float
        Standard deviation of the subgroup outcome.
    simulation_params : dict
        Only ``'trend_name'`` (the outcome column) is read.

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    planted = df.copy()

    # Row-wise test: all literal columns match their target value.
    literal_columns = list(true_description.keys())
    targets = pd.Series(true_description)
    covered = (planted[literal_columns] == targets).all(axis=1)

    n_covered = np.sum(covered)
    new_outcomes = np.random.normal(loc=10.0 + dist, scale=sd, size=n_covered)
    planted.loc[covered, simulation_params['trend_name']] = new_outcomes

    return planted



In [None]:
def sample_covs(N=None, ncovs=None, p=None):
    """Draw ``ncovs`` independent Bernoulli(``p``) covariates of length ``N``.

    Columns are named ``'x1'`` .. ``'x<ncovs>'`` and are drawn in that order.

    Returns
    -------
    tuple
        ``(covs, cov_names)``: the DataFrame of draws and its column names.
    """
    cov_names = [f'x{k}' for k in np.arange(1, ncovs + 1)]

    # Dict comprehension evaluates in name order, so the RNG is consumed
    # column by column exactly as a sequential loop would.
    covs = pd.DataFrame({
        name: np.random.binomial(n=1, p=p, size=N)
        for name in cov_names
    })

    return covs, cov_names



In [None]:
def find_true_description(nlits=None, ncovs=None):
    """Randomly pick ``nlits`` of the ``ncovs`` covariates as the true description.

    Covariate numbers are sampled without replacement from ``1..ncovs`` using
    the ``random`` module; each chosen covariate ``k`` becomes the literal
    ``'x<k>' == 1``.

    Returns
    -------
    dict
        Maps each selected column name to the value ``1``.
    """
    candidates = list(np.arange(1, ncovs + 1))
    chosen = r.sample(candidates, nlits)
    return {'x' + str(number): 1 for number in chosen}



In [None]:
def process_result(result_emm=None, true_description=None):
    """Look up the planted description in a result table and report how it scored.

    Parameters
    ----------
    result_emm :
        Presumably a pandas DataFrame whose row index has outer labels
        'description' and 'qualities' and whose description columns start
        with 'x' -- TODO confirm against the producer of this table.
        NOTE: columns missing for literals of the true description are added
        to this object in place (filled with NaN).
    true_description : dict
        Its keys are the column names of the planted literals.

    Returns
    -------
    dict
        Keys 'quality_value', 'rank' and 'size'. When a matching row exists
        they come from the first match's 'qm_value', 'sg' (plus 1) and
        'sg_size'; otherwise they are 0, 51 and 0.
    """

    # NOTE(review): `vars` shadows the Python builtin of the same name.
    vars = list(true_description.keys())
    # Make sure every literal of the true description exists as a column.
    for var in vars:
        if var not in result_emm:
            result_emm[var] = np.nan

    # Column labels that start with 'x' are treated as covariate columns.
    cols = result_emm.dtypes.index
    covs = cols[cols.str.startswith('x')]
    descriptions = result_emm.loc['description', covs]
    descriptions.reset_index(drop=True,inplace=True)
    # 999 is the sentinel for "covariate not used in this description".
    # NOTE(review): in-place fill on a .loc selection; whether this also
    # touches result_emm depends on pandas copy semantics -- confirm.
    descriptions.fillna(value=999,inplace=True)

    # Expected row: sentinel everywhere, except the planted literals.
    all_covs = {k: 999 for k in covs}
    for lit in vars:
        # NOTE(review): the expected value is the one-element list [1], not
        # the scalar 1; a match therefore depends on how description cells
        # are encoded in result_emm -- confirm.
        all_covs[lit] = [1]
    
    
    # Element-wise comparison of every description row with the expected row.
    equal = descriptions.apply(lambda row: row == pd.Series(all_covs), axis=1)

    quals = result_emm.loc['qualities', :] 
    quals.reset_index(drop=True,inplace=True)
    # Keep only the quality rows whose description matched in every column.
    sel_qual = quals.loc[equal.all(axis=1)]

    result = {}
    if len(sel_qual) > 0:    
        # First matching row wins; 'sg' is shifted by one to form the rank.
        result['quality_value'] = sel_qual['qm_value'].values[0]
        result['rank'] = sel_qual['sg'].values[0] + 1
        result['size'] = sel_qual['sg_size'].values[0]
    else:
        # Not found. NOTE(review): 51 presumably means "one past a top-50
        # result list" -- confirm.
        result['quality_value'] = 0
        result['rank'] = 51
        result['size'] = 0

    return result

In [21]:
import numpy as np
import pandas as pd
import random as r

def generate_machine_failure_dataset(simulation_params=None):
    """Build the combined dataset for all simulated machines.

    One frame is generated per machine id in
    ``1..simulation_params['num_machines']`` and the frames are concatenated
    in that order with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
    """
    machine_ids = range(1, simulation_params['num_machines'] + 1)
    per_machine_frames = [
        generate_machine_data(machine_id, simulation_params)
        for machine_id in machine_ids
    ]
    return pd.concat(per_machine_frames, ignore_index=True)

def generate_machine_data(machine_id, simulation_params):
    """Generate the hourly sensor series and failure labels for one machine.

    The series has ``simulation_params['num_timepoints']`` hourly rows
    starting at 2023-01-01 (8760 rows corresponds to one year).

    Parameters
    ----------
    machine_id : int
        1-based id; also used to look up the machine's age in
        ``simulation_params['machine_age'][machine_id - 1]``.
    simulation_params : dict
        Read keys: ``'num_timepoints'`` and ``'machine_age'``.

    Returns
    -------
    pandas.DataFrame
        One row per hour with the sensor readings, the constant
        ``machine_id`` / ``machine_age`` and the seeded ``failure`` flag
        (float 0.0 / 1.0).
    """
    num_timepoints = simulation_params['num_timepoints']

    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
    # deprecated since pandas 2.2 and emits a FutureWarning there.
    timestamp = pd.date_range(start='2023-01-01', periods=num_timepoints, freq='h')

    # Continuous variables. The draw order is kept fixed so that a seeded
    # run keeps producing the same values.
    operating_temperature = np.random.normal(loc=70, scale=10, size=num_timepoints)
    vibration_level = np.random.normal(loc=15, scale=5, size=num_timepoints)
    power_consumption = np.random.normal(loc=50, scale=5, size=num_timepoints)
    pressure = np.random.normal(loc=120, scale=20, size=num_timepoints)
    utilization_rate = np.random.uniform(50, 100, size=num_timepoints)
    ambient_temperature = np.random.normal(loc=25, scale=5, size=num_timepoints)
    # NOTE(review): load never exceeds 1.0 here, while the machine 4 rule in
    # seed_failure_patterns requires load > 1.5, so that rule cannot fire.
    load = np.random.uniform(0.5, 1.0, size=num_timepoints)
    machine_age = simulation_params['machine_age'][machine_id - 1]  # Different age for each machine

    # Categorical variable: 'Yes' with probability 0.8, 'No' with 0.2.
    maintenance_schedule = np.random.choice(['Yes', 'No'], size=num_timepoints, p=[0.8, 0.2])

    # Start with no failures, then let the pattern rules flag hours.
    failure = np.zeros(num_timepoints)
    failure = seed_failure_patterns(failure, machine_id, machine_age, maintenance_schedule,
                                    ambient_temperature, load, operating_temperature,
                                    vibration_level, power_consumption, pressure,
                                    utilization_rate, simulation_params)

    # Scalars (machine_id, machine_age) are broadcast to every row.
    df = pd.DataFrame({
        'timestamp': timestamp,
        'machine_id': machine_id,
        'operating_temperature': operating_temperature,
        'vibration_level': vibration_level,
        'power_consumption': power_consumption,
        'pressure': pressure,
        'utilization_rate': utilization_rate,
        'maintenance_schedule': maintenance_schedule,
        'ambient_temperature': ambient_temperature,
        'load': load,
        'machine_age': machine_age,
        'failure': failure
    })

    return df

def _consecutive_window_starts(indices, length):
    """Return positions ``i`` where ``indices[i:i+length]`` are consecutive integers.

    ``indices`` must be sorted and strictly increasing (as returned by
    ``np.where``), so a window is consecutive exactly when its last element
    minus its first equals ``length - 1``. The final full window
    (``i == len(indices) - length``) is included.
    """
    n_windows = len(indices) - length + 1  # <= 0 when there are too few hits
    return [i for i in range(n_windows)
            if indices[i + length - 1] - indices[i] == length - 1]


def _hit_set(condition):
    """Return the set of integer positions where the boolean array is True."""
    return set(np.where(condition)[0].tolist())


def _any_hit(hits, start, span):
    """Return True when any hour in ``[start, start + span)`` is in ``hits``."""
    return any(hour in hits for hour in range(start, start + span))


def _power_swing_hours(power_consumption):
    """Return hours ``t`` where |power[t+1] - power[t]| exceeds 10% of power[t]."""
    power_diff = np.abs(np.diff(power_consumption))
    return _hit_set(power_diff > 0.1 * power_consumption[:-1])


def seed_failure_patterns(failure, machine_id, machine_age, maintenance_schedule, ambient_temperature, load, operating_temperature, vibration_level, power_consumption, pressure, utilization_rate, simulation_params):
    """Flag failure hours in ``failure`` according to per-machine rules.

    ``failure`` is modified in place and also returned. Rules as coded:

    * machine 1: operating_temperature > 80 for 10 consecutive hours; the
      10th hour is flagged.
    * machine 2: hour-to-hour vibration increase > 20 at hour ``i`` and a
      > 10% power swing at some hour in ``[i-24, i]``; hour ``i + 6`` is
      flagged when it exists.
    * machine 3: pressure > 150 for 12 consecutive hours, with at least one
      utilization_rate > 90 hour in the first 8 and at least one
      maintenance == 'No' hour in the first 4; the 12th hour is flagged.
    * machine 4: ambient_temperature > 35 for 3 consecutive hours, with at
      least one load > 1.5 hour in the first 2 and at least one
      operating_temperature > 80 hour in the 10 hours from the run start;
      the 3rd hour is flagged.
    * machine 5: the machine 3 rule plus at least one > 10% power swing in
      the 24 hours and one operating_temperature > 80 hour in the 10 hours
      from the run start.
    * any machine with machine_age > 30: operating_temperature > 100 for 5
      consecutive hours; the 5th hour is flagged.

    The number of windows matched by the machine-specific rule (machines 1-5
    only) and then by the age rule is printed, one integer per line.

    NOTE(review): the secondary conditions use ``any`` over the look-ahead
    window, although the rule descriptions read "for N hours"; kept as is.

    ``simulation_params`` is accepted for interface compatibility and is not
    read.
    """
    if machine_id == 1:
        hot = np.where(operating_temperature > 80)[0]
        pattern1 = 0
        for i in _consecutive_window_starts(hot, 10):
            failure[hot[i + 9]] = 1
            pattern1 += 1
        print(pattern1)

    elif machine_id == 2:
        vib_jump_hours = np.where(np.diff(vibration_level) > 20)[0]
        power_swings = _power_swing_hours(power_consumption)
        pattern2 = 0
        for i in vib_jump_hours:
            # Look back one day (inclusive of hour i) for a power swing.
            if i + 6 < len(failure) and _any_hit(power_swings, int(i) - 24, 25):
                failure[i + 6] = 1
                pattern2 += 1
        print(pattern2)

    elif machine_id == 3:
        high_pressure = np.where(pressure > 150)[0]
        high_util = _hit_set(utilization_rate > 90)
        no_maintenance = _hit_set(maintenance_schedule == 'No')
        pattern3 = 0
        for i in _consecutive_window_starts(high_pressure, 12):
            start = int(high_pressure[i])
            if _any_hit(high_util, start, 8) and _any_hit(no_maintenance, start, 4):
                failure[high_pressure[i + 11]] = 1
                pattern3 += 1
        print(pattern3)

    elif machine_id == 4:
        hot_ambient = np.where(ambient_temperature > 35)[0]
        high_load = _hit_set(load > 1.5)
        hot = _hit_set(operating_temperature > 80)
        pattern4 = 0
        for i in _consecutive_window_starts(hot_ambient, 3):
            start = int(hot_ambient[i])
            if _any_hit(high_load, start, 2) and _any_hit(hot, start, 10):
                failure[hot_ambient[i + 2]] = 1
                pattern4 += 1
        print(pattern4)

    elif machine_id == 5:
        high_pressure = np.where(pressure > 150)[0]
        power_swings = _power_swing_hours(power_consumption)
        hot = _hit_set(operating_temperature > 80)
        high_util = _hit_set(utilization_rate > 90)
        no_maintenance = _hit_set(maintenance_schedule == 'No')
        pattern5 = 0
        for i in _consecutive_window_starts(high_pressure, 12):
            start = int(high_pressure[i])
            if (_any_hit(power_swings, start, 24)
                    and _any_hit(hot, start, 10)
                    and _any_hit(high_util, start, 8)
                    and _any_hit(no_maintenance, start, 4)):
                failure[high_pressure[i + 11]] = 1
                pattern5 += 1
        print(pattern5)

    # Age rule, applied to every machine on top of its specific rule.
    pattern6 = 0
    if machine_age > 30:
        very_hot = np.where(operating_temperature > 100)[0]
        for i in _consecutive_window_starts(very_hot, 5):
            failure[very_hot[i + 4]] = 1
            pattern6 += 1
    print(pattern6)
    return failure

# Simulation parameters
simulation_params = {
    'num_machines': 5,               # Number of machines
    'num_timepoints': 8760,          # One year of hourly data (365 days * 24 hours)
    'machine_age': [12, 24, 36, 48, 60],  # Machine ages in months
    'seed': 42,                      # Seed for NumPy's global RNG
}

# All machine data is drawn from np.random, so seeding it here makes
# Restart Kernel -> Run All regenerate the same dataset every time.
np.random.seed(simulation_params['seed'])

# Generate the dataset
dataset = generate_machine_failure_dataset(simulation_params)


0
0
23
0
0
0
0
0
0
0


In [22]:
# Total of the 'failure' flags (1.0 per failure hour) over all machines
failure_column = dataset.loc[:, 'failure']
failure_count = failure_column.sum()
print(f"Total number of failures in the dataset: {failure_count}")


Total number of failures in the dataset: 23.0
