In [1]:
import numpy as np
import pandas as pd
import random as r
import itertools as it


In [3]:

def generate_dataset(params=None, simulation_params=None):
    """Simulate one dataset that contains a single planted subgroup.

    Parameters
    ----------
    params : sequence
        ``params[0]`` is forwarded as ``dist`` and ``params[1]`` as ``sd`` to
        ``generate_subgroup``; ``params[2]`` is the number of literals passed
        to ``find_true_description``.
    simulation_params : dict
        Must provide 'N', 'ncovs', 'p' and 'trend_name'. It is also forwarded
        unchanged to ``generate_subgroup`` and ``define_attributes``.

    Returns
    -------
    tuple
        ``(dataset_ordered, attributes, descriptives, true_description)``.
    """
    n_rows = simulation_params['N']
    n_covs = simulation_params['ncovs']

    # Binary covariates x1..x<ncovs>
    covariates, cov_names = sample_covs(N=n_rows, ncovs=n_covs, p=simulation_params['p'])

    # Baseline outcome for every row: N(10, 1)
    frame = covariates.copy()
    frame[simulation_params['trend_name']] = np.random.normal(loc=10.0, scale=1.0, size=n_rows)

    # Pick which covariates define the planted subgroup
    true_description = find_true_description(nlits=params[2], ncovs=n_covs)

    # Overwrite the outcome inside the subgroup, then attach time/id bookkeeping
    with_subgroup = generate_subgroup(
        df=frame,
        true_description=true_description,
        dist=params[0],
        sd=params[1],
        simulation_params=simulation_params,
    )
    dataset_ordered, attributes, descriptives = define_attributes(
        dataset=with_subgroup,
        cov_names=cov_names,
        simulation_params=simulation_params,
    )

    return dataset_ordered, attributes, descriptives, true_description


In [4]:

def define_attributes(dataset=None, cov_names=None, simulation_params=None):
    """Attach a random time point and a row id, and describe the column roles.

    A 'tp' column is written onto ``dataset`` itself (the input is modified),
    with values drawn uniformly from 1..simulation_params['tp'] using the
    ``random`` module. The rows are then sorted by 'tp' and numbered 0..n-1
    in an 'id' column on the sorted copy.

    Returns
    -------
    tuple
        ``(data_sorted, attributes, descriptives)`` where ``attributes`` names
        the time, id and outcome columns and ``descriptives`` lists
        ``cov_names`` as the binary attributes.
    """
    candidate_timepoints = list(np.arange(1, simulation_params['tp'] + 1))
    dataset['tp'] = r.choices(candidate_timepoints, k=simulation_params['N'])

    # Order by time point, then number the rows in that order
    data_sorted = dataset.sort_values('tp')
    data_sorted['id'] = np.arange(len(data_sorted))

    descriptives = {
        'num_atts': [],
        'bin_atts': cov_names,
        'nom_atts': [],
        'ord_atts': [],
    }
    attributes = {
        'time_attribute': ['tp'],
        'skip_attributes': [],
        'id_attribute': ['id'],
        'outcome_attribute': [simulation_params['trend_name']],
    }

    return data_sorted, attributes, descriptives


In [5]:

def generate_subgroup(df=None, true_description=None, dist=None, sd=None, simulation_params=None):
    """Return a copy of ``df`` whose outcome is re-drawn inside the true subgroup.

    A row belongs to the subgroup when every column named in
    ``true_description`` equals the value stored for it. For those rows the
    column ``simulation_params['trend_name']`` is replaced by draws from
    N(10 + dist, sd). The input frame is left untouched.
    """
    result = df.copy()

    # Rows matching every literal of the description
    literal_columns = list(true_description.keys())
    in_subgroup = (result[literal_columns] == pd.Series(true_description)).all(axis=1)

    subgroup_size = int(in_subgroup.sum())
    result.loc[in_subgroup, simulation_params['trend_name']] = np.random.normal(
        loc=10.0 + dist, scale=sd, size=subgroup_size
    )

    return result



In [6]:
def sample_covs(N=None, ncovs=None, p=None):
    """Draw ``ncovs`` independent Bernoulli(p) covariates of length ``N``.

    Returns
    -------
    tuple
        ``(covs, cov_names)``: a DataFrame with columns 'x1'..'x<ncovs>' and
        the list of those column names.
    """
    cov_names = [f'x{k}' for k in np.arange(1, ncovs + 1)]

    # One binomial(n=1) draw per column, in column order
    covs = pd.DataFrame()
    for name in cov_names:
        covs[name] = np.random.binomial(n=1, p=p, size=N)

    return covs, cov_names



In [7]:
def find_true_description(nlits=None, ncovs=None):
    """Pick ``nlits`` distinct covariates at random and require each to equal 1.

    Covariates are chosen without replacement from x1..x<ncovs> using the
    ``random`` module.

    Returns
    -------
    dict
        Maps each chosen covariate name (e.g. 'x3') to the value 1.
    """
    candidate_ids = list(np.arange(1, ncovs + 1))
    chosen_ids = r.sample(candidate_ids, nlits)
    return {'x' + str(idx): 1 for idx in chosen_ids}



In [8]:
def process_result(result_emm=None, true_description=None):
    """Look up the true description in a result frame and report its quality, rank and size.

    Parameters
    ----------
    result_emm : pandas.DataFrame
        Result frame indexed so that ``.loc['description', ...]`` and
        ``.loc['qualities', ...]`` each select a block of rows.
        NOTE(review): presumably a two-level row index with one row per found
        subgroup under each label - confirm against the code that builds it.
        This frame is modified: a NaN column is added for every key of
        ``true_description`` that is not already a column.
    true_description : dict
        Keys are the covariate names that make up the true subgroup; only the
        keys are used here.

    Returns
    -------
    dict
        Keys 'quality_value', 'rank' and 'size'. When no description row
        matches, the values are 0, 51 and 0.
    """

    #print(true_description)
    # NOTE(review): `vars` shadows the Python builtin of the same name inside this function.
    vars = list(true_description.keys())
    # Make sure every true-description covariate exists as a column
    for var in vars:
        if var not in result_emm:
            result_emm[var] = np.nan

    # Covariate columns are identified by a label starting with 'x'
    cols = result_emm.dtypes.index
    covs = cols[cols.str.startswith('x')]
    descriptions = result_emm.loc['description', covs]
    descriptions.reset_index(drop=True,inplace=True)
    # 999 is the sentinel for "covariate not part of this description"
    descriptions.fillna(value=999,inplace=True)
    #stack = descriptions.stack()
    #stack[pd.isnull(stack)] = 999
    #descriptions = stack.unstack()
    #descriptions[pd.isnull(descriptions)] = 999
    #print(descriptions)

    # Target row: sentinel everywhere except the true-description covariates.
    # NOTE(review): the target value is the list [1], not the scalar 1 - presumably
    # the result frame stores literals as single-element lists; TODO confirm.
    all_covs = {k: 999 for k in covs}
    for lit in vars:
        all_covs[lit] = [1]
    
    
    # Cell-by-cell comparison of every description row against the target row
    equal = descriptions.apply(lambda row: row == pd.Series(all_covs), axis=1)

    quals = result_emm.loc['qualities', :] 
    quals.reset_index(drop=True,inplace=True)
    # Keep the quality rows whose description matches the target in every covariate
    sel_qual = quals.loc[equal.all(axis=1)]

    result = {}
    if len(sel_qual) > 0:    
        # Report the first matching row; 'sg' is shifted by one to give the rank
        result['quality_value'] = sel_qual['qm_value'].values[0]
        result['rank'] = sel_qual['sg'].values[0] + 1
        result['size'] = sel_qual['sg_size'].values[0]
    else:
        # Not found. NOTE(review): 51 is presumably one past a top-50 result list - confirm.
        result['quality_value'] = 0
        result['rank'] = 51
        result['size'] = 0

    return result

In [11]:
import numpy as np
import pandas as pd
import random as r

def generate_machine_failure_dataset(simulation_params=None):
    """Build the combined time-series dataset for all simulated machines.

    Seeds NumPy's global random generator with 0, generates one frame per
    machine id in 1..simulation_params['num_machines'] via
    ``generate_machine_data`` and stacks them with a fresh integer index.
    """
    # Fixed global seed so repeated runs give the same dataset
    np.random.seed(0)

    machine_ids = range(1, simulation_params['num_machines'] + 1)
    per_machine_frames = [
        generate_machine_data(machine_id, simulation_params)
        for machine_id in machine_ids
    ]

    return pd.concat(per_machine_frames, ignore_index=True)

def generate_machine_data(machine_id, simulation_params):
    """Generate the hourly sensor readings and failure flags for one machine.

    Parameters
    ----------
    machine_id : int
        1-based machine number; also used to look up the machine's age.
    simulation_params : dict
        Reads 'num_timepoints' (number of hourly rows) and 'machine_age'
        (sequence indexed by ``machine_id - 1``). Forwarded to
        ``seed_failure_patterns``.

    Returns
    -------
    pandas.DataFrame
        One row per hour starting at 2023-01-01, with the sensor columns,
        the constant 'machine_id' and 'machine_age', and the 'failure' flag
        produced by ``seed_failure_patterns``.
    """
    num_timepoints = simulation_params['num_timepoints']

    # Hourly timestamps. Lowercase 'h' is used because the uppercase 'H'
    # alias is deprecated in recent pandas and emits a FutureWarning.
    timestamp = pd.date_range(start='2023-01-01', periods=num_timepoints, freq='h')

    # Continuous variables (drawn in this fixed order from the global NumPy generator)
    operating_temperature = np.random.normal(loc=70, scale=10, size=num_timepoints)
    vibration_level = np.random.normal(loc=15, scale=5, size=num_timepoints)
    power_consumption = np.random.normal(loc=50, scale=5, size=num_timepoints)
    pressure = np.random.normal(loc=120, scale=20, size=num_timepoints)
    utilization_rate = np.random.uniform(50, 100, size=num_timepoints)
    ambient_temperature = np.random.normal(loc=25, scale=5, size=num_timepoints)
    load = np.random.uniform(0.5, 1.0, size=num_timepoints)
    machine_age = simulation_params['machine_age'][machine_id - 1]  # one age per machine

    # Categorical variable: 'Yes' with probability 0.8, 'No' with probability 0.2
    maintenance_scheduled = np.random.choice(['Yes', 'No'], size=num_timepoints, p=[0.8, 0.2])

    # Start with no failures, then let the pattern rules mark failure hours
    failure = np.zeros(num_timepoints)
    failure = seed_failure_patterns(
        failure=failure,
        machine_id=machine_id,
        machine_age=machine_age,
        maintenance_scheduled=maintenance_scheduled,
        ambient_temperature=ambient_temperature,
        load=load,
        operating_temperature=operating_temperature,
        vibration_level=vibration_level,
        power_consumption=power_consumption,
        pressure=pressure,
        utilization_rate=utilization_rate,
        simulation_params=simulation_params,
    )

    # Assemble the per-machine frame; scalars are broadcast to every row
    df = pd.DataFrame({
        'timestamp': timestamp,
        'machine_id': machine_id,
        'operating_temperature': operating_temperature,
        'vibration_level': vibration_level,
        'power_consumption': power_consumption,
        'pressure': pressure,
        'utilization_rate': utilization_rate,
        'maintenance_scheduled': maintenance_scheduled,
        'ambient_temperature': ambient_temperature,
        'load': load,
        'machine_age': machine_age,
        'failure': failure
    })

    return df

def _consecutive_run_starts(sorted_indices, run_length):
    """Return every position p where sorted_indices[p:p+run_length] is a run of consecutive integers.

    ``sorted_indices`` must be strictly increasing (as produced by
    ``np.where(...)[0]``), so a window is consecutive exactly when its last
    and first entries differ by ``run_length - 1``. The final full window is
    included; an input shorter than ``run_length`` yields an empty list.
    """
    last_start = len(sorted_indices) - run_length
    return [
        p for p in range(last_start + 1)
        if sorted_indices[p + run_length - 1] - sorted_indices[p] == run_length - 1
    ]


def _any_in_window(index_set, start, length):
    """Return True if any hour in start .. start+length-1 is a member of index_set."""
    return any(hour in index_set for hour in range(start, start + length))


def seed_failure_patterns(failure,machine_id,machine_age,maintenance_scheduled,ambient_temperature,load, operating_temperature, vibration_level, power_consumption, pressure, utilization_rate, simulation_params):
    """Mark failure hours wherever the generated readings already match a failure pattern.

    ``failure`` is modified in place and also returned. Machine ids 1-5 each
    have their own pattern; a sixth, age-based pattern is checked for every
    machine. The number of hits is printed for the machine-specific pattern
    (ids 1-5 only) and then for the age-based pattern.

    In the multi-condition patterns, the primary signal must hold for a full
    run of consecutive hours, while each secondary signal only has to occur
    at least once inside its window counted from the start of that run.

    Parameters
    ----------
    failure : numpy.ndarray
        Per-hour failure flags; matching hours are set to 1.
    machine_id : int
        Selects the machine-specific pattern.
    machine_age : number
        The age-based pattern applies when this is greater than 30.
    maintenance_scheduled, ambient_temperature, load, operating_temperature,
    vibration_level, power_consumption, pressure, utilization_rate : numpy.ndarray
        Per-hour readings, all of the same length as ``failure``.
    simulation_params : dict
        Accepted for interface compatibility; not read here.

    Returns
    -------
    numpy.ndarray
        The same ``failure`` array that was passed in.
    """
    def _index_set(condition):
        # Hours where the condition holds, as a set for O(1) membership tests
        return set(np.where(condition)[0].tolist())

    if machine_id == 1:
        # Machine 1: operating_temperature > 80 for 10 consecutive hours;
        # the failure is placed on the last hour of each such window.
        hot = np.where(operating_temperature > 80)[0]
        pattern1 = 0
        for p in _consecutive_run_starts(hot, 10):
            failure[hot[p + 9]] = 1
            pattern1 += 1
        print(pattern1)

    elif machine_id == 2:
        # Machine 2: hour-over-hour vibration increase > 20, combined with a
        # power_consumption change of more than 10% somewhere in the preceding
        # 24 hours; the failure is placed 6 hours after the vibration jump.
        vib_diff = np.diff(vibration_level)
        vib_fail_indices = np.where(vib_diff > 20)[0]
        power_diff = np.abs(np.diff(power_consumption))
        power_fail = _index_set(power_diff > 0.1 * power_consumption[:-1])
        pattern2 = 0
        for i in vib_fail_indices:
            if i + 6 < len(failure) and _any_in_window(power_fail, i - 24, 25):
                failure[i + 6] = 1
                pattern2 += 1
        print(pattern2)

    elif machine_id == 3:
        # Machine 3: pressure > 150 for 12 consecutive hours, with
        # utilization_rate > 90 at least once in the first 8 hours and
        # maintenance == 'No' at least once in the first 4 hours.
        high_pressure = np.where(pressure > 150)[0]
        high_util = _index_set(utilization_rate > 90)
        no_maintenance = _index_set(maintenance_scheduled == 'No')
        pattern3 = 0
        for p in _consecutive_run_starts(high_pressure, 12):
            start = high_pressure[p]
            if _any_in_window(high_util, start, 8) and _any_in_window(no_maintenance, start, 4):
                failure[high_pressure[p + 11]] = 1
                pattern3 += 1
        print(pattern3)

    elif machine_id == 4:
        # Machine 4: ambient_temperature > 35 for 3 consecutive hours, with
        # load > 1.5 at least once in the first 2 hours and
        # operating_temperature > 80 at least once in the first 10 hours.
        hot_ambient = np.where(ambient_temperature > 35)[0]
        heavy_load = _index_set(load > 1.5)
        hot_operating = _index_set(operating_temperature > 80)
        pattern4 = 0
        for p in _consecutive_run_starts(hot_ambient, 3):
            start = hot_ambient[p]
            if _any_in_window(heavy_load, start, 2) and _any_in_window(hot_operating, start, 10):
                failure[hot_ambient[p + 2]] = 1
                pattern4 += 1
        print(pattern4)

    elif machine_id == 5:
        # Machine 5: pressure > 150 for 12 consecutive hours, with a > 10%
        # power change within 24 hours, operating_temperature > 80 within 10
        # hours, utilization_rate > 90 within 8 hours and maintenance == 'No'
        # within 4 hours (each at least once, counted from the run start).
        high_pressure = np.where(pressure > 150)[0]
        power_diff = np.abs(np.diff(power_consumption))
        power_fail = _index_set(power_diff > 0.1 * power_consumption[:-1])
        hot_operating = _index_set(operating_temperature > 80)
        high_util = _index_set(utilization_rate > 90)
        no_maintenance = _index_set(maintenance_scheduled == 'No')
        pattern5 = 0
        for p in _consecutive_run_starts(high_pressure, 12):
            start = high_pressure[p]
            if _any_in_window(power_fail, start, 24) and \
               _any_in_window(hot_operating, start, 10) and \
               _any_in_window(high_util, start, 8) and \
               _any_in_window(no_maintenance, start, 4):
                failure[high_pressure[p + 11]] = 1
                pattern5 += 1
        print(pattern5)

    # All machines: machine_age > 30 and operating_temperature > 100 for 5
    # consecutive hours; the failure is placed on the last hour of the window.
    pattern6 = 0
    if machine_age > 30:
        extreme = np.where(operating_temperature > 100)[0]
        for p in _consecutive_run_starts(extreme, 5):
            failure[extreme[p + 4]] = 1
            pattern6 += 1
    print(pattern6)
    return failure

# Configuration of the simulated fleet
simulation_params = dict(
    num_machines=5,                    # how many machines to simulate
    num_timepoints=8760,               # hourly rows per machine: 365 days * 24 hours
    machine_age=[12, 24, 36, 48, 60],  # age of each machine in months
)
# Build the dataset from the configuration above
dataset = generate_machine_failure_dataset(simulation_params)


0
0
22
0
0
0
0
0
0
0


  timestamp = pd.date_range(start='2023-01-01', periods=num_timepoints, freq='H')


In [12]:
# Total number of failure rows across all machines: 'failure' holds 0/1 flags,
# so summing the column counts the rows where failure == 1.
failure_count = dataset['failure'].sum()
print(f"Total number of failures in the dataset: {failure_count}")


Total number of failures in the dataset: 22.0


In [14]:
# Save the dataset as a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
dataset.to_csv('random_gen_dataset.csv', index=False)

In [11]:
# Inject failure patterns into the dataset
import numpy as np
import pandas as pd

def generate_machine_failure_dataset(simulation_params=None):
    """Build the combined dataset with failure patterns injected per machine.

    Seeds NumPy's global random generator with 42. For each machine id in
    1..simulation_params['num_machines'] the base readings are generated with
    ``generate_machine_data`` and then modified by ``inject_failure_patterns``.
    The per-machine frames are stacked with a fresh integer index.
    """
    # Fixed global seed so repeated runs give the same dataset
    np.random.seed(42)

    machine_ids = range(1, simulation_params['num_machines'] + 1)
    per_machine_frames = [
        inject_failure_patterns(
            generate_machine_data(machine_id, simulation_params),
            machine_id,
            simulation_params,
        )
        for machine_id in machine_ids
    ]

    return pd.concat(per_machine_frames, ignore_index=True)

def inject_failure_patterns(df, machine_id, simulation_params):
    """Inject failure patterns into one machine's data by overwriting readings.

    For each injected pattern a random start row is chosen, the relevant
    sensor columns are overwritten over a short window of rows, and the
    'failure' flag is set to 1 on one row of that window. Each window is
    filled with a single random value (one draw per window, not per row).
    Start rows are sampled without replacement, but windows may still overlap
    each other and the age-based pattern may overwrite a machine-specific one.

    ``df`` is modified in place and also returned. Per-pattern counts and
    their total are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for a single machine with a default 0..n-1 row index. Must
        contain 'failure' plus the sensor columns touched by the patterns
        that apply to this machine.
    machine_id : int
        1-based machine number; selects the machine-specific pattern (1-5)
        and the entry of ``simulation_params['machine_age']``.
    simulation_params : dict
        Reads 'machine_age' (sequence indexed by ``machine_id - 1``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    pattern_count = [0, 0, 0, 0, 0, 0, 0]  # index k counts pattern k; slot 0 is unused
    num_timepoints = len(df)

    if machine_id == 1:
        # Pattern 1: operating_temperature > 75°C for 6 consecutive hours
        start_indices = np.random.choice(range(num_timepoints - 6), size=25, replace=False)
        for start in start_indices:
            df.loc[start:start+5, 'operating_temperature'] = np.random.uniform(76, 85)  # 6 rows (label slice is inclusive)
            df.loc[start+5, 'failure'] = 1
            pattern_count[1] += 1

    elif machine_id == 2:
        # Pattern 2: vibration_level > 18 mm/s & power_consumption fluctuates > 10% within a day
        start_indices = np.random.choice(range(num_timepoints - 24), size=30, replace=False)
        for start in start_indices:
            df.loc[start:start+23, 'vibration_level'] = np.random.uniform(19, 25)  # 24 rows above 18 mm/s
            fluctuation_indices = np.random.choice(range(start, start + 24), size=5, replace=False)
            df.loc[fluctuation_indices, 'power_consumption'] *= 1.15  # +15% on 5 random rows of the window
            df.loc[start+23, 'failure'] = 1
            pattern_count[2] += 1

    elif machine_id == 3:
        # Pattern 3: pressure > 140 PSI for 8 hours & utilization_rate > 85% & maintenance = No for 4 hours
        start_indices = np.random.choice(range(num_timepoints - 8), size=20, replace=False)
        for start in start_indices:
            df.loc[start:start+7, 'pressure'] = np.random.uniform(141, 150)
            df.loc[start:start+7, 'utilization_rate'] = np.random.uniform(86, 95)
            df.loc[start:start+3, 'maintenance_schedule'] = 'No'
            df.loc[start+7, 'failure'] = 1
            pattern_count[3] += 1

    elif machine_id == 4:
        # Pattern 4: ambient_temperature > 33°C for 2 hours & load > 1.2 & operating_temperature > 75°C
        start_indices = np.random.choice(range(num_timepoints - 10), size=15, replace=False)
        for start in start_indices:
            df.loc[start:start+1, 'ambient_temperature'] = np.random.uniform(33, 35)
            df.loc[start:start+1, 'load'] = np.random.uniform(1.2, 1.5)
            df.loc[start:start+9, 'operating_temperature'] = np.random.uniform(76, 85)
            df.loc[start+1, 'failure'] = 1
            pattern_count[4] += 1

    elif machine_id == 5:
        # Pattern 5: pressure > 140 PSI for 8 hours combined with raised power,
        # operating temperature, utilization and no maintenance
        start_indices = np.random.choice(range(num_timepoints - 8), size=10, replace=False)
        for start in start_indices:
            df.loc[start:start+7, 'pressure'] = np.random.uniform(141, 150)
            df.loc[start:start+7, 'power_consumption'] *= 1.2  # +20% over the whole window
            df.loc[start:start+9, 'operating_temperature'] = np.random.uniform(76, 85)
            df.loc[start:start+7, 'utilization_rate'] = np.random.uniform(86, 95)
            df.loc[start:start+3, 'maintenance_schedule'] = 'No'
            df.loc[start+7, 'failure'] = 1
            pattern_count[5] += 1

    # Pattern 6 (common to all machines): machine_age > 30 months & operating_temperature > 100°C for 5 hours
    if simulation_params['machine_age'][machine_id - 1] > 30:
        start_indices = np.random.choice(range(num_timepoints - 5), size=10, replace=False)
        for start in start_indices:
            # Bounds must be (low, high): the previous (101, 85) produced values
            # between 85 and 101, i.e. mostly below the 100°C threshold.
            df.loc[start:start+4, 'operating_temperature'] = np.random.uniform(101, 110)
            df.loc[start+4, 'failure'] = 1
            pattern_count[6] += 1

    # print(f"Machine {machine_id} - Total Patterns Injected: {pattern_count}")
    for i in range(1, 7):
        print(f"Pattern {i}: {pattern_count[i]}")
    print("Total Patterns Injected: ", sum(pattern_count))

    return df

def generate_machine_data(machine_id, simulation_params):
    """Simulate the hourly baseline readings for one machine.

    Reads 'num_timepoints' and 'machine_age' (indexed by ``machine_id - 1``)
    from ``simulation_params``. Returns a DataFrame with one row per hour
    starting at 2023-01-01, uniformly distributed sensor columns, a
    'maintenance_schedule' column that is 'Yes' with probability 0.95, and a
    'failure' column of integer zeros.
    """
    n_rows = simulation_params['num_timepoints']
    age = simulation_params['machine_age'][machine_id - 1]

    # Fixed columns first; the random draws below follow the column order
    columns = {
        'time': pd.date_range('2023-01-01', periods=n_rows, freq='h'),
        'machine_id': machine_id,
        'machine_age': age,
        'maintenance_schedule': np.random.choice(['Yes', 'No'], size=n_rows, p=[0.95, 0.05]),
    }

    # (column name, lower bound, upper bound) for each uniformly drawn sensor
    uniform_sensors = (
        ('ambient_temperature', 20, 35),
        ('load', 0.8, 1.5),
        ('operating_temperature', 60, 85),
        ('vibration_level', 15, 25),
        ('power_consumption', 0.9, 1.5),
        ('pressure', 120, 150),
        ('utilization_rate', 60, 95),
    )
    for name, low, high in uniform_sensors:
        columns[name] = np.random.uniform(low, high, size=n_rows)

    # No failures until patterns are injected
    columns['failure'] = np.zeros(n_rows, dtype=int)

    return pd.DataFrame(columns)

# Configuration of the simulated fleet
simulation_params = dict(
    num_machines=5,                    # how many machines to simulate
    num_timepoints=8760,               # hourly rows per machine: 365 days * 24 hours
    machine_age=[12, 24, 36, 48, 60],  # age of each machine in months
)

# Build the dataset from the configuration above
dataset = generate_machine_failure_dataset(simulation_params)

# Report how many rows are flagged as failures across all machines
print(f"Total number of failures in the dataset: {dataset['failure'].sum()}")


Pattern 1: 25
Pattern 2: 0
Pattern 3: 0
Pattern 4: 0
Pattern 5: 0
Pattern 6: 0
Total Patterns Injected:  25
Pattern 1: 0
Pattern 2: 30
Pattern 3: 0
Pattern 4: 0
Pattern 5: 0
Pattern 6: 0
Total Patterns Injected:  30
Pattern 1: 0
Pattern 2: 0
Pattern 3: 20
Pattern 4: 0
Pattern 5: 0
Pattern 6: 10
Total Patterns Injected:  30
Pattern 1: 0
Pattern 2: 0
Pattern 3: 0
Pattern 4: 15
Pattern 5: 0
Pattern 6: 10
Total Patterns Injected:  25
Pattern 1: 0
Pattern 2: 0
Pattern 3: 0
Pattern 4: 0
Pattern 5: 10
Pattern 6: 10
Total Patterns Injected:  20
Total number of failures in the dataset: 130


In [3]:
# Show how many rows have failure == 0 and how many have failure == 1
print(dataset['failure'].value_counts())

failure
0    43670
1      130
Name: count, dtype: int64
