In [1]:
import numpy as np
import pandas as pd
import random as r
import itertools as it


In [2]:

def generate_dataset(params=None, simulation_params=None):
    """Simulate one dataset that contains a single planted subgroup.

    ``params`` is read positionally: ``params[0]`` is forwarded as the
    subgroup's ``dist``, ``params[1]`` as its ``sd`` and ``params[2]`` as the
    number of literals of the true description.  ``simulation_params`` must
    provide ``'N'``, ``'ncovs'``, ``'p'`` and ``'trend_name'`` and is also
    forwarded to ``generate_subgroup`` and ``define_attributes``.

    Returns ``(dataset_ordered, attributes, descriptives, true_description)``.
    """
    covs, cov_names = sample_covs(
        N=simulation_params['N'],
        ncovs=simulation_params['ncovs'],
        p=simulation_params['p'],
    )

    # Baseline outcome for every row: N(10, 1).
    baseline = covs.copy()
    baseline[simulation_params['trend_name']] = np.random.normal(
        loc=10.0, scale=1.0, size=simulation_params['N']
    )

    true_description = find_true_description(
        nlits=params[2], ncovs=simulation_params['ncovs']
    )

    # Overwrite the outcome of the rows covered by the true description.
    with_subgroup = generate_subgroup(
        df=baseline,
        true_description=true_description,
        dist=params[0],
        sd=params[1],
        simulation_params=simulation_params,
    )
    dataset_ordered, attributes, descriptives = define_attributes(
        dataset=with_subgroup,
        cov_names=cov_names,
        simulation_params=simulation_params,
    )

    return dataset_ordered, attributes, descriptives, true_description


In [3]:

def define_attributes(dataset=None, cov_names=None, simulation_params=None):
    """Attach a time point and an id to every row and describe the columns.

    Each row receives a time point ``tp`` drawn uniformly (with replacement)
    from ``1 .. simulation_params['tp']``; the rows are then sorted by ``tp``
    and numbered ``0 .. len-1`` in the ``id`` column.  The draw uses Python's
    ``random`` module (``r.choices``), so it is not controlled by
    ``np.random.seed``.

    The input ``dataset`` is left untouched: the new columns are written to a
    copy (previously ``tp`` was added to the caller's frame as a side effect).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain exactly ``simulation_params['N']`` rows.
    cov_names : list
        Reported unchanged as the binary attributes.
    simulation_params : dict
        Needs the keys ``'tp'``, ``'N'`` and ``'trend_name'``.

    Returns
    -------
    (data_sorted, attributes, descriptives)
    """
    time_points = list(np.arange(1, simulation_params['tp'] + 1))

    # Work on a copy so the caller's frame does not silently gain a column.
    with_tp = dataset.copy()
    with_tp['tp'] = r.choices(time_points, k=simulation_params['N'])

    data_sorted = with_tp.sort_values(['tp'], ascending=[True])
    data_sorted['id'] = np.arange(len(data_sorted))

    descriptives = {'num_atts': [], 'bin_atts': cov_names,
                    'nom_atts': [], 'ord_atts': []}
    attributes = {'time_attribute': ['tp'], 'skip_attributes': [],
                  'id_attribute': ['id'], 'outcome_attribute': [simulation_params['trend_name']]}

    return data_sorted, attributes, descriptives


In [4]:

def generate_subgroup(df=None, true_description=None, dist=None, sd=None, simulation_params=None):
    """Plant a subgroup by redrawing the outcome of the rows it covers.

    A row belongs to the subgroup when, for every key of ``true_description``,
    the column of that name equals the associated value.  For those rows the
    column ``simulation_params['trend_name']`` is replaced by draws from
    N(10 + ``dist``, ``sd``).  ``df`` itself is not modified; a modified copy
    is returned.
    """
    planted = df.copy()

    # Row-wise check of all literals at once (the Series aligns on column names).
    literals = pd.Series(true_description)
    covered = planted[list(true_description.keys())].eq(literals).all(axis=1)

    shifted = np.random.normal(loc=10.0 + dist, scale=sd, size=np.sum(covered))
    planted.loc[covered, simulation_params['trend_name']] = shifted

    return planted



In [5]:
def sample_covs(N=None, ncovs=None, p=None):
    """Draw ``ncovs`` independent binary covariates for ``N`` rows.

    Every covariate is a vector of ``N`` Binomial(1, ``p``) draws.  The
    columns are named ``x1 .. x<ncovs>`` and are sampled in that order.

    Returns ``(covs, cov_names)``: the DataFrame and the list of its column
    names.
    """
    cov_names = ['x' + str(k) for k in np.arange(1, ncovs + 1)]

    covs = pd.DataFrame()
    for name in cov_names:
        covs[name] = np.random.binomial(n=1, p=p, size=N)

    return covs, cov_names



In [6]:
def find_true_description(nlits=None, ncovs=None):
    """Pick the literals that define the planted subgroup.

    ``nlits`` distinct covariate numbers are sampled without replacement from
    ``1 .. ncovs`` (using Python's ``random`` module).  Each selected
    covariate ``k`` contributes the literal ``'x<k>' == 1``.

    Returns a dict mapping the chosen column names to ``1``.
    """
    candidates = list(np.arange(1, ncovs + 1))
    chosen = r.sample(candidates, nlits)
    return {'x' + str(number): 1 for number in chosen}



In [7]:
def process_result(result_emm=None, true_description=None):
    """Look up the true description in a result table and summarise the match.

    Parameters
    ----------
    result_emm : pandas.DataFrame
        Result table that is indexed with ``.loc['description', ...]`` and
        ``.loc['qualities', ...]`` and whose 'qualities' part has the columns
        ``'qm_value'``, ``'sg'`` and ``'sg_size'``.
        NOTE(review): presumably a frame with a two-level row index
        (block name, subgroup number) -- confirm against the producer.
        The frame is modified in place: a NaN column is added for every
        literal of ``true_description`` that is not yet a column.
    true_description : dict
        Only the keys are used; every literal is compared against ``[1]``
        regardless of the value stored in the dict.

    Returns
    -------
    dict
        Keys ``'quality_value'``, ``'rank'`` (``sg + 1``) and ``'size'`` of
        the first matching subgroup, or ``0`` / ``51`` / ``0`` when no
        subgroup matches.
    """

    #print(true_description)
    # NOTE(review): `vars` shadows the Python builtin of the same name inside this function.
    vars = list(true_description.keys())
    # Make sure every true literal exists as a column so the comparison below has it.
    for var in vars:
        if var not in result_emm:
            result_emm[var] = np.nan

    # Covariate columns are recognised purely by their name starting with 'x'.
    cols = result_emm.dtypes.index
    covs = cols[cols.str.startswith('x')]
    descriptions = result_emm.loc['description', covs]
    descriptions.reset_index(drop=True,inplace=True)
    # 999 is the sentinel for "covariate not part of this description".
    descriptions.fillna(value=999,inplace=True)
    #stack = descriptions.stack()
    #stack[pd.isnull(stack)] = 999
    #descriptions = stack.unstack()
    #descriptions[pd.isnull(descriptions)] = 999
    #print(descriptions)

    # Expected row: sentinel everywhere except the true literals, which are expected as [1].
    # NOTE(review): the expected value is the list [1], not the scalar 1 -- this
    # presumably mirrors how descriptions are stored in result_emm; confirm.
    all_covs = {k: 999 for k in covs}
    for lit in vars:
        all_covs[lit] = [1]
    
    
    # Element-wise comparison of every description row with the expected row.
    equal = descriptions.apply(lambda row: row == pd.Series(all_covs), axis=1)

    quals = result_emm.loc['qualities', :] 
    quals.reset_index(drop=True,inplace=True)
    # Keep the quality rows whose description matches on every covariate.
    sel_qual = quals.loc[equal.all(axis=1)]

    result = {}
    if len(sel_qual) > 0:    
        result['quality_value'] = sel_qual['qm_value'].values[0]
        result['rank'] = sel_qual['sg'].values[0] + 1
        result['size'] = sel_qual['sg_size'].values[0]
    else:
        # Not found: fixed fallback values.
        # NOTE(review): 51 looks like "one past a result list of 50" -- confirm.
        result['quality_value'] = 0
        result['rank'] = 51
        result['size'] = 0

    return result

In [8]:
import numpy as np
import pandas as pd
import random as r

def generate_machine_failure_dataset(simulation_params=None):
    """Build the combined dataset of all simulated machines.

    Seeds NumPy's global random generator with 0, generates the data of
    machines ``1 .. simulation_params['num_machines']`` one after another via
    ``generate_machine_data`` and stacks the per-machine frames under a fresh
    0-based index.
    """
    # Fixed seed so repeated runs produce the same dataset.
    np.random.seed(0)

    machine_ids = range(1, simulation_params['num_machines'] + 1)
    per_machine = [generate_machine_data(machine_id, simulation_params)
                   for machine_id in machine_ids]

    return pd.concat(per_machine, ignore_index=True)

def generate_machine_data(machine_id, simulation_params):
    """Simulate the hourly sensor readings and failure flags of one machine.

    Produces ``simulation_params['num_timepoints']`` rows starting at
    2023-01-01 00:00.  The machine's age is taken from
    ``simulation_params['machine_age'][machine_id - 1]``.  Failures are
    derived from the simulated readings by ``seed_failure_patterns``.

    Returns a DataFrame with one row per hour.
    """
    n = simulation_params['num_timepoints']
    timestamp = pd.date_range(start='2023-01-01', periods=n, freq='H')

    def gaussian(mean, spread):
        return np.random.normal(loc=mean, scale=spread, size=n)

    def flat(low, high):
        return np.random.uniform(low, high, size=n)

    # The draws below consume the global random stream; their order is part of
    # the reproducible output and must not be changed.
    operating_temperature = gaussian(70, 10)
    vibration_level = gaussian(15, 5)
    power_consumption = gaussian(50, 5)
    pressure = gaussian(120, 20)
    utilization_rate = flat(50, 100)
    ambient_temperature = gaussian(25, 5)
    load = flat(0.5, 1.0)
    machine_age = simulation_params['machine_age'][machine_id - 1]
    maintenance_scheduled = np.random.choice(['Yes', 'No'], size=n, p=[0.8, 0.2])

    # Start without failures, then let the pattern rules mark failing hours.
    failure = seed_failure_patterns(
        np.zeros(n), machine_id, machine_age, maintenance_scheduled,
        ambient_temperature, load, operating_temperature, vibration_level,
        power_consumption, pressure, utilization_rate, simulation_params,
    )

    columns = {
        'timestamp': timestamp,
        'machine_id': machine_id,
        'operating_temperature': operating_temperature,
        'vibration_level': vibration_level,
        'power_consumption': power_consumption,
        'pressure': pressure,
        'utilization_rate': utilization_rate,
        'maintenance_scheduled': maintenance_scheduled,
        'ambient_temperature': ambient_temperature,
        'load': load,
        'machine_age': machine_age,
        'failure': failure,
    }
    return pd.DataFrame(columns)

def _run_starts(condition, length):
    """Return every index t for which condition[t:t+length] is entirely True."""
    hits = np.flatnonzero(condition)
    # `hits` is strictly increasing, so `length` successive hits form a gap-free
    # run exactly when the first and the last of them are length-1 apart.
    # The `+ 1` makes sure the last possible window is examined as well.
    return [
        int(hits[i])
        for i in range(len(hits) - length + 1)
        if hits[i + length - 1] - hits[i] == length - 1
    ]


def _any_within(condition, start, length):
    """Return True if condition holds at any index in [start, start+length), clipped to the array."""
    lower = max(start, 0)
    upper = max(start + length, 0)
    return bool(condition[lower:upper].any())


def seed_failure_patterns(failure,machine_id,machine_age,maintenance_scheduled,ambient_temperature,load, operating_temperature, vibration_level, power_consumption, pressure, utilization_rate, simulation_params):
    """Mark failures in ``failure`` wherever the readings match a failure rule.

    ``failure`` is modified in place (matching hours are set to 1) and also
    returned.  All reading arguments are expected to be 1-D arrays of the same
    length as ``failure``.  ``simulation_params`` is accepted for interface
    compatibility but not used.

    Rules (as implemented):

    * machine 1: operating_temperature > 80 for 10 consecutive hours; the
      10th hour fails.
    * machine 2: an hour-to-hour vibration increase of more than 20 at hour i,
      together with an hour-to-hour power change of more than 10% somewhere in
      hours i-24 .. i; hour i+6 fails (if it exists).
    * machine 3: pressure > 150 for 12 consecutive hours starting at t, with
      at least one hour of utilization_rate > 90 in the first 8 and at least
      one hour of maintenance == 'No' in the first 4; hour t+11 fails.
    * machine 4: ambient_temperature > 35 for 3 consecutive hours starting at
      t, with at least one hour of load > 1.5 in the first 2 and at least one
      hour of operating_temperature > 80 in hours t .. t+9; hour t+2 fails.
    * machine 5: like machine 3, additionally requiring at least one power
      change of more than 10% in hours t .. t+23 and at least one hour of
      operating_temperature > 80 in hours t .. t+9.
    * every machine with machine_age > 30: operating_temperature > 100 for 5
      consecutive hours; the 5th hour fails.

    A run that is longer than required triggers one failure per qualifying
    start hour.  Compared with the previous implementation, a qualifying run
    is no longer missed when it is made up of the last hours that exceed the
    threshold (the loops stopped one window early).

    Prints the number of matches of the machine-specific rule (machines 1-5)
    followed by the number of matches of the age rule.
    """
    pattern6 = 0

    if machine_id == 1:
        pattern1 = 0
        too_hot = np.asarray(operating_temperature) > 80
        for t in _run_starts(too_hot, 10):
            failure[t + 9] = 1
            pattern1 += 1
        print(pattern1)

    elif machine_id == 2:
        pattern2 = 0
        power = np.asarray(power_consumption)
        vib_jump = np.diff(vibration_level) > 20
        # Index i of power_jump compares hour i+1 with hour i.
        power_jump = np.abs(np.diff(power)) > 0.1 * power[:-1]
        for i in np.flatnonzero(vib_jump):
            if i + 6 < len(failure) and _any_within(power_jump, i - 24, 25):
                failure[i + 6] = 1
                pattern2 += 1
        print(pattern2)

    elif machine_id == 3:
        pattern3 = 0
        high_pressure = np.asarray(pressure) > 150
        high_util = np.asarray(utilization_rate) > 90
        no_maintenance = np.asarray(maintenance_scheduled) == 'No'
        for t in _run_starts(high_pressure, 12):
            if _any_within(high_util, t, 8) and _any_within(no_maintenance, t, 4):
                failure[t + 11] = 1
                pattern3 += 1
        print(pattern3)

    elif machine_id == 4:
        pattern4 = 0
        hot_ambient = np.asarray(ambient_temperature) > 35
        overloaded = np.asarray(load) > 1.5
        too_hot = np.asarray(operating_temperature) > 80
        for t in _run_starts(hot_ambient, 3):
            if _any_within(overloaded, t, 2) and _any_within(too_hot, t, 10):
                failure[t + 2] = 1
                pattern4 += 1
        print(pattern4)

    elif machine_id == 5:
        pattern5 = 0
        power = np.asarray(power_consumption)
        high_pressure = np.asarray(pressure) > 150
        power_jump = np.abs(np.diff(power)) > 0.1 * power[:-1]
        too_hot = np.asarray(operating_temperature) > 80
        high_util = np.asarray(utilization_rate) > 90
        no_maintenance = np.asarray(maintenance_scheduled) == 'No'
        for t in _run_starts(high_pressure, 12):
            if (_any_within(power_jump, t, 24)
                    and _any_within(too_hot, t, 10)
                    and _any_within(high_util, t, 8)
                    and _any_within(no_maintenance, t, 4)):
                failure[t + 11] = 1
                pattern5 += 1
        print(pattern5)

    # Age rule, applied to every machine in addition to its own rule.
    if machine_age > 30:
        overheated = np.asarray(operating_temperature) > 100
        for t in _run_starts(overheated, 5):
            failure[t + 4] = 1
            pattern6 += 1
    print(pattern6)
    return failure

# Simulation parameters for the rule-based ("seeded") failure dataset.
simulation_params = {
    'num_machines': 5,               # Number of machines
    'num_timepoints': 8760,          # One year of hourly data (365 days * 24 hours)
    'machine_age': [12, 24, 36, 48, 60],  # Machine ages in months, one entry per machine (looked up as machine_id - 1)
}
# Generate the dataset. The generator seeds NumPy with 0 itself, and
# seed_failure_patterns prints two match counts per machine (machine-specific
# rule, then the age rule).
dataset = generate_machine_failure_dataset(simulation_params)


0
0
22
0
0
0
0
0
0
0


In [9]:
# Count how many times failure == 1 in the whole dataset.
# The sum is a float (e.g. 22.0) because the failure column of this dataset is
# created with np.zeros() without an integer dtype.
failure_count = dataset['failure'].sum()
print(f"Total number of failures in the dataset: {failure_count}")


Total number of failures in the dataset: 22.0


In [10]:
# save the dataset as a csv file
dataset.to_csv('random_gen_dataset.csv', index=False)

In [11]:
dataset.head()

Unnamed: 0,timestamp,machine_id,operating_temperature,vibration_level,power_consumption,pressure,utilization_rate,maintenance_scheduled,ambient_temperature,load,machine_age,failure
0,2023-01-01 00:00:00,1,87.640523,4.847366,50.749384,149.327627,50.150249,Yes,28.199156,0.5582,12,0.0
1,2023-01-01 01:00:00,1,74.001572,11.819197,58.245597,116.886084,55.164327,Yes,33.235991,0.885296,12,0.0
2,2023-01-01 02:00:00,1,79.78738,12.902442,50.036318,102.678343,74.34269,Yes,32.066503,0.732261,12,0.0
3,2023-01-01 03:00:00,1,92.408932,18.558026,53.601882,111.595638,93.56757,Yes,29.213413,0.576724,12,0.0
4,2023-01-01 04:00:00,1,88.67558,18.0532,55.881383,100.042076,77.24861,Yes,30.053019,0.563416,12,0.0


In [12]:
dataset[dataset['failure'] == 1.0]

Unnamed: 0,timestamp,machine_id,operating_temperature,vibration_level,power_consumption,pressure,utilization_rate,maintenance_scheduled,ambient_temperature,load,machine_age,failure
8863,2023-01-05 07:00:00,2,62.268314,12.530564,44.998036,113.738506,94.533365,Yes,20.613813,0.913076,24,1.0
8951,2023-01-08 23:00:00,2,61.134892,15.739043,43.231048,98.667246,50.6727,Yes,31.255828,0.930647,24,1.0
9167,2023-01-17 23:00:00,2,81.907383,20.449314,49.751778,139.279955,93.739752,No,25.177008,0.981068,24,1.0
9361,2023-01-26 01:00:00,2,67.622184,22.770604,43.65054,150.31825,96.395485,Yes,25.674112,0.675205,24,1.0
9462,2023-01-30 06:00:00,2,58.694567,13.901423,49.380229,146.829131,89.579285,Yes,22.628511,0.668672,24,1.0
9670,2023-02-07 22:00:00,2,61.015939,22.228883,49.529963,91.083685,56.838378,Yes,25.472852,0.822766,24,1.0
9967,2023-02-20 07:00:00,2,78.77985,19.512214,48.528824,117.243833,62.91221,No,32.46463,0.839565,24,1.0
11025,2023-04-05 09:00:00,2,47.502754,11.963047,57.655079,77.958142,88.759653,No,30.793761,0.592685,24,1.0
11316,2023-04-17 12:00:00,2,81.794174,22.447105,61.801301,103.234683,87.999061,Yes,16.263389,0.963984,24,1.0
11408,2023-04-21 08:00:00,2,65.951176,16.436797,53.041093,121.891302,89.870467,Yes,17.549158,0.607608,24,1.0


In [13]:
# Injecting failure patterns in to the dataset
import numpy as np
import pandas as pd

def generate_machine_failure_dataset(simulation_params=None):
    """Build the combined dataset with injected failure patterns.

    Seeds NumPy's global random generator with 42; then, for machines
    ``1 .. simulation_params['num_machines']`` in turn, simulates the raw
    readings (``generate_machine_data``) and overwrites parts of them with
    failure patterns (``inject_failure_patterns``).  The per-machine frames
    are stacked under a fresh 0-based index.
    """
    # Fixed seed so repeated runs produce the same dataset.
    np.random.seed(42)

    def build_machine(machine_id):
        raw = generate_machine_data(machine_id, simulation_params)
        return inject_failure_patterns(raw, machine_id, simulation_params)

    machine_ids = range(1, simulation_params['num_machines'] + 1)
    per_machine = [build_machine(machine_id) for machine_id in machine_ids]

    return pd.concat(per_machine, ignore_index=True)

def inject_failure_patterns(df, machine_id, simulation_params):
    """Inject failure patterns into one machine's data by overwriting readings.

    For the pattern that belongs to ``machine_id`` a number of random start
    rows is chosen; for each start the readings of the following rows are
    overwritten so that they show the pattern, and ``failure`` is set to 1 on
    the row noted below.  All rows of one window receive the same value
    (a single uniform draw per window and column).

    * machine 1 (25x): operating_temperature 76-85 for 6 rows, failure on the 6th.
    * machine 2 (30x): vibration_level 19-25 for 24 rows, power_consumption
      scaled by 1.15 on 5 random rows of the window, failure on the 24th.
    * machine 3 (20x): pressure 141-150 and utilization_rate 86-95 for 8 rows,
      maintenance_schedule 'No' for 4 rows, failure on the 8th.
    * machine 4 (15x): ambient_temperature 33-35 and load 1.2-1.5 for 2 rows,
      operating_temperature 76-85 for 10 rows, failure on the 2nd.
    * machine 5 (10x): pressure 141-150, power_consumption scaled by 1.2 and
      utilization_rate 86-95 for 8 rows, operating_temperature 76-85 for 10
      rows, maintenance_schedule 'No' for 4 rows, failure on the 8th.
    * pattern 6 (10x), for every machine whose
      ``simulation_params['machine_age'][machine_id - 1]`` exceeds 30:
      operating_temperature 101-110 for 5 rows, failure on the 5th.

    Windows are selected with label-based ``.loc`` slices, so ``df`` is
    expected to carry a 0-based integer index.  ``df`` is modified in place
    and returned.  The number of injections per pattern is printed.
    """
    pattern_count = [0, 0, 0, 0, 0, 0, 0]	# Count of each pattern injected (index 0 is unused)
    num_timepoints = len(df)

    if machine_id == 1:
        # Pattern: operating_temperature > 75°C for 6+ consecutive hours
        start_indices = np.random.choice(range(num_timepoints - 6), size=25, replace=False)
        for start in start_indices:
            df.loc[start:start+5, 'operating_temperature'] = np.random.uniform(76, 85)  # Set temperature above 75°C for 6 hours
            df.loc[start+5, 'failure'] = 1
            pattern_count[1] += 1

    elif machine_id == 2:
        # Pattern: vibration_level > 18 mm/s & power_consumption fluctuates > 10% within a day
        start_indices = np.random.choice(range(num_timepoints - 24), size=30, replace=False)
        for start in start_indices:
            df.loc[start:start+23, 'vibration_level'] = np.random.uniform(19, 25)  # Set vibration level above 18 mm/s
            fluctuation_indices = np.random.choice(range(start, start + 24), size=5, replace=False)
            df.loc[fluctuation_indices, 'power_consumption'] *= 1.15  # Apply fluctuations to power consumption
            df.loc[start+23, 'failure'] = 1
            pattern_count[2] += 1

    elif machine_id == 3:
        # Pattern: pressure > 140 PSI for 8 hours & utilization_rate > 85% & maintenance = No for 4 hours
        start_indices = np.random.choice(range(num_timepoints - 8), size=20, replace=False)
        for start in start_indices:
            df.loc[start:start+7, 'pressure'] = np.random.uniform(141, 150)  # Set pressure above 140 PSI for 8 hours
            df.loc[start:start+7, 'utilization_rate'] = np.random.uniform(86, 95)  # Set utilization rate above 85%
            df.loc[start:start+3, 'maintenance_schedule'] = 'No'  # Set maintenance status to 'No' for 4 hours
            df.loc[start+7, 'failure'] = 1
            pattern_count[3] += 1

    elif machine_id == 4:
        # Pattern: ambient_temperature > 33°C for 2 hours & load > 1.2 & operating_temperature > 75°C
        start_indices = np.random.choice(range(num_timepoints - 10), size=15, replace=False)
        for start in start_indices:
            df.loc[start:start+1, 'ambient_temperature'] = np.random.uniform(33, 35)  # Set ambient temperature above 33°C for 2 hours
            df.loc[start:start+1, 'load'] = np.random.uniform(1.2, 1.5)  # Set load above 1.2
            df.loc[start:start+9, 'operating_temperature'] = np.random.uniform(76, 85)  # Set operating temperature in a range above 75°C
            df.loc[start+1, 'failure'] = 1
            pattern_count[4] += 1

    elif machine_id == 5:
        # Pattern: pressure > 140 PSI for 8 hours & multiple other conditions
        start_indices = np.random.choice(range(num_timepoints - 8), size=10, replace=False)
        for start in start_indices:
            df.loc[start:start+7, 'pressure'] = np.random.uniform(141, 150)  # Set pressure above 140 PSI for 8 hours
            df.loc[start:start+7, 'power_consumption'] *= 1.2  # Set power consumption fluctuation
            df.loc[start:start+9, 'operating_temperature'] = np.random.uniform(76, 85)  # Set operating temperature above 75°C
            df.loc[start:start+7, 'utilization_rate'] = np.random.uniform(86, 95)  # Set utilization rate above 85%
            df.loc[start:start+3, 'maintenance_schedule'] = 'No'  # Set maintenance status to 'No' for 4 hours
            df.loc[start+7, 'failure'] = 1
            pattern_count[5] += 1

    # Pattern 6 (common to all machines): machine_age > 30 months & operating_temperature > 100°C for 5+ hours
    if simulation_params['machine_age'][machine_id - 1] > 30:
        start_indices = np.random.choice(range(num_timepoints - 5), size=10, replace=False)
        for start in start_indices:
            # The range used to be (101, 85), which produced 85-101°C and thus
            # mostly values that do not satisfy the "> 100°C" pattern.
            df.loc[start:start+4, 'operating_temperature'] = np.random.uniform(101, 110)
            df.loc[start+4, 'failure'] = 1
            pattern_count[6] += 1

    # print(f"Machine {machine_id} - Total Patterns Injected: {pattern_count}")
    for i in range(1, 7):
        print(f"Pattern {i}: {pattern_count[i]}")
    print("Total Patterns Injected: ", sum(pattern_count))

    return df

def generate_machine_data(machine_id, simulation_params):
    """Simulate the raw hourly time series of one machine (no failures yet).

    Produces ``simulation_params['num_timepoints']`` rows starting at
    2023-01-01 00:00.  ``machine_age`` is taken from
    ``simulation_params['machine_age'][machine_id - 1]``; all sensor readings
    are independent uniform draws and ``failure`` is 0 everywhere.
    """
    n = simulation_params['num_timepoints']
    age = simulation_params['machine_age'][machine_id - 1]

    def uniform(low, high):
        return np.random.uniform(low, high, size=n)

    # Columns are filled in display order; the random draws happen in exactly
    # this order, which keeps the seeded output reproducible.
    columns = {}
    columns['time'] = pd.date_range('2023-01-01', periods=n, freq='h')
    columns['machine_id'] = machine_id
    columns['machine_age'] = age
    columns['maintenance_schedule'] = np.random.choice(['Yes', 'No'], size=n, p=[0.95, 0.05])
    columns['ambient_temperature'] = uniform(20, 35)
    columns['load'] = uniform(0.8, 1.5)
    columns['operating_temperature'] = uniform(60, 85)
    columns['vibration_level'] = uniform(15, 25)
    columns['power_consumption'] = uniform(0.9, 1.5)
    columns['pressure'] = uniform(120, 150)
    columns['utilization_rate'] = uniform(60, 95)
    columns['failure'] = np.zeros(n, dtype=int)  # no failures initially

    return pd.DataFrame(columns)

# Simulation parameters for the dataset with injected failure patterns.
# This rebinds `simulation_params` from the earlier cell (same values).
simulation_params = {
    'num_machines': 5,               # Number of machines
    'num_timepoints': 8760,          # One year of hourly data (365 days * 24 hours)
    'machine_age': [12, 24, 36, 48, 60],  # Machine ages in months, one entry per machine (looked up as machine_id - 1)
}

# Generate the dataset. This replaces the `dataset` built earlier in the
# notebook; every later cell works on this injected-pattern version.
dataset = generate_machine_failure_dataset(simulation_params)

# Count how many times failure == 1 in the whole dataset
print(f"Total number of failures in the dataset: {dataset['failure'].sum()}")


Pattern 1: 25
Pattern 2: 0
Pattern 3: 0
Pattern 4: 0
Pattern 5: 0
Pattern 6: 0
Total Patterns Injected:  25
Pattern 1: 0
Pattern 2: 30
Pattern 3: 0
Pattern 4: 0
Pattern 5: 0
Pattern 6: 0
Total Patterns Injected:  30
Pattern 1: 0
Pattern 2: 0
Pattern 3: 20
Pattern 4: 0
Pattern 5: 0
Pattern 6: 10
Total Patterns Injected:  30
Pattern 1: 0
Pattern 2: 0
Pattern 3: 0
Pattern 4: 15
Pattern 5: 0
Pattern 6: 10
Total Patterns Injected:  25
Pattern 1: 0
Pattern 2: 0
Pattern 3: 0
Pattern 4: 0
Pattern 5: 10
Pattern 6: 10
Total Patterns Injected:  20
Total number of failures in the dataset: 130


In [14]:
# Count the number of instances where the failure is 1 
print(dataset['failure'].value_counts())

0    43670
1      130
Name: failure, dtype: int64


In [15]:
#!pip install pysubgroup

In [16]:

import pysubgroup as ps

In [17]:
# Run subgroup discovery on the simulated dataset (not the pysubgroup example data).
#from pysubgroup.datasets import get_titanic_data
data = dataset

# Target: rows with failure == 1.
target = ps.BinaryTarget ('failure', 1.0)
# Selectors are built from every column except the ones listed in `ignore`.
# NOTE(review): 'time' (and 'machine_id' / 'machine_age') are not ignored, and
# the printed result is dominated by `time==<single timestamp>` subgroups of
# size 1-2. Consider adding 'time' to `ignore`; also confirm that excluding
# 'pressure' and 'load' is intended.
searchspace = ps.create_selectors(data, ignore=['failure', 'pressure', 'load'])
# Keep the 5 best subgroups, each described by at most 2 selectors.
task = ps.SubgroupDiscoveryTask (
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.SimpleBinomialQF())
result = ps.GpGrowth().execute(task)

In [18]:
print(result.to_dataframe())

    quality                                           subgroup  size_sg  \
0  0.006737  time==2023-10-09T04:00:00.000000000 AND vibrat...        2   
1  0.006737  time==2023-10-09T04:00:00.000000000 AND utiliz...        2   
2  0.006737  ambient_temperature: [26.05:29.07[ AND time==2...        2   
3  0.004764  machine_age==36 AND time==2023-01-01T11:00:00....        1   
4  0.004764  ambient_temperature: [26.05:29.07[ AND time==2...        1   

   size_dataset  positives_sg  positives_dataset  size_complement  \
0         43800             2                130            43798   
1         43800             2                130            43798   
2         43800             2                130            43798   
3         43800             1                130            43799   
4         43800             1                130            43799   

   relative_size_sg  relative_size_complement  coverage_sg  \
0          0.000046                  0.999954     0.015385   
1         

In [19]:
# Add a 'timestamp' column holding the 'time' column converted to datetime.
# Note: this adds the column to `dataset` itself (in place).
dataset['timestamp'] = pd.to_datetime(dataset['time'])

# Preparation for the slicing done in the following cells: no resampling
# happens here, the rows are only sorted by timestamp. Rows of different
# machines that share a timestamp end up next to each other; they are separated
# again later by grouping on machine_id.
data = dataset.sort_values(by='timestamp')


In [20]:
# Function to create overlapping windows
def create_overlapping_slices(df, machine_id_col, time_col, overlap_pct, window_size=24):
    """Cut ``df`` into fixed-length, possibly overlapping row windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single series, already in the desired order.
    machine_id_col : str
        Unused; kept so existing calls keep working.
    time_col : str
        Column that becomes the index of the returned windows.
    overlap_pct : float
        Fraction (< 1) of a window shared with the next one; 0 means
        back-to-back windows.
    window_size : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Every full window, in order; a trailing partial window is dropped.

    Raises
    ------
    ValueError
        If ``overlap_pct`` is 1 or more (the windows would never advance).
    """
    if overlap_pct >= 1:
        raise ValueError("overlap_pct must be smaller than 1")

    indexed = df.set_index(time_col)

    # Rows to advance between windows. The step is truncated as before; the
    # small epsilon only compensates float error such as
    # 10 * (1 - 0.9) == 0.9999999999999998, and the step never drops below one
    # row (a step of 0 made range() raise).
    step = max(1, int(window_size * (1 - overlap_pct) + 1e-9))

    last_start = len(indexed) - window_size
    return [indexed.iloc[start:start + window_size]
            for start in range(0, last_start + 1, step)]

# Creating 25% overlapping windows, separately per machine
# (24-row windows; with 25% overlap a new window starts every 18 rows).
# Result: dict mapping machine_id -> list of window DataFrames.
overlap_25_slices = {machine_id: create_overlapping_slices(group, 'machine_id', 'timestamp', 0.25)
                     for machine_id, group in data.groupby('machine_id')}

# Display the first slice for the first machine to verify
overlap_25_slices[1][0].head(20)


Unnamed: 0_level_0,time,machine_id,machine_age,maintenance_schedule,ambient_temperature,load,operating_temperature,vibration_level,power_consumption,pressure,utilization_rate,failure
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2023-01-01 00:00:00,2023-01-01 00:00:00,1,12,Yes,20.294158,1.192844,83.388032,23.207672,1.11709,120.979041,86.539368,0
2023-01-01 01:00:00,2023-01-01 01:00:00,1,12,No,30.07052,1.217269,69.803336,22.045199,0.908352,138.945137,87.85828,0
2023-01-01 02:00:00,2023-01-01 02:00:00,1,12,Yes,27.847372,1.086986,71.854734,19.402837,1.247404,149.200862,64.346927,0
2023-01-01 03:00:00,2023-01-01 03:00:00,1,12,Yes,33.479584,1.386938,70.539406,23.783877,0.98689,143.141168,71.374311,0
2023-01-01 04:00:00,2023-01-01 04:00:00,1,12,Yes,22.46589,0.929623,67.013676,16.376861,1.026603,127.632278,63.718242,0
2023-01-01 05:00:00,2023-01-01 05:00:00,1,12,Yes,32.061639,1.18839,82.376107,17.603386,0.972449,122.150661,93.898233,0
2023-01-01 06:00:00,2023-01-01 06:00:00,1,12,Yes,31.879415,1.305563,68.305972,19.895397,1.21226,138.499624,83.876567,0
2023-01-01 07:00:00,2023-01-01 07:00:00,1,12,Yes,21.907718,1.432783,74.464912,15.613386,0.957096,120.096867,60.673184,0
2023-01-01 08:00:00,2023-01-01 08:00:00,1,12,Yes,25.978388,1.418743,66.00084,15.956858,1.053814,143.777569,76.642669,0
2023-01-01 09:00:00,2023-01-01 09:00:00,1,12,Yes,22.55229,1.156118,79.934384,18.974434,1.171025,127.293616,69.223397,0


In [21]:

# Creating non-overlapping (0% overlap) windows, separately per machine:
# back-to-back 24-row windows.
overlap_0_slices = {machine_id: create_overlapping_slices(group, 'machine_id', 'timestamp', 0)
                     for machine_id, group in data.groupby('machine_id')}

In [22]:
# Create 50% and 75% overlapping windows (24-row windows, separately per machine)

# 50% overlap: a new window starts every 12 rows
overlap_50_slices = {machine_id: create_overlapping_slices(group, 'machine_id', 'timestamp', 0.50)
                     for machine_id, group in data.groupby('machine_id')}

# 75% overlap: a new window starts every 6 rows
overlap_75_slices = {machine_id: create_overlapping_slices(group, 'machine_id', 'timestamp', 0.75)
                     for machine_id, group in data.groupby('machine_id')}

# Spot check: show the first 50%-overlap slice of machine 1
# (the 75% result is not displayed here)
overlap_50_slices[1][0].head()


Unnamed: 0_level_0,time,machine_id,machine_age,maintenance_schedule,ambient_temperature,load,operating_temperature,vibration_level,power_consumption,pressure,utilization_rate,failure
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2023-01-01 00:00:00,2023-01-01 00:00:00,1,12,Yes,20.294158,1.192844,83.388032,23.207672,1.11709,120.979041,86.539368,0
2023-01-01 01:00:00,2023-01-01 01:00:00,1,12,No,30.07052,1.217269,69.803336,22.045199,0.908352,138.945137,87.85828,0
2023-01-01 02:00:00,2023-01-01 02:00:00,1,12,Yes,27.847372,1.086986,71.854734,19.402837,1.247404,149.200862,64.346927,0
2023-01-01 03:00:00,2023-01-01 03:00:00,1,12,Yes,33.479584,1.386938,70.539406,23.783877,0.98689,143.141168,71.374311,0
2023-01-01 04:00:00,2023-01-01 04:00:00,1,12,Yes,22.46589,0.929623,67.013676,16.376861,1.026603,127.632278,63.718242,0


In [23]:
len(overlap_0_slices[1])

365

In [24]:
!pip3 install tsfresh



In [25]:
!pip3 install --upgrade scipy



In [57]:
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.feature_extraction.settings import MinimalFCParameters


# Prepare data for tsfresh
def prepare_for_tsfresh(slices, time_col):
    """Stack all slices into one long frame with a per-slice ``id`` column.

    ``slices`` maps a machine id to a list of window DataFrames.  Each window
    is copied, tagged with ``id = '<machine_id>_<k>'`` (``k`` counts the
    DataFrame windows of that machine starting at 1; entries that are not
    DataFrames are skipped and not counted) and given a ``time`` column that
    holds its index values.  ``time_col`` is not used.

    Returns the concatenation of all tagged windows.
    """
    tagged_windows = []
    for machine_id, machine_slices in slices.items():
        frames_only = (s for s in machine_slices if isinstance(s, pd.DataFrame))
        for number, window in enumerate(frames_only, start=1):
            tagged = window.copy()
            tagged['id'] = f'{machine_id}_{number}'  # unique per slice
            tagged['time'] = tagged.index            # index (timestamp) as sortable column
            tagged_windows.append(tagged)
    return pd.concat(tagged_windows)

# Preparing the 50% overlap slices for tsfresh: one long frame in which every
# slice is identified by the 'id' column
overlap_50_prepared = prepare_for_tsfresh(overlap_50_slices, 'timestamp')

# Columns to extract features from (the sensor readings only)
columns_to_extract = ['operating_temperature', 'vibration_level', 'power_consumption',
                      'pressure', 'utilization_rate', 'ambient_temperature', 'load']

# Feature extraction settings: the minimal predefined set is used;
# the commented-out line switches to the larger EfficientFCParameters set.
#fc_parameters = EfficientFCParameters()
fc_parameters =MinimalFCParameters()
# Extract features from the 50% overlap slices using tsfresh:
# one row per slice id, the same settings for every listed column
overlap_50_features = extract_features(overlap_50_prepared, column_id="id", column_sort="time", 
                                       kind_to_fc_parameters={col: fc_parameters for col in columns_to_extract})

# Display the extracted features
print("Extracted features from 50% overlapping slices:\n", overlap_50_features.head(10))


Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:32<00:00,  1.08s/it]


Extracted features from 50% overlapping slices:
        ambient_temperature__sum_values  ambient_temperature__median  \
1_1                         651.205651                    26.074776   
1_10                        638.985757                    25.428522   
1_100                       655.408600                    27.678987   
1_101                       670.949869                    28.365765   
1_102                       670.050957                    27.173414   
1_103                       658.662235                    26.911202   
1_104                       652.357845                    26.555841   
1_105                       654.841408                    28.678329   
1_106                       669.045415                    29.124595   
1_107                       681.225197                    28.406343   

       ambient_temperature__mean  ambient_temperature__length  \
1_1                    27.133569                         24.0   
1_10                   26.624407       

In [82]:

def add_fail_in_slice_column(df):
    """Flag every row whose slice contains at least one failure.

    Adds the integer column ``fail_in_slice`` to ``df``: 1 for all rows of a
    slice (rows sharing the same ``id``) in which ``failure`` sums to more
    than 0, otherwise 0.  ``df`` is modified in place and returned.

    The flag is computed with a group-wise transform, which keeps the result
    in the row order of ``df``.  The previous implementation collected the
    flags in sorted-``id`` order ('1_1', '1_10', '1_100', ...) and assigned
    them by position, so slices received the flags of other slices whenever
    the rows were not already sorted by ``id``.
    """
    failures_in_slice = df.groupby('id')['failure'].transform('sum')

    # Assign by position: the transform result is row-aligned with df, and the
    # index (timestamps) may contain duplicates across slices.
    df['fail_in_slice'] = (failures_in_slice > 0).astype(int).to_numpy()

    return df


In [83]:
# Add the per-slice failure flag. add_fail_in_slice_column writes the new
# column into the frame it is given and returns that same frame, so
# overlap_50_with_fails and overlap_50_prepared refer to the same object.
overlap_50_with_fails = add_fail_in_slice_column(overlap_50_prepared)
print(overlap_50_with_fails.head())

                                   time  machine_id  machine_age  \
timestamp                                                          
2023-01-01 00:00:00 2023-01-01 00:00:00           1           12   
2023-01-01 01:00:00 2023-01-01 01:00:00           1           12   
2023-01-01 02:00:00 2023-01-01 02:00:00           1           12   
2023-01-01 03:00:00 2023-01-01 03:00:00           1           12   
2023-01-01 04:00:00 2023-01-01 04:00:00           1           12   

                    maintenance_schedule  ambient_temperature      load  \
timestamp                                                                 
2023-01-01 00:00:00                  Yes            20.294158  1.192844   
2023-01-01 01:00:00                   No            30.070520  1.217269   
2023-01-01 02:00:00                  Yes            27.847372  1.086986   
2023-01-01 03:00:00                  Yes            33.479584  1.386938   
2023-01-01 04:00:00                  Yes            22.465890  0.929623  

In [91]:
# One label per slice: the fail_in_slice value of the first row of every id.
# groupby returns the ids in sorted (string) order, e.g. '1_1', '1_10', '1_100', ...
fail_in_slice_array = overlap_50_with_fails.groupby('id')['fail_in_slice'].first().to_numpy()
# Sanity check: the number of labels should equal the number of feature rows.
print(len(fail_in_slice_array))
print(overlap_50_features.count())

3645
ambient_temperature__sum_values              3645
ambient_temperature__median                  3645
ambient_temperature__mean                    3645
ambient_temperature__length                  3645
ambient_temperature__standard_deviation      3645
ambient_temperature__variance                3645
ambient_temperature__root_mean_square        3645
ambient_temperature__maximum                 3645
ambient_temperature__absolute_maximum        3645
ambient_temperature__minimum                 3645
load__sum_values                             3645
load__median                                 3645
load__mean                                   3645
load__length                                 3645
load__standard_deviation                     3645
load__variance                               3645
load__root_mean_square                       3645
load__maximum                                3645
load__absolute_maximum                       3645
load__minimum                                

In [92]:
# Attach the per-slice label by slice id (index-aligned) instead of by
# position, so the result does not depend on the feature rows and the grouped
# labels happening to be in the same order.
overlap_50_features['fail_in_slice'] = overlap_50_with_fails.groupby('id')['fail_in_slice'].first()

In [93]:
overlap_50_features.head()

Unnamed: 0,ambient_temperature__sum_values,ambient_temperature__median,ambient_temperature__mean,ambient_temperature__length,ambient_temperature__standard_deviation,ambient_temperature__variance,ambient_temperature__root_mean_square,ambient_temperature__maximum,ambient_temperature__absolute_maximum,ambient_temperature__minimum,...,utilization_rate__median,utilization_rate__mean,utilization_rate__length,utilization_rate__standard_deviation,utilization_rate__variance,utilization_rate__root_mean_square,utilization_rate__maximum,utilization_rate__absolute_maximum,utilization_rate__minimum,fail_in_slice
1_1,651.205651,26.074776,27.133569,24.0,4.474481,20.020984,27.500028,34.605382,34.605382,20.294158,...,70.826981,73.721187,24.0,9.157248,83.855199,74.287742,93.898233,93.898233,60.673184,0
1_10,638.985757,25.428522,26.624407,24.0,4.336663,18.806646,26.975279,33.869767,33.869767,20.065394,...,80.133801,78.504917,24.0,9.882335,97.660536,79.124475,94.888392,94.888392,63.401868,0
1_100,655.4086,27.678987,27.308692,24.0,3.476891,12.088771,27.529137,34.641195,34.641195,20.99942,...,76.859942,78.788737,24.0,10.36307,107.393224,79.467341,94.499276,94.499276,64.796588,0
1_101,670.949869,28.365765,27.956245,24.0,4.192832,17.579842,28.268913,34.641195,34.641195,20.906548,...,75.756861,78.233513,24.0,9.434858,89.016549,78.800375,94.521618,94.521618,64.796588,0
1_102,670.050957,27.173414,27.91879,24.0,4.315326,18.622038,28.250325,34.354877,34.354877,20.50127,...,82.426092,79.321586,24.0,9.87625,97.540308,79.934062,94.521618,94.521618,60.852465,0
