In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

In [2]:
# Define column structures
SENSOR_COLUMNS = {
    'equipment_id': np.int32,
    'timestamp': 'datetime64[ns]',
    'temperature': np.float32,
    'vibration': np.float32,
    'pressure': np.float32,
    'rotational_speed': np.float32,
    'power_output': np.float32,
    'noise_level': np.float32,
    'voltage': np.float32,
    'current': np.float32,
    'oil_viscosity': np.float32
}

MAINTENANCE_COLUMNS = {
    'equipment_id': np.int32,
    'date': 'datetime64[ns]',
    'maintenance_type': 'category',
    'description': str,
    'technician_id': np.int32,
    'duration_hours': np.float32,
    'cost': np.float32,
    'parts_replaced': str,
    'maintenance_result': 'category'
}

EQUIPMENT_COLUMNS = {
    'equipment_id': np.int32,
    'model': str,
    'manufacturer': 'category',
    'installation_date': 'datetime64[ns]',
    'max_temperature': np.float32,
    'max_pressure': np.float32,
    'max_rotational_speed': np.float32,
    'expected_lifetime_years': np.float32,
    'warranty_period_years': np.int32,
    'last_major_overhaul': 'datetime64[ns]',
    'location': str,
    'criticality': 'category'
}

OPERATIONAL_COLUMNS = {
    'equipment_id': np.int32,
    'date': 'datetime64[ns]',
    'production_rate': np.float32,
    'operating_hours': np.float32,
    'downtime_hours': np.float32,
    'operator_id': np.int32,
    'product_type': 'category',
    'raw_material_quality': 'category',
    'ambient_temperature': np.float32,
    'ambient_humidity': np.float32
}

In [3]:
def generate_sensor_data(num_equipment, num_days):
    start_date = datetime.now() - timedelta(days=num_days)
    date_range = pd.date_range(start=start_date, periods=num_days*2, freq='12H')
    
    data = []
    for equipment_id in range(1, num_equipment + 1):
        for timestamp in date_range:
            data.append({
                'equipment_id': equipment_id,
                'timestamp': timestamp,
                'temperature': np.random.normal(60, 10),
                'vibration': np.random.normal(0.5, 0.1),
                'pressure': np.random.normal(100, 20),
                'rotational_speed': np.random.normal(1000, 100),
                'power_output': np.random.normal(500, 50),
                'noise_level': np.random.normal(70, 5),
                'voltage': np.random.normal(220, 10),
                'current': np.random.normal(100, 10),
                'oil_viscosity': np.random.normal(50, 5)
            })
    
    df = pd.DataFrame(data)
    return df.astype(SENSOR_COLUMNS)

In [4]:
def generate_maintenance_logs(num_equipment, num_years):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365 * num_years)
    date_range = pd.date_range(start=start_date, end=end_date)
    
    data = []
    for equipment_id in range(1, num_equipment + 1):
        for date in date_range:
            if np.random.random() < 0.01:  # 1% chance of maintenance event
                maintenance_type = np.random.choice(['Routine', 'Repair', 'Replacement', 'Inspection'])
                data.append({
                    'equipment_id': equipment_id,
                    'date': date,
                    'maintenance_type': maintenance_type,
                    'description': f"{maintenance_type} maintenance performed",
                    'technician_id': np.random.randint(1, 51),
                    'duration_hours': np.random.uniform(1, 8),
                    'cost': np.random.uniform(100, 5000),
                    'parts_replaced': np.random.choice(['None', 'Bearings', 'Seals', 'Filters', 'Motor'], p=[0.6, 0.1, 0.1, 0.1, 0.1]),
                    'maintenance_result': np.random.choice(['Successful', 'Partial', 'Failed'], p=[0.8, 0.15, 0.05])
                })
    
    df = pd.DataFrame(data)
    return df.astype(MAINTENANCE_COLUMNS)

In [5]:
def generate_equipment_specs(num_equipment):
    data = []
    for equipment_id in range(1, num_equipment + 1):
        installation_date = datetime.now() - timedelta(days=np.random.randint(365, 3650))
        data.append({
            'equipment_id': equipment_id,
            'model': f"Model-{np.random.randint(1000, 9999)}",
            'manufacturer': np.random.choice(['ManufacturerA', 'ManufacturerB', 'ManufacturerC']),
            'installation_date': installation_date,
            'max_temperature': np.random.uniform(80, 100),
            'max_pressure': np.random.uniform(150, 200),
            'max_rotational_speed': np.random.uniform(1000, 2000),
            'expected_lifetime_years': np.random.uniform(10, 20),
            'warranty_period_years': np.random.randint(1, 6),
            'last_major_overhaul': installation_date + timedelta(days=np.random.randint(365, 1825)),
            'location': f"Section-{np.random.randint(1, 6)}",
            'criticality': np.random.choice(['High', 'Medium', 'Low'], p=[0.2, 0.5, 0.3])
        })
    
    df = pd.DataFrame(data)
    return df.astype(EQUIPMENT_COLUMNS)

In [6]:
def generate_operational_data(num_equipment, num_days):
    start_date = datetime.now() - timedelta(days=num_days)
    date_range = pd.date_range(start=start_date, periods=num_days)
    
    data = []
    for equipment_id in range(1, num_equipment + 1):
        for date in date_range:
            data.append({
                'equipment_id': equipment_id,
                'date': date,
                'production_rate': np.random.uniform(80, 100),
                'operating_hours': np.random.uniform(20, 24),
                'downtime_hours': np.random.uniform(0, 4),
                'operator_id': np.random.randint(1, 101),
                'product_type': np.random.choice(['TypeA', 'TypeB', 'TypeC']),
                'raw_material_quality': np.random.choice(['High', 'Medium', 'Low'], p=[0.7, 0.2, 0.1]),
                'ambient_temperature': np.random.normal(25, 5),
                'ambient_humidity': np.random.uniform(30, 70)
            })
    
    df = pd.DataFrame(data)
    return df.astype(OPERATIONAL_COLUMNS)

In [7]:
def get_dataframe_size(df):
    return df.memory_usage(deep=True).sum() / (1024 * 1024)  # Size in MB

def print_dataset_info(name, df, columns):
    print(f"\n{name} Dataset:")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Size in memory: {get_dataframe_size(df):.2f} MB")
    print("Columns:")
    for col, dtype in columns.items():
        print(f"  - {col}: {dtype}")

In [8]:
# Generate datasets
num_equipment = 100
num_days = 365 # 1 year
num_years = 1

print("Generating datasets...")
sensor_data = generate_sensor_data(num_equipment, num_days)
maintenance_logs = generate_maintenance_logs(num_equipment, num_years)
equipment_specs = generate_equipment_specs(num_equipment)
operational_data = generate_operational_data(num_equipment, num_days)

Generating datasets...


  date_range = pd.date_range(start=start_date, periods=num_days*2, freq='12H')


In [9]:
# Print dataset information
print_dataset_info("Sensor Data", sensor_data, SENSOR_COLUMNS)
print_dataset_info("Maintenance Logs", maintenance_logs, MAINTENANCE_COLUMNS)
print_dataset_info("Equipment Specifications", equipment_specs, EQUIPMENT_COLUMNS)
print_dataset_info("Operational Data", operational_data, OPERATIONAL_COLUMNS)

# Save datasets to CSV files
sensor_data.to_csv('sensor_data.csv', index=False)
maintenance_logs.to_csv('maintenance_logs.csv', index=False)
equipment_specs.to_csv('equipment_specs.csv', index=False)
operational_data.to_csv('operational_data.csv', index=False)


Sensor Data Dataset:
Number of rows: 73000
Size in memory: 3.34 MB
Columns:
  - equipment_id: <class 'numpy.int32'>
  - timestamp: datetime64[ns]
  - temperature: <class 'numpy.float32'>
  - vibration: <class 'numpy.float32'>
  - pressure: <class 'numpy.float32'>
  - rotational_speed: <class 'numpy.float32'>
  - power_output: <class 'numpy.float32'>
  - noise_level: <class 'numpy.float32'>
  - voltage: <class 'numpy.float32'>
  - current: <class 'numpy.float32'>
  - oil_viscosity: <class 'numpy.float32'>

Maintenance Logs Dataset:
Number of rows: 370
Size in memory: 0.07 MB
Columns:
  - equipment_id: <class 'numpy.int32'>
  - date: datetime64[ns]
  - maintenance_type: category
  - description: <class 'str'>
  - technician_id: <class 'numpy.int32'>
  - duration_hours: <class 'numpy.float32'>
  - cost: <class 'numpy.float32'>
  - parts_replaced: <class 'str'>
  - maintenance_result: category

Equipment Specifications Dataset:
Number of rows: 100
Size in memory: 0.02 MB
Columns:
  - equi

In [10]:
# Calculate total size of CSV files
total_size_mb = sum(os.path.getsize(f) for f in ['sensor_data.csv', 'maintenance_logs.csv', 'equipment_specs.csv', 'operational_data.csv']) / (1024 * 1024)
print(f"\nTotal size of all CSV files: {total_size_mb:.2f} MB")


Total size of all CSV files: 11.41 MB
