In [8]:
!pip install scikit-learn 



In [9]:
# Import libraries
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
warnings.filterwarnings('ignore')

# Set style: seaborn grid theme and a default figure size for every plot
# produced later in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The banner emoji was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print("✅ Libraries imported successfully")

✅ Libraries imported successfully


In [10]:
# Load datasets
# The inputs are resolved relative to the notebook, from the same '../data'
# root that the processed outputs are written to, instead of a user-specific
# absolute Windows path. This lets the notebook run from any checkout.
# NOTE(review): assumes the notebook is executed from a folder that is a
# sibling of data/ (the save cell already relies on this) - confirm.
SYNTHETIC_DIR = Path('../data/synthetic')

equipment = pd.read_csv(SYNTHETIC_DIR / 'equipment.csv')
maintenance = pd.read_csv(SYNTHETIC_DIR / 'maintenance_records.csv')
failures = pd.read_csv(SYNTHETIC_DIR / 'failure_events.csv')

# Convert dates (read as plain strings by read_csv) to datetime64 so they can
# be compared and subtracted later.
equipment['purchase_date'] = pd.to_datetime(equipment['purchase_date'])
equipment['last_service_date'] = pd.to_datetime(equipment['last_service_date'])
maintenance['maintenance_date'] = pd.to_datetime(maintenance['maintenance_date'])
failures['failure_date'] = pd.to_datetime(failures['failure_date'])

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("📊 Data Loaded:")
print(f"   Equipment: {len(equipment)} records")
print(f"   Maintenance: {len(maintenance)} records")
print(f"   Failures: {len(failures)} records")

📊 Data Loaded:
   Equipment: 100 records
   Maintenance: 2093 records
   Failures: 656 records


In [11]:
# Equipment age in whole years, measured against the year in which the
# notebook is executed.
# NOTE: because the reference is the run date, ages (and every feature derived
# from them) shift when the notebook is re-run in a later calendar year.
current_year = datetime.today().year
equipment['age'] = equipment['year_manufactured'].rsub(current_year)  # current_year - year_manufactured

# Age groups
def categorize_age(age):
    """Map an equipment age to its age-group label.

    Ages up to 3 are 'New', up to 7 'Mid_Age' and up to 12 'Old'. Any value
    that satisfies none of these bounds falls through to 'Very_Old'.
    """
    upper_bounds = ((3, 'New'), (7, 'Mid_Age'), (12, 'Old'))
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Very_Old'

equipment['age_group'] = equipment['age'].apply(categorize_age)

# Usage intensity (operating hours per year)
# Equipment manufactured in the reference year has age 0, and dividing by it
# produced inf (or NaN for 0 hours), which was then labelled as usage 'High'.
# The denominator is floored at one year so that such equipment is rated on
# the hours it has actually accumulated.
years_in_service = equipment['age'].clip(lower=1)
equipment['usage_intensity'] = equipment['operating_hours'] / years_in_service

# Usage category
def categorize_usage(intensity):
    """Map a usage intensity (operating hours per year) to a usage label.

    Values below 300 are 'Low', below 800 'Medium'. Any value that is not
    below either bound falls through to 'High'.
    """
    for limit, label in ((300, 'Low'), (800, 'Medium')):
        if intensity < limit:
            return label
    return 'High'

equipment['usage_category'] = equipment['usage_intensity'].apply(categorize_usage)

# Report how the equipment is spread across the derived categories.
# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Equipment features created")
print("\nAge distribution:")
print(equipment['age_group'].value_counts())
print("\nUsage distribution:")
print(equipment['usage_category'].value_counts())

✅ Equipment features created

Age distribution:
age_group
Old         38
Mid_Age     37
New         17
Very_Old     8
Name: count, dtype: int64

Usage distribution:
usage_category
Medium    45
Low       35
High      20
Name: count, dtype: int64


In [12]:
# Aggregate maintenance features by equipment
# Named aggregation ties every output column to its source column and
# function, instead of relying on the positional order of a flattened
# MultiIndex.
PREVENTIVE_TYPE_ID = 1  # the type_id value that is counted as preventive maintenance

maintenance_agg = (
    maintenance.groupby('equipment_id')
    .agg(
        maintenance_count=('record_id', 'count'),
        total_maintenance_cost=('total_cost', 'sum'),
        avg_maintenance_cost=('total_cost', 'mean'),
        std_maintenance_cost=('total_cost', 'std'),
        total_downtime=('downtime_hours', 'sum'),
        avg_downtime=('downtime_hours', 'mean'),
        preventive_count=('type_id', lambda ids: (ids == PREVENTIVE_TYPE_ID).sum()),
    )
    .reset_index()
)

# Calculate preventive maintenance ratio
maintenance_agg['preventive_ratio'] = (
    maintenance_agg['preventive_count'] / maintenance_agg['maintenance_count']
)

# Merge with equipment (left join: equipment without any maintenance record
# is kept and gets NaN aggregates).
equipment_with_maint = equipment.merge(maintenance_agg, on='equipment_id', how='left')

# Replace missing numeric values with 0. Only numeric columns are filled: a
# blanket fillna(0) would also write the integer 0 into date and text columns.
numeric_cols = equipment_with_maint.select_dtypes(include='number').columns
equipment_with_maint[numeric_cols] = equipment_with_maint[numeric_cols].fillna(0)

# Maintenance frequency (events per year)
# The denominator is floored at one year so equipment with age 0 does not
# produce inf/NaN.
years_in_service = equipment_with_maint['age'].clip(lower=1)
equipment_with_maint['maintenance_frequency'] = (
    equipment_with_maint['maintenance_count'] / years_in_service
)

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Maintenance history features created")
print("\nMaintenance statistics:")
print(equipment_with_maint[['maintenance_count', 'preventive_ratio', 'maintenance_frequency']].describe())

✅ Maintenance history features created

Maintenance statistics:
       maintenance_count  preventive_ratio  maintenance_frequency
count         100.000000        100.000000             100.000000
mean           20.930000          0.357875               2.579245
std            14.263222          0.179877               1.127871
min             0.000000          0.000000               0.000000
25%            10.000000          0.258152               2.000000
50%            21.000000          0.382784               2.654762
75%            28.000000          0.473732               3.314286
max            72.000000          0.750000               5.538462


In [13]:
# Aggregate failure features by equipment
# Named aggregation ties every output column to its source column and
# function, instead of relying on the positional order of a flattened
# MultiIndex.
failure_agg = (
    failures.groupby('equipment_id')
    .agg(
        failure_count=('failure_id', 'count'),
        total_failure_cost=('repair_cost', 'sum'),
        avg_failure_cost=('repair_cost', 'mean'),
        total_failure_downtime=('downtime_hours', 'sum'),
        avg_failure_downtime=('downtime_hours', 'mean'),
        critical_failure_count=('severity', lambda sev: (sev == 'Critical').sum()),
        preventable_failure_count=('prevented_by_maintenance', 'sum'),
    )
    .reset_index()
)

# Merge with equipment (left join: equipment that never failed is kept and
# gets NaN aggregates).
equipment_full = equipment_with_maint.merge(failure_agg, on='equipment_id', how='left')

# Replace missing numeric values with 0. Only numeric columns are filled: a
# blanket fillna(0) would also write the integer 0 into date and text columns.
numeric_cols = equipment_full.select_dtypes(include='number').columns
equipment_full[numeric_cols] = equipment_full[numeric_cols].fillna(0)

operating_hours = equipment_full['operating_hours']
failure_count = equipment_full['failure_count']

# Calculate failure rate (failures per 1000 operating hours)
# Rows with zero operating hours have no defined rate. They previously became
# inf/NaN; they are now reported as 0.
equipment_full['failure_rate'] = (
    failure_count / operating_hours.where(operating_hours > 0) * 1000
).fillna(0)

# MTBF (Mean Time Between Failures)
# Operating hours per failure. Equipment with no recorded failure gets a
# fixed sentinel value instead of a division by zero.
MTBF_NO_FAILURE = 10000
equipment_full['mtbf'] = (
    operating_hours / failure_count.where(failure_count > 0)
).fillna(MTBF_NO_FAILURE)

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Failure history features created")
print("\nFailure statistics:")
print(equipment_full[['failure_count', 'failure_rate', 'mtbf']].describe())

✅ Failure history features created

Failure statistics:
       failure_count  failure_rate          mtbf
count     100.000000    100.000000    100.000000
mean        6.560000      1.891573   1633.047046
std         5.047892      1.668454   2695.867612
min         0.000000      0.000000    138.111111
25%         3.000000      0.799697    421.125000
50%         6.000000      1.263668    791.500000
75%         9.000000      2.381705   1251.000000
max        25.000000      7.240547  10000.000000


In [14]:
def calculate_health_score(row):
    """Combine age, failure and maintenance indicators into a health score.

    Starting from 100, the score loses up to 30 points for age (2 per year),
    up to 25 for failure rate (5 per unit of `failure_rate`) and up to 20 for
    critical failures (5 each). It gains `15 * preventive_ratio` and up to
    10 points for MTBF (1 per 1000). The result is clamped to [0, 100].

    `row` is any mapping-like object exposing the keys 'age', 'failure_rate',
    'preventive_ratio', 'critical_failure_count' and 'mtbf'.
    """
    age_penalty = min(row['age'] * 2, 30)
    failure_penalty = min(row['failure_rate'] * 5, 25)
    preventive_bonus = row['preventive_ratio'] * 15
    critical_penalty = min(row['critical_failure_count'] * 5, 20)
    mtbf_bonus = min(row['mtbf'] / 1000, 10)

    # Terms are applied in the same left-to-right order as sequential
    # `score -= ...` / `score += ...` updates starting from 100.
    raw_score = (100 - age_penalty - failure_penalty
                 + preventive_bonus - critical_penalty + mtbf_bonus)

    return max(0, min(100, raw_score))

# Score every equipment row: axis=1 hands each row to calculate_health_score,
# and the returned values become the new 'health_score' column.
equipment_full['health_score'] = equipment_full.apply(calculate_health_score, axis=1)

# Health category
def categorize_health(score):
    """Map a health score to its category label.

    Scores of at least 80 are 'Excellent', at least 60 'Good' and at least
    40 'Fair'. Any value that reaches none of these floors is 'Poor'.
    """
    lower_bounds = ((80, 'Excellent'), (60, 'Good'), (40, 'Fair'))
    for floor, label in lower_bounds:
        if score >= floor:
            return label
    return 'Poor'

equipment_full['health_category'] = equipment_full['health_score'].apply(categorize_health)

# Report the score distribution and the size of each health category.
# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Health score calculated")
print("\nHealth score distribution:")
print(equipment_full['health_score'].describe())
print("\nHealth categories:")
print(equipment_full['health_category'].value_counts())

✅ Health score calculated

Health score distribution:
count    100.000000
mean      79.710205
std       13.468730
min       50.335978
25%       68.082324
50%       81.834120
75%       88.706417
max      100.000000
Name: health_score, dtype: float64

Health categories:
health_category
Excellent    59
Good         33
Fair          8
Name: count, dtype: int64


In [15]:
# Target: Will fail in next 30 days (Binary Classification)
# Using recent failure history as proxy
#
# NOTE(review): the column is named `will_fail_30d`, but the label marks
# equipment with at least one failure in the last 90 days of the failure log.
# The window is kept at 90 days to preserve the existing target; confirm
# which horizon is intended.
# NOTE(review): the failure aggregates (failure_count, failure_rate, mtbf) are
# computed over the whole `failures` table, which includes this same window.
# Features and label therefore overlap in time.
RECENT_WINDOW_DAYS = 90

last_date = failures['failure_date'].max()
window_start = last_date - timedelta(days=RECENT_WINDOW_DAYS)

# Ids of equipment with at least one failure strictly after the window start.
# One vectorized membership test replaces a Python loop that re-filtered the
# failures table once per equipment row.
recently_failed_ids = (
    failures.loc[failures['failure_date'] > window_start, 'equipment_id']
    .dropna()
    .unique()
)
equipment_full['will_fail_30d'] = (
    equipment_full['equipment_id'].isin(recently_failed_ids).astype('int64')
)

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Failure prediction target created")
print("\nTarget distribution:")
print(equipment_full['will_fail_30d'].value_counts())
print(f"\nFailure rate: {equipment_full['will_fail_30d'].mean()*100:.1f}%")

✅ Failure prediction target created

Target distribution:
will_fail_30d
0    72
1    28
Name: count, dtype: int64

Failure rate: 28.0%


In [16]:
# Target: Remaining Useful Life (RUL) in hours

def estimate_rul(row, expected_life=5000, mtbf_fraction=0.8):
    """Estimate remaining useful life in hours.

    Parameters
    ----------
    row : mapping-like
        Must expose 'failure_count', 'operating_hours' and 'mtbf'.
    expected_life : number, default 5000
        Assumed total life, in operating hours, of equipment that has never
        failed.
    mtbf_fraction : number, default 0.8
        Share of the MTBF used as the estimate for equipment that has
        already failed at least once.

    Returns
    -------
    number
        `expected_life - operating_hours` when there is no recorded failure,
        otherwise `mtbf * mtbf_fraction`. Never below 0.
    """
    if row['failure_count'] == 0:
        return max(0, expected_life - row['operating_hours'])
    return max(0, row['mtbf'] * mtbf_fraction)

# Build the regression target: one RUL estimate per equipment row.
equipment_full['rul_hours'] = equipment_full.apply(estimate_rul, axis=1)

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ RUL target created")
print("\nRUL statistics:")
print(equipment_full['rul_hours'].describe())

✅ RUL target created

RUL statistics:
count     100.000000
mean      939.477637
std      1046.051716
min       110.488889
25%       336.900000
50%       633.200000
75%      1000.800000
max      4753.000000
Name: rul_hours, dtype: float64


In [17]:
# Target: Remaining Useful Life (RUL) in hours
#
# This cell was a verbatim copy of the previous one: it re-defined
# `estimate_rul` (silently shadowing the first definition) and recomputed the
# identical `rul_hours` column. The duplicate definition is dropped. The cell
# now only validates the target built by the previous cell and reports its
# statistics.
assert 'rul_hours' in equipment_full.columns, "run the RUL target cell first"
assert (equipment_full['rul_hours'] >= 0).all(), "rul_hours must be non-negative"

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ RUL target created")
print("\nRUL statistics:")
print(equipment_full['rul_hours'].describe())

✅ RUL target created

RUL statistics:
count     100.000000
mean      939.477637
std      1046.051716
min       110.488889
25%       336.900000
50%       633.200000
75%      1000.800000
max      4753.000000
Name: rul_hours, dtype: float64


In [18]:
# Label encoding for categorical variables
# NOTE: LabelEncoder is already imported in the imports cell at the top of the
# notebook; this repeated import is kept so the cell still runs on its own.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, keyed by column name, so the integer codes
# can be traced back to the original labels later.
label_encoders = {}
categorical_cols = ['equipment_type', 'brand', 'location', 'age_group',
                    'usage_category', 'health_category']

for col in categorical_cols:
    le = LabelEncoder()
    # The encoded values go into a new '<col>_encoded' column; the original
    # text column is left untouched.
    equipment_full[f'{col}_encoded'] = le.fit_transform(equipment_full[col])
    label_encoders[col] = le

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Categorical variables encoded")
print(f"\nEncoded columns created: {len(categorical_cols)}")

✅ Categorical variables encoded

Encoded columns created: 6


In [19]:
# Define feature sets for each ML task

# Features for Failure Prediction (Classification)
# NOTE(review): failure_count, failure_rate and mtbf are aggregated over the
# whole `failures` table, including the recent window from which the
# `will_fail_30d` label is derived. These inputs therefore already contain
# the events the label describes.
failure_prediction_features = [
    'age', 'operating_hours', 'usage_intensity',
    'maintenance_count', 'preventive_ratio', 'maintenance_frequency',
    'failure_count', 'failure_rate', 'mtbf',
    'health_score', 'critical_failure_count',
    'equipment_type_encoded', 'age_group_encoded', 'usage_category_encoded'
]

# Features for RUL Estimation (Regression)
# NOTE(review): `rul_hours` is computed directly from failure_count,
# operating_hours and mtbf, all of which are listed below. The target is a
# fixed function of its own inputs.
rul_features = [
    'age', 'operating_hours', 'usage_intensity',
    'maintenance_count', 'preventive_ratio',
    'failure_count', 'failure_rate', 'mtbf',
    'avg_failure_cost', 'health_score',
    'equipment_type_encoded', 'usage_category_encoded'
]

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Feature sets defined")
print(f"\nFailure Prediction: {len(failure_prediction_features)} features")
print(f"RUL Estimation: {len(rul_features)} features")

✅ Feature sets defined

Failure Prediction: 14 features
RUL Estimation: 12 features


In [20]:
# Split for Failure Prediction
X_failure = equipment_full[failure_prediction_features]
y_failure = equipment_full['will_fail_30d']

# 80/20 split with a fixed seed; stratify on the label so both parts receive
# the same class labels as input to the stratification.
X_train_fail, X_test_fail, y_train_fail, y_test_fail = train_test_split(
    X_failure, y_failure, test_size=0.2, random_state=42, stratify=y_failure
)

# Split for RUL Estimation
X_rul = equipment_full[rul_features]
y_rul = equipment_full['rul_hours']

# Same 80/20 proportion and seed; no stratification for the continuous target.
X_train_rul, X_test_rul, y_train_rul, y_test_rul = train_test_split(
    X_rul, y_rul, test_size=0.2, random_state=42
)

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Train-test split completed")
print("\nFailure Prediction:")
print(f"   Training: {len(X_train_fail)} samples")
print(f"   Test: {len(X_test_fail)} samples")
print("\nRUL Estimation:")
print(f"   Training: {len(X_train_rul)} samples")
print(f"   Test: {len(X_test_rul)} samples")

✅ Train-test split completed

Failure Prediction:
   Training: 80 samples
   Test: 20 samples

RUL Estimation:
   Training: 80 samples
   Test: 20 samples


In [21]:
# Save processed data for modeling
# Create the output folder first: to_csv does not create missing directories,
# so the cell failed on a fresh checkout where data/processed/ does not exist.
PROCESSED_DIR = Path('../data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

equipment_full.to_csv(PROCESSED_DIR / 'equipment_features.csv', index=False)
X_train_fail.to_csv(PROCESSED_DIR / 'X_train_failure.csv', index=False)
X_test_fail.to_csv(PROCESSED_DIR / 'X_test_failure.csv', index=False)
y_train_fail.to_csv(PROCESSED_DIR / 'y_train_failure.csv', index=False)
y_test_fail.to_csv(PROCESSED_DIR / 'y_test_failure.csv', index=False)
# NOTE(review): the RUL split (X_train_rul, X_test_rul, y_train_rul,
# y_test_rul) is not written to disk here - confirm whether that is intended.

# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Processed data saved")
print("\nFiles created in data/processed/:")
print("   - equipment_features.csv")
print("   - X_train_failure.csv, X_test_failure.csv")
print("   - y_train_failure.csv, y_test_failure.csv")

✅ Processed data saved

Files created in data/processed/:
   - equipment_features.csv
   - X_train_failure.csv, X_test_failure.csv
   - y_train_failure.csv, y_test_failure.csv


In [22]:
# Final summary of the engineered dataset and the two targets.
# Emoji restored from mojibake (UTF-8 bytes decoded as cp1252).
print("="*80)
print("FEATURE ENGINEERING COMPLETE!")
print("="*80)

print("\n📊 Final Dataset:")
print(f"   Total equipment: {len(equipment_full)}")
# Counts every column of equipment_full, not only model inputs.
print(f"   Total features: {len(equipment_full.columns)}")
print(f"   ML-ready features: {len(failure_prediction_features)}")

print("\n🎯 Targets Created:")
print(f"   Failure prediction (binary): {equipment_full['will_fail_30d'].value_counts().to_dict()}")
print(f"   RUL estimation (continuous): {equipment_full['rul_hours'].describe()['mean']:.0f} hours avg")

print("\n✅ Ready for Model Development!")
print("   Next: Build failure prediction model")

FEATURE ENGINEERING COMPLETE!

📊 Final Dataset:
   Total equipment: 100
   Total features: 46
   ML-ready features: 14

🎯 Targets Created:
   Failure prediction (binary): {0: 72, 1: 28}
   RUL estimation (continuous): 939 hours avg

✅ Ready for Model Development!
   Next: Build failure prediction model
