In [11]:
# ======================================
# 🔍 Predictive Maintenance Analysis (AI4I 2020)
# ======================================

In [12]:
# 📦 Import Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# 📊 Settings
# Global plot styling: seaborn "whitegrid" style at "talk" scale, white figure background.
sns.set(context="talk", style="whitegrid")
plt.rcParams.update({'figure.facecolor': 'white'})

# Saved figures are written under ./charts; create the folder up front
# (exist_ok avoids an error when it is already there).
os.makedirs("charts", exist_ok=True)

In [15]:
# ======================================
# 🧹 STEP 1: Load & Clean the Dataset
# ======================================

# Read the AI4I 2020 CSV from the current working directory
df = pd.read_csv("ai4i2020.csv")

# Confirm the load, report the frame's dimensions, then preview the first rows
print(
    "✅ Data loaded successfully.",
    f"Rows: {df.shape[0]}, Columns: {df.shape[1]}",
    sep="\n",
)
display(df.head())

✅ Data loaded successfully.
Rows: 10000, Columns: 14


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [16]:
# Normalise column labels: trim outer whitespace, turn spaces into underscores
# and drop square brackets (e.g. "Air temperature [K]" -> "Air_temperature_K").
_label_fixups = str.maketrans({" ": "_", "[": None, "]": None})
df.columns = [label.strip().translate(_label_fixups) for label in df.columns]

In [17]:
# Sensor columns that must be numeric for the analysis
num_cols = [
    'Air_temperature_K',
    'Process_temperature_K',
    'Rotational_speed_rpm',
    'Torque_Nm',
    'Tool_wear_min',
]

# Coerce each column; values that cannot be parsed become NaN instead of raising
for sensor_col in num_cols:
    df[sensor_col] = pd.to_numeric(df[sensor_col], errors='coerce')

In [18]:
# ======================================
# ⚙️ STEP 2: Feature Engineering
# ======================================

In [19]:
# Gap between process temperature and air temperature (both columns are in kelvin)
df['Temp_Diff'] = df['Process_temperature_K'].sub(df['Air_temperature_K'])

In [20]:
# Hand-weighted severity score: 0.3*torque + 0.1*rotational speed + 0.6*tool wear.
# NOTE(review): the inputs are combined on their raw scales, so on average the
# rpm term is the largest contributor despite having the smallest weight —
# confirm this is intended, or scale the inputs before weighting.
torque_part = df['Torque_Nm'] * 0.3
speed_part = df['Rotational_speed_rpm'] * 0.1
wear_part = df['Tool_wear_min'] * 0.6
df['Severity_Score'] = torque_part + speed_part + wear_part

In [21]:
# Air condition category.
# Fixed edges at 295 K and 305 K put every row into 'Moderate': the air
# temperatures in this dataset only span 295.3–304.5 K, which leaves 'Cold'
# and 'Hot' permanently empty and makes any per-category comparison
# meaningless. Tercile binning keeps the same three ordered labels but makes
# them relative to the observed distribution (lowest / middle / highest third
# of the readings), so each category holds roughly the same number of rows.
df['Air_Condition'] = pd.qcut(
    df['Air_temperature_K'],
    q=3,
    labels=['Cold','Moderate','Hot']
)

In [22]:
# Keep only rows with a value in every sensor column and with strictly
# positive torque and rotational speed.
df = (
    df.dropna(subset=num_cols)
      .loc[lambda frame: frame['Torque_Nm'].gt(0) & frame['Rotational_speed_rpm'].gt(0)]
)

In [23]:
print("✅ Data cleaned & enriched successfully.")

# Summary statistics for the numeric columns, engineered features included
summary_stats = df.describe()
display(summary_stats)

✅ Data cleaned & enriched successfully.


Unnamed: 0,UDI,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,Machine_failure,TWF,HDF,PWF,OSF,RNF,Temp_Diff,Severity_Score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019,10.00063,230.644283
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355,1.001094,41.167979
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,142.74
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0,9.3,197.2325
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,229.935
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,263.7125
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0,12.1,413.24


In [24]:
# ======================================
# 📈 STEP 3: Exploratory Data Analysis
# ======================================

In [25]:
# Overall share of rows flagged as a machine failure, expressed as a percentage
failure_rate = 100 * df['Machine_failure'].mean()
print(f"⚙️ Machine Failure Rate: {failure_rate:.2f}%")

⚙️ Machine Failure Rate: 3.39%


In [26]:
# Percentage of failed rows within each product Type
failures_by_type = df.groupby('Type')['Machine_failure'].mean().mul(100)

print("\nFailure rate by Product Type:")
display(failures_by_type)


Failure rate by Product Type:


Type
H    2.093719
L    3.916667
M    2.769436
Name: Machine_failure, dtype: float64

In [27]:
# Descriptive statistics of Temp_Diff, split by the failure flag
temp_diff_by_failure = df.groupby('Machine_failure')[['Temp_Diff']].describe()
display(temp_diff_by_failure)

Unnamed: 0_level_0,Temp_Diff,Temp_Diff,Temp_Diff,Temp_Diff,Temp_Diff,Temp_Diff,Temp_Diff,Temp_Diff
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Machine_failure,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,9661.0,10.021571,0.988422,7.6,9.3,9.8,11.0,12.1
1,339.0,9.403835,1.164445,7.6,8.4,9.3,10.5,12.0


In [28]:
# ======================================
# 📊 STEP 4: Data Visualization
# ======================================

In [29]:
def save_chart(title, out_dir="charts"):
    """Title the current matplotlib figure, save it as a PNG, and close it.

    Parameters
    ----------
    title : str
        Text set as the title of the current axes. It also determines the
        file name: spaces become underscores, the result is lower-cased,
        and ".png" is appended.
    out_dir : str, optional
        Folder the PNG is written to. Defaults to "charts"; it is created
        if it does not exist yet.

    Notes
    -----
    The current figure is closed after it has been written.
    """
    plt.title(title)
    plt.tight_layout()

    stem = title.replace(' ', '_').lower()
    # A path separator inside a title would otherwise be read as a sub-folder.
    stem = stem.replace('/', '_').replace('\\', '_')

    # Don't rely on an earlier cell having created the output folder.
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, f"{stem}.png"), bbox_inches='tight')
    plt.close()

In [30]:
# Pie chart: share of failed vs. non-failed rows
slice_sizes = [failure_rate, 100-failure_rate]
plt.pie(
    slice_sizes,
    labels=['Failure','No Failure'],
    colors=['#ff6666','#66b3ff'],
    autopct='%1.1f%%',
)
save_chart("Machine Failure Rate")

In [32]:
# Bar chart: failure rate (%) for each product type
type_ax = sns.barplot(
    x=failures_by_type.index,
    y=failures_by_type.values,
    hue=failures_by_type.index,
    palette='crest',
)
type_ax.set_ylabel("Failure Rate (%)")
save_chart("Failure Rate by Product Type")

In [34]:
# Box plot: Temp_Diff for non-failed (0) vs. failed (1) rows
temp_ax = sns.boxplot(
    data=df,
    x='Machine_failure',
    y='Temp_Diff',
    hue='Machine_failure',
    palette='coolwarm',
)
temp_ax.set(
    xlabel="Machine Failure (0=No, 1=Yes)",
    ylabel="Temperature Difference (K)",
)
save_chart("Temperature Difference vs Machine Failure")

In [36]:
# Side-by-side box plots of torque and rotational speed, split by failure flag
fig, ax = plt.subplots(1,2, figsize=(12,5))

# One entry per panel: (column to plot, colour palette, panel title)
panels = [
    ('Torque_Nm', 'Blues', "Torque vs Failure"),
    ('Rotational_speed_rpm', 'Reds', "RPM vs Failure"),
]
for panel_ax, (column, panel_palette, panel_title) in zip(ax, panels):
    sns.boxplot(
        data=df,
        x='Machine_failure',
        y=column,
        hue='Machine_failure',
        palette=panel_palette,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)

# This figure carries per-panel titles, so it is written directly
# under its own fixed file name.
fig.tight_layout()
fig.savefig("charts/Torque_RPM_vs_Failure.png", bbox_inches='tight')
plt.close(fig)

In [37]:
# Tool-wear histogram (30 bins) with a KDE overlay, coloured by failure flag
sns.histplot(
    data=df,
    x='Tool_wear_min',
    hue='Machine_failure',
    bins=30,
    kde=True,
    palette='viridis',
)
save_chart("Tool Wear Distribution by Failure")

In [38]:
# Pairwise correlations between the sensor readings, the failure flag
# and the two engineered features
corr_columns = [
    'Air_temperature_K',
    'Process_temperature_K',
    'Rotational_speed_rpm',
    'Torque_Nm',
    'Tool_wear_min',
    'Machine_failure',
    'Temp_Diff',
    'Severity_Score',
]
corr = df[corr_columns].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
save_chart("Correlation Heatmap")

In [40]:
# Total of each failure-type flag column, largest first
types = ['TWF','HDF','PWF','OSF','RNF']
sums = df[types].sum().sort_values(ascending=False)

mode_ax = sns.barplot(
    x=sums.index,
    y=sums.values,
    hue=sums.index,
    palette='magma',
)
mode_ax.set_ylabel("Count")
save_chart("Failure Type Distribution")

In [41]:
# Scatter of torque against Temp_Diff, coloured by failure flag
sns.scatterplot(
    data=df,
    x='Torque_Nm',
    y='Temp_Diff',
    hue='Machine_failure',
    alpha=0.7,
    palette='Set2',
)
save_chart("Torque vs Temperature Difference")

In [42]:
# Histogram of the severity score with a KDE overlay
severity_values = df['Severity_Score']
sns.histplot(severity_values, color='teal', kde=True)
save_chart("Severity Score Distribution")

In [44]:
# Mean severity score per product type, highest first
avg = df.groupby('Type')['Severity_Score'].mean().sort_values(ascending=False)

severity_ax = sns.barplot(
    x=avg.index,
    y=avg.values,
    hue=avg.index,
    palette='flare',
)
severity_ax.set_ylabel("Avg Severity Score")
save_chart("Average Severity Score by Product Type")

In [46]:
# Environmental Conditions vs Failures
# Air_Condition is a categorical column, and grouping on it without an explicit
# `observed` argument makes pandas emit a FutureWarning about the changing
# default. observed=False keeps the current result: every category gets a row,
# including categories that contain no data.
summary = df.groupby('Air_Condition', observed=False)['Machine_failure'].agg(['count','sum'])

# A category with no rows has count 0 and sum 0, so its rate evaluates to NaN.
summary['Failure_Rate'] = 100 * summary['sum'] / summary['count']

sns.barplot(x=summary.index, y=summary['Failure_Rate'],hue=summary.index ,palette='rocket')
plt.ylabel("Failure Rate (%)")
save_chart("Failure Rate by Ambient Temperature")

  summary = df.groupby('Air_Condition')['Machine_failure'].agg(['count','sum'])


In [47]:
# Extra: scatter of rotational speed against torque, coloured by failure flag
sns.scatterplot(
    data=df,
    x='Rotational_speed_rpm',
    y='Torque_Nm',
    hue='Machine_failure',
    alpha=0.6,
)
save_chart("RPM vs Torque (Colored by Failure)")

In [49]:
# Box plot: severity score for non-failed (0) vs. failed (1) rows
sns.boxplot(
    data=df,
    x='Machine_failure',
    y='Severity_Score',
    hue='Machine_failure',
    palette='coolwarm',
)
save_chart("Severity Score by Failure Status")