Modeling Notebook:

# Loading Packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Loading Data

In [None]:
# Location of the work-orders CSV extract on the local machine.
# (Alternate location used previously:
#  C:/Users/johne/Downloads/IWC_Work_Orders_Extract.csv)
file_path = r"C:/Users/matt/Downloads/IWC_Work_Orders_Extract.csv"

# Load the entire extract into a pandas DataFrame in one read.
df = pd.read_csv(filepath_or_buffer=file_path)

# Glimpse of Data

In [None]:
# Quick sanity check of the load: a caption line followed by the first 5 rows.
print("Head of the DataFrame:", df.head(), sep="\n")

# Creating new variables:

In [None]:
# --- Time between maintenance events ---------------------------------------
# Parse the execution start timestamps; anything that cannot be parsed is
# coerced to NaT instead of raising.
df['EXECUTION_START_DATE'] = pd.to_datetime(
    df['EXECUTION_START_DATE'],
    errors='coerce',
)

# Rows without a usable start date cannot be placed on the timeline, so they
# are removed from df (in place).
df.dropna(subset=['EXECUTION_START_DATE'], inplace=True)

# Order the events chronologically within each piece of equipment (in place).
df.sort_values(['EQUIPMENT_ID', 'EXECUTION_START_DATE'], inplace=True)

# Gap between each event and the previous event of the same equipment.
# The first event of every equipment has no predecessor, so its gap is NaN.
start_date_gaps = df.groupby('EQUIPMENT_ID')['EXECUTION_START_DATE'].diff()
df['TIME_BETWEEN_MAINTENANCE'] = start_date_gaps.dt.days  # whole days

# Show a caption and the first 5 rows to verify the new column.
print("Head of the DataFrame with Time Between Maintenance:", df.head(), sep="\n")

# Model

In [None]:
# Predictive Model Section
# Goal: for each maintenance event, predict the number of days until the NEXT
# maintenance event of the same equipment.

# The first event of each equipment has no previous event, so its
# 'TIME_BETWEEN_MAINTENANCE' is NaN; drop those rows from df (in place).
df.dropna(subset=['TIME_BETWEEN_MAINTENANCE'], inplace=True)

# Selecting features for the predictive model
features = ['TIME_BETWEEN_MAINTENANCE', 'EQUIPMENT_ID', 'ACTUAL_WORK_IN_MINUTES']

# Target is the next 'TIME_BETWEEN_MAINTENANCE' value of the SAME equipment.
# The shift is done within each EQUIPMENT_ID group: shifting over the whole
# frame would label the last event of one equipment with the first interval of
# the next equipment in sort order. This relies on df still being sorted by
# EQUIPMENT_ID and EXECUTION_START_DATE, as done when the interval was computed.
next_interval = df.groupby('EQUIPMENT_ID')['TIME_BETWEEN_MAINTENANCE'].shift(-1)

# The last event of each equipment has no following interval (NaN target).
# Select X and y with the same boolean mask so they stay row-aligned.
has_target = next_interval.notna()
X = df.loc[has_target, features]
y = next_interval[has_target]

# NOTE(review): 'ACTUAL_WORK_IN_MINUTES' is passed through unchanged. If it
# contains missing values, some scikit-learn versions reject them at fit time
# -- confirm against the data and impute/drop if needed.

# Encoding categorical features ('EQUIPMENT_ID') as one indicator column per ID
X = pd.get_dummies(X, columns=['EQUIPMENT_ID'])

# Splitting the data into training and testing sets (80/20, fixed seed)
# NOTE(review): this is a random split over time-ordered events; consider a
# chronological split if future-to-past leakage is a concern.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling Features Section
# The scaler is fitted on the training split only and then applied to the test
# split, so test-set statistics never influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training a Predictive Model to Estimate Maintenance Time Section
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Model Evaluation Section
# Mean squared error on the held-out split, in squared days.
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error for Predictive Model: {mse}')

# Average Maintenance Time per Equipment ID Section
# Historical mean of the observed intervals (not a model prediction).
avg_maintenance_time = df.groupby('EQUIPMENT_ID')['TIME_BETWEEN_MAINTENANCE'].mean().reset_index()

# Visualization Section
plt.figure(figsize=(12, 6))
sns.barplot(x='EQUIPMENT_ID', y='TIME_BETWEEN_MAINTENANCE', data=avg_maintenance_time)
plt.xlabel('Equipment ID')
plt.ylabel('Average Time Between Maintenance (Days)')
plt.title('Average Maintenance Time per Equipment ID')
plt.xticks(rotation=90)  # equipment IDs are long; rotate so they do not overlap
plt.tight_layout()
plt.show()

# Glimpse of Data Section
# Display the first 5 rows of the DataFrame (after the NaN-interval rows were dropped)
print("Head of the DataFrame:")
print(df.head())