***Biggest and most accurate winner of the BMW hack***

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('train.csv')

# Step 1: Data Preprocessing
# Convert timestamp to datetime
df['message_timestamp'] = pd.to_datetime(df['message_timestamp'])

# Extract useful time-based features
df['hour'] = df['message_timestamp'].dt.hour
df['day_of_week'] = df['message_timestamp'].dt.dayofweek
df['shift'] = df['hour'].apply(lambda x: 'Night' if x < 6 else 'Day' if x < 14 else 'Evening')

# One-hot encode categorical features like shift and car part
df = pd.get_dummies(df, columns=['shift', 'physical_part_type'], drop_first=True)

# Drop columns that won't be useful for the model
df.drop(columns=['message_timestamp', 'physical_part_id', 'weekday'], inplace=True)

# Step 2: Feature and Target Selection
X = df.drop(columns=['status'])  # Assuming 'quality_label' is the target column
y = df['status'].apply(lambda x: 1 if x == 'OK' else 0)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
feature_importances = pd.Series(importances, index=X.columns)
feature_importances.sort_values(ascending=False, inplace=True)
print("Feature Importances:\n", feature_importances)

# Optional: Visualize the feature importances
import matplotlib.pyplot as plt
feature_importances.plot(kind='bar')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Predicting Quality')
plt.show()


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [4]:
import pandas as pd

data = pd.read_csv("train.csv")
data['message_timestamp']

0        2024-09-01 22:05:44
1        2024-09-01 22:07:29
2        2024-09-01 22:07:55
3        2024-09-01 22:08:38
4        2024-09-01 22:08:56
                ...         
44813    2024-11-25 00:50:58
44814    2024-11-25 00:53:59
44815    2024-11-25 00:55:16
44816    2024-11-25 00:56:10
44817    2024-11-25 00:57:50
Name: message_timestamp, Length: 44818, dtype: object

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from dowhy import CausalModel
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('train.csv')

# Step 1: Data Preprocessing
# Convert message_timestamp to datetime
df['message_timestamp'] = pd.to_datetime(df['message_timestamp'])

# Extract useful time-based features
df['hour'] = df['message_timestamp'].dt.hour
df['day_of_week'] = df['message_timestamp'].dt.dayofweek
df['shift'] = df['hour'].apply(lambda x: 'Night' if x < 6 else 'Day' if x < 14 else 'Evening')

# One-hot encode categorical features like shift and car part
df = pd.get_dummies(df, columns=['shift', 'physical_part_type'], drop_first=True)

# Drop columns that won't be useful for the model
df.drop(columns=['message_timestamp', 'physical_part_id', 'weekday'], inplace=True)

# Step 2: Feature and Target Selection
X = df.drop(columns=['status'])  # Assuming 'quality_label' is the target column
y = df['status'].apply(lambda x: 1 if x == 'OK' else 0)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Imputer
from sklearn.impute import SimpleImputer

# Create an imputer object with strategy set to 'mean' (replace with median or other)
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data
imputer.fit(X_train)

# Transform both training and testing data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# ---------------------------------------

# Step 4: Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
feature_importances = pd.Series(importances, index=X.columns)
feature_importances.sort_values(ascending=False, inplace=True)
print("Feature Importances:\n", feature_importances)

# Optional: Visualize the feature importances
feature_importances.plot(kind='bar')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Predicting Quality')
plt.show()

# Step 7: Causal Discovery using DoWhy
# Define the causal model
causal_model = CausalModel(
    data=df,
    treatment='hour',  # Example treatment variable
    outcome='status'
)

# Identify the causal effect
identified_estimand = causal_model.identify_effect()
print("Identified Estimand:\n", identified_estimand)

# Estimate the causal effect
causal_estimate = causal_model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching"
)
print("Causal Estimate:\n", causal_estimate)

# Refutation to test the robustness of the estimate
refutation = causal_model.refute_estimate(
    identified_estimand, causal_estimate,
    method_name="placebo_treatment_refuter"
)
print("Refutation:\n", refutation)


 's0_sensor15_kilonewton_step1' 's2_sensor4_millimeter_step1'
 's2_sensor5_millimeter_step1' 's0_sensor26_millimeter_step1'
 's2_sensor8_millimeter_step1' 's3_sensor1_meter_step1'
 's4_sensor10_kilonewton_step2' 's0_sensor35_kilonewton_step1'
 's4_sensor12_millimeter_step2' 's2_sensor15_millimeter_step1'
 's2_sensor16_millimeter_step1' 's4_sensor13_kilonewton_step2'
 's0_sensor53_millimeter_step1' 's2_sensor22_millimeter_step1'
 's2_sensor23_millimeter_step1']. At least one non-missing value is needed for imputation with strategy='mean'.
 's0_sensor15_kilonewton_step1' 's2_sensor4_millimeter_step1'
 's2_sensor5_millimeter_step1' 's0_sensor26_millimeter_step1'
 's2_sensor8_millimeter_step1' 's3_sensor1_meter_step1'
 's4_sensor10_kilonewton_step2' 's0_sensor35_kilonewton_step1'
 's4_sensor12_millimeter_step2' 's2_sensor15_millimeter_step1'
 's2_sensor16_millimeter_step1' 's4_sensor13_kilonewton_step2'
 's0_sensor53_millimeter_step1' 's2_sensor22_millimeter_step1'
 's2_sensor23_millimeter

Accuracy: 0.9502454261490406
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.05      0.10       466
           1       0.95      1.00      0.97      8498

    accuracy                           0.95      8964
   macro avg       0.90      0.53      0.54      8964
weighted avg       0.95      0.95      0.93      8964



ValueError: Length of values (359) does not match length of index (376)

In [None]:
# Updated

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from dowhy import CausalModel
import matplotlib.pyplot as plt


# Load the dataset
df = pd.read_csv('train.csv')

# Step 1: Data Preprocessing
# Convert message_timestamp to datetime
df['message_timestamp'] = pd.to_datetime(df['message_timestamp'])

# Extract useful time-based features
df['hour'] = df['message_timestamp'].dt.hour
df['day_of_week'] = df['message_timestamp'].dt.dayofweek
df['shift'] = df['hour'].apply(lambda x: 'Night' if x < 6 else 'Day' if x < 14 else 'Evening')

# One-hot encode categorical features like shift and car part
df = pd.get_dummies(df, columns=['shift', 'physical_part_type'], drop_first=True)

# Drop columns that won't be useful for the model
df.drop(columns=['message_timestamp', 'physical_part_id', 'weekday'], inplace=True)

# Step 2: Feature and Target Selection
X = df.drop(columns=['status'])  # Assuming 'quality_label' is the target column
y = df['status'].apply(lambda x: 1 if x == 'OK' else 0)

# Class IMBALANCE PROBLEM
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Model with class weight adjustment
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_res, y_train_res)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_res, y_train_res)

# Best model after tuning
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# # Step 3: Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Imputer
# from sklearn.impute import SimpleImputer

# # Create an imputer object with strategy set to 'mean' (replace with median or other)
# imputer = SimpleImputer(strategy='mean')

# # Fit the imputer on the training data
# imputer.fit(X_train)

# # Transform both training and testing data
# X_train = imputer.transform(X_train)
# X_test = imputer.transform(X_test)

# # ---------------------------------------

# # Step 4: Model Training
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# # Step 5: Predictions
# y_pred = model.predict(X_test)

# # Step 6: Evaluation
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

# # Feature Importance
# importances = model.feature_importances_
# feature_importances = pd.Series(importances, index=X.columns)
# feature_importances.sort_values(ascending=False, inplace=True)
# print("Feature Importances:\n", feature_importances)

# # Optional: Visualize the feature importances
# feature_importances.plot(kind='bar')
# plt.xlabel('Features')
# plt.ylabel('Importance')
# plt.title('Feature Importance in Predicting Quality')
# plt.show()

# # Step 7: Causal Discovery using DoWhy
# # Define the causal model
# causal_model = CausalModel(
#     data=df,
#     treatment='hour',  # Example treatment variable
#     outcome='status'
# )

# # Identify the causal effect
# identified_estimand = causal_model.identify_effect()
# print("Identified Estimand:\n", identified_estimand)

# # Estimate the causal effect
# causal_estimate = causal_model.estimate_effect(
#     identified_estimand,
#     method_name="backdoor.propensity_score_matching"
# )
# print("Causal Estimate:\n", causal_estimate)

# # Refutation to test the robustness of the estimate
# refutation = causal_model.refute_estimate(
#     identified_estimand, causal_estimate,
#     method_name="placebo_treatment_refuter"
# )
# print("Refutation:\n", refutation)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Accuracy: 0.9489067380633646
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.05      0.10       466
           1       0.95      1.00      0.97      8498

    accuracy                           0.95      8964
   macro avg       0.77      0.53      0.54      8964
weighted avg       0.93      0.95      0.93      8964

