In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
import numpy as np

In [2]:
# Read the training and test tables from CSV files in the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Columns that appear in both tables, with the row identifier 'id' taken out.
# list.index raises ValueError when 'id' is absent, exactly as list.remove would.
common_columns = train_data.columns.intersection(test_data.columns).tolist()
del common_columns[common_columns.index('id')]

In [4]:
# Restrict the test table to the shared feature columns, in common_columns order.
# NOTE: 'id' was removed from common_columns, so after this statement test_data
# no longer has an 'id' column.
test_data = test_data.reindex(common_columns, axis='columns')


In [5]:
# Mean imputer: NaN entries are replaced by the column mean learned when the imputer is fitted
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)


In [6]:
# Learn the per-column means from the training features only, then fill both tables
# with those same means so that no statistic is computed from the test set.
# NOTE(review): train_data_imputed is not referenced by the later cells visible here;
# the model is fit on train_data directly — confirm whether that is intended.
imputer.fit(train_data[common_columns])
train_data_imputed = imputer.transform(train_data[common_columns])
test_data_imputed = imputer.transform(test_data)


In [22]:
# Define the target and feature columns.
# The targets are listed first because feature_columns is derived from them; with the
# assignments in the opposite order a fresh kernel raises NameError on target_columns.
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Every training column that is neither the row id nor a target is a feature.
# Index.difference returns its labels sorted, so this order can differ from the
# column order of train_data.
feature_columns = train_data.columns.difference(['id'] + target_columns).tolist()


In [23]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    train_data.loc[:, feature_columns],
    train_data.loc[:, target_columns],
    random_state=42,
    test_size=0.2,
)

In [24]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
# y_train carries one column per entry of target_columns, so all targets are fit at once.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
# fit() returns the estimator itself; ending the cell with it keeps the same cell output.
model

In [25]:
# Predict class probabilities for the validation rows.
# The model was fit on several target columns, so the result is consumed per target
# later on (one probability array per entry of target_columns).
# NOTE(review): relies on scikit-learn's multi-output predict_proba returning a list
# of (n_rows, n_classes) arrays — confirm against the installed version's docs.
val_predictions = model.predict_proba(X_val)


In [31]:
# ROC AUC per defect category on the validation split.
# val_predictions is walked in step with target_columns; column 1 of each probability
# array is used as the positive-class score, as in the original indexing.
auc_scores = {
    target: roc_auc_score(y_val[target], class_probs[:, 1])
    for target, class_probs in zip(target_columns, val_predictions)
}

# auc_scores now maps each target name to its validation AUC

In [32]:
# Unweighted mean of the per-category AUC values
average_auc = np.fromiter(auc_scores.values(), dtype=float).mean()

In [33]:
# Create a DataFrame for the imputed test data with the correct column names.
# The imputed array keeps the column order it was given, i.e. common_columns, so it is
# labelled with common_columns first. feature_columns comes from Index.difference,
# which sorts its labels; using it directly as the header would attach the wrong name
# to each column and the model would silently predict on permuted features.
# Selecting feature_columns afterwards puts the columns in the order the model was fit on.
test_data_imputed_df = pd.DataFrame(test_data_imputed, columns=common_columns)[feature_columns]

In [34]:
# Predict class probabilities for the test rows.
# The result is later iterated as one probability array per target, with column 1
# taken as the positive class.
test_predictions = model.predict_proba(test_data_imputed_df)

In [35]:
# Join the per-target probability arrays side by side (along the column axis).
# Each per-target array contributes all of its class columns, not just the positive class.
test_predictions_array = np.concatenate(test_predictions, axis=1)

In [36]:
# Build the (n_rows, n_targets) matrix of positive-class probabilities, one column per
# defect category.
# Reshaping the side-by-side stack of per-class probabilities with reshape(-1, 7) does
# not do this: it interleaves negative- and positive-class values and changes the row
# count. Taking column 1 of each per-target array gives the intended layout.
# Assumes each target is binary so that column 1 is the positive class — TODO confirm.
test_predictions_reshaped = np.column_stack([probs[:, 1] for probs in test_predictions])


In [43]:
# Build and save the submission: one row per test id, one positive-class probability
# per defect category.
# test_predictions is treated as a list with one probability array per target; column 1
# is taken as the positive class (assumes binary targets — TODO confirm).
test_predictions_array = np.column_stack([probs[:, 1] for probs in test_predictions])

# test_data was reindexed to the feature columns earlier and no longer has an 'id'
# column, so test_data['id'] raises KeyError on a fresh top-to-bottom run. The ids are
# read back from the source file instead; the predictions are in the same row order
# because only the columns of test_data were ever changed.
test_ids = pd.read_csv('test.csv', usecols=['id'])['id']

# Ensure the number of prediction rows matches the number of IDs
if len(test_ids) != test_predictions_array.shape[0]:
    raise ValueError(f"Number of rows in predictions ({test_predictions_array.shape[0]}) does not match number of IDs ({len(test_ids)}).")

# Create the submission DataFrame; ids are inserted as a plain array so that no index
# alignment can reorder or blank them.
submission_df = pd.DataFrame(test_predictions_array, columns=target_columns)
submission_df.insert(0, 'id', test_ids.to_numpy())

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [41]:

# Report the mean validation AUC across all defect categories
summary_line = f'Average AUC: {average_auc}'
print(summary_line)

Average AUC: 0.8726609590070139
