In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

In [91]:
# Step 1: Load the dataset
file_path = '/content/dataset.csv'
data = pd.read_csv(file_path)


In [92]:
# Step 2: Handle missing values
data.fillna('Unknown', inplace=True)

In [93]:
# Step 3: Encode categorical columns
categorical_columns = ['type', 'pack_size_label', 'short_composition1', 'short_composition2']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [94]:
# Step 4: Select features and target
features = ['price(₹)', 'manufacturer_name', 'type', 'pack_size_label', 'short_composition1']
target = 'Is_discontinued'

In [95]:
X = data[features]
y = data[target]

In [96]:
# Step 5: Scale the price feature for modeling
scaler = StandardScaler()
X['scaled_price'] = scaler.fit_transform(X[['price(₹)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['scaled_price'] = scaler.fit_transform(X[['price(₹)']])


In [97]:
# Step 6: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
# Step 7: Train the Random Forest model with class weights
rf_model_with_weights = RandomForestClassifier(
    random_state=42, n_estimators=100, max_depth=10, class_weight="balanced"
)
rf_model_with_weights.fit(
    X_train[['scaled_price', 'type', 'pack_size_label', 'short_composition1']],
    y_train
)


In [99]:
# Step 8: Make predictions on the test set
y_pred_with_weights = rf_model_with_weights.predict(
    X_test[['scaled_price', 'type', 'pack_size_label', 'short_composition1']]
)
y_prob_with_weights = rf_model_with_weights.predict_proba(
    X_test[['scaled_price', 'type', 'pack_size_label', 'short_composition1']]
)[:, 1]


In [100]:
# Step 9: Evaluate the model
roc_auc_with_weights = roc_auc_score(y_test, y_prob_with_weights)
classification_rep_with_weights = classification_report(y_test, y_pred_with_weights)

In [101]:
# Print evaluation metrics
print("ROC-AUC Score:", roc_auc_with_weights)
print("Classification Report:\n", classification_rep_with_weights)

ROC-AUC Score: 0.6945533251262543
Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.78      0.87     49187
        True       0.07      0.48      0.12      1608

    accuracy                           0.77     50795
   macro avg       0.52      0.63      0.49     50795
weighted avg       0.95      0.77      0.84     50795



In [102]:
# Step 10: Save the predictions to a CSV file
X_test['predicted_class'] = y_pred_with_weights
X_test['predicted_prob'] = y_prob_with_weights

In [104]:
# Include original price and manufacturer name in the output
output_columns = ['price(₹)', 'manufacturer_name', 'predicted_class', 'predicted_prob']
predictions_corrected_with_weights = X_test[output_columns]

# Save the file locally in Colab
output_file_path_colab = 'predictions_corrected_with_weights.csv'
predictions_corrected_with_weights.to_csv(output_file_path_colab, index=False)
print(f"Predictions saved locally in Colab as: {output_file_path_colab}")

Predictions saved locally in Colab as: predictions_corrected_with_weights.csv
