In [10]:
import pandas as pd

# Load the updated dataset
df = pd.read_csv("/content/Account_Age_Orders_Rating_Discount.csv")

# Step 1: Drop duplicates
df.drop_duplicates(inplace=True)

# Step 2: Handle missing values in numeric columns
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)

# Step 3: No categorical columns to encode

# Step 4: Show dataset summary
print("✅ Cleaned dataset shape:", df.shape)
print("🧼 Remaining missing values:\n", df.isnull().sum())


✅ Cleaned dataset shape: (9997, 4)
🧼 Remaining missing values:
 account_age_days    0
total_orders        0
product_rating      0
discount_percent    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the updated dataset
df = pd.read_csv("/content/Account_Age_Orders_Rating_Discount.csv")

# Features and Target
X = df.drop('discount_percent', axis=1)
y = df['discount_percent']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm shapes
print("✅ X_train shape:", X_train.shape)
print("✅ X_test shape:", X_test.shape)


✅ X_train shape: (8000, 3)
✅ X_test shape: (2000, 3)


In [13]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/content/Account_Age_Orders_Rating_Discount.csv")

# Split features and target
X = df.drop('discount_percent', axis=1)
y = df['discount_percent']

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create DMatrix from training and test sets
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # regression objective
    'eval_metric': 'rmse',
    'eta': 0.1,
    'max_depth': 6,
    'seed': 42
}

# Dictionary to store training history
evals_result = {}

# Train the model
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    early_stopping_rounds=10,
    evals_result=evals_result,
    verbose_eval=False
)

# Save model
model.save_model("xgb_discount_model.json")

# Save training history to CSV
history_df = pd.DataFrame({
    'train_rmse': evals_result['train']['rmse'],
    'eval_rmse': evals_result['eval']['rmse']
})
history_df.to_csv("training_history.csv", index_label="iteration")

print(f"✅ Model trained for {len(history_df)} iterations. Saved as 'xgb_discount_model.json'")


✅ Model trained for 200 iterations. Saved as 'xgb_discount_model.json'


In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Predict using the best iteration
y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 RMSE: {rmse:.2f}")
print(f"📊 MAE: {mae:.2f}")
print(f"📊 R² Score: {r2:.2f}")


📊 RMSE: 0.09
📊 MAE: 0.07
📊 R² Score: 1.00


In [15]:
import matplotlib.pyplot as plt

# Plot Train vs Eval RMSE
plt.figure(figsize=(8, 5))
plt.plot(history_df['train_rmse'], label='Train RMSE')
plt.plot(history_df['eval_rmse'], label='Eval RMSE')
plt.axvline(model.best_iteration, color='red', linestyle='--', label='Best Iteration')
plt.title("XGBoost Training History")
plt.xlabel("Boosting Round")
plt.ylabel("RMSE")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("training_history_plot.png")
plt.close()


In [16]:
# Scatter plot: Actual vs Predicted
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', edgecolors='white')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.xlabel("Actual Discount %")
plt.ylabel("Predicted Discount %")
plt.title("Actual vs Predicted Values")
plt.tight_layout()
plt.savefig("actual_vs_predicted_plot.png")
plt.close()


In [21]:
import xgboost as xgb
import pandas as pd

# Load trained model
model = xgb.Booster()
model.load_model("xgb_discount_model.json")

print("🎯 Discount Prediction System (type 'exit' to stop)\n")

while True:
    try:
        # Get user input
        age_input = input("Enter account_age_days (1–365): ")
        if age_input.lower() == 'exit':
            break
        orders_input = input("Enter total_orders (0–100): ")
        if orders_input.lower() == 'exit':
            break
        rating_input = input("Enter product_rating (1.0–5.0): ")
        if rating_input.lower() == 'exit':
            break

        # Convert to numeric
        age = int(age_input)
        orders = int(orders_input)
        rating = float(rating_input)

        # Optional: input validation
        if not (0 <= age <= 365):
            print("⚠️ Account age must be between 1 and 365.")
            continue
        if not (0 <= orders <= 100):
            print("⚠️ Total orders must be between 0 and 100.")
            continue
        if not (1.0 <= rating <= 5.0):
            print("⚠️ Product rating must be between 1.0 and 5.0.")
            continue

        # Create input DataFrame
        data = pd.DataFrame([{
            'account_age_days': age,
            'total_orders': orders,
            'product_rating': rating
        }])
        dmatrix = xgb.DMatrix(data)

        # Predict
        prediction = model.predict(dmatrix)[0]
        print(f"🧾 Predicted Discount: {round(prediction, 2)}%\n")

    except Exception as e:
        print(f"⚠️ Error: {e}\nPlease enter valid numeric values.\n")


🎯 Discount Prediction System (type 'exit' to stop)

Enter account_age_days (1–365): 0
Enter total_orders (0–100): 10
Enter product_rating (1.0–5.0): 4.5
🧾 Predicted Discount: 3.059999942779541%

Enter account_age_days (1–365): 0
Enter total_orders (0–100): 11
Enter product_rating (1.0–5.0): 4.5
🧾 Predicted Discount: 3.0999999046325684%



KeyboardInterrupt: Interrupted by user

In [24]:
import shutil
from google.colab import files

# Replace 'your_folder' with the name/path of your folder
folder_path = '/content/COLAB_XBOOST'
zip_name = 'your_folder.zip'

# Create ZIP file
shutil.make_archive('/content/COLAB', 'zip', folder_path)

# Download the ZIP file
files.download(zip_name)


FileNotFoundError: Cannot find file: your_folder.zip