<a href="https://colab.research.google.com/github/mounishwaran/gemstone_price_detection/blob/main/gemstone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: prompts for a file upload and keeps the result in
# `uploaded`. The training cell below looks files up in it by filename
# (`uploaded[fname]`) and reads the stored content as CSV bytes.
from google.colab import files
uploaded = files.upload()


Saving gemstone.csv to gemstone.csv


In [None]:
import io, json, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Load the uploaded CSV ---------------------------------------------------
# `uploaded` is produced by the upload cell above; only the first uploaded
# file is used, and its content is parsed as CSV from an in-memory buffer.
fname = next(iter(uploaded.keys()))
print("Using uploaded file:", fname)

df = pd.read_csv(io.BytesIO(uploaded[fname]))
print("Data shape:", df.shape)

# --- Select features and target ----------------------------------------------
TARGET_COL = "price"
numeric_cols = ["carat", "depth", "table", "x", "y", "z"]
categorical_cols = ["cut", "color", "clarity"]
# Single source of truth for the column order used both for training and in
# the metadata file read back at inference time.
feature_cols = numeric_cols + categorical_cols

# Rows without a target cannot be used for supervised training.
df = df.dropna(subset=[TARGET_COL])
X = df[feature_cols]
y = df[TARGET_COL]

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Preprocessing + model ---------------------------------------------------
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" lets prediction accept categories that were
# not present in the training split.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

pipeline.fit(X_train, y_train)

# --- Evaluate on the held-out split ------------------------------------------
y_pred = pipeline.predict(X_val)
print("MSE:", mean_squared_error(y_val, y_pred))
print("R2 :", r2_score(y_val, y_pred))

# --- Persist model and metadata ----------------------------------------------
joblib.dump(pipeline, "gemstone_pipeline.joblib")

# Use a context manager so the metadata file is flushed and closed before any
# later cell tries to read it back.
with open("model_meta.json", "w") as meta_file:
    json.dump({
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "feature_order": feature_cols
    }, meta_file)

print("Model saved successfully!")


Using uploaded file: gemstone.csv
Data shape: (193573, 11)
MSE: 892386.3633987491
R2 : 0.9447766216795184
Model saved successfully!


In [None]:
import pandas as pd
import joblib, json

# Reload the artifacts written by the training cell so this cell exercises the
# saved model files rather than whatever is still in kernel memory.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
pipeline = joblib.load("gemstone_pipeline.joblib")
with open("model_meta.json") as meta_file:  # closed deterministically after reading
    meta = json.load(meta_file)
expected = meta["feature_order"]

# Hand-written sample rows to score.
data = [
    {"carat": 1.52, "cut": "Premium", "color": "F", "clarity": "VS2",
     "depth": 62.2, "table": 58.0, "x": 7.27, "y": 7.33, "z": 4.55},
    {"carat": 0.70, "cut": "Ideal", "color": "G", "clarity": "VS1",
     "depth": 61.2, "table": 57.0, "x": 5.69, "y": 5.73, "z": 3.50},
    {"carat": 2.03, "cut": "Very Good", "color": "J", "clarity": "SI2",
     "depth": 62.0, "table": 58.0, "x": 8.06, "y": 8.12, "z": 5.05}
]

df = pd.DataFrame(data)

# Add any feature column the sample rows omit. Fill with float NaN rather than
# pd.NA: NaN is the default missing-value marker of the pipeline's
# SimpleImputer steps, whereas pd.NA would produce an object-dtype column that
# the numeric (median) imputer cannot process.
for col in expected:
    if col not in df.columns:
        df[col] = float("nan")

# Select columns in the saved order so they line up with what the pipeline
# was fitted on.
preds = pipeline.predict(df[expected])

df["predicted_price"] = preds
df["predicted_price_rounded"] = df["predicted_price"].round(2)

print("Number of input rows:", len(df))
print("Number of predictions:", len(preds))
df


Number of input rows: 3
Number of predictions: 3


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,predicted_price,predicted_price_rounded
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,11378.395017,11378.4
1,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,3534.268509,3534.27
2,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13360.305967,13360.31
