In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
import pandas as pd
import joblib

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first two rows to check which columns were read
df.head(2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


In [4]:
# Target is the 'price' column; the predictors are every other column except 'id'
y = df['price']
X = df.drop(['price','id'], axis=1)

In [5]:
# Split the predictor columns by dtype: numeric ones get scaled later,
# object (string) ones get one-hot encoded
numeric_features = list(X.select_dtypes(include=['number']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

In [6]:
# Inspect which columns were detected as numeric
numeric_features

['carat', 'depth', 'table', 'x', 'y', 'z']

In [7]:
# Inspect which columns were detected as categorical (object dtype)
categorical_features

['cut', 'color', 'clarity']

In [8]:
# Standardize the numeric columns and one-hot encode the categorical ones.
# drop='first' omits one indicator column per categorical feature.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Each entry is (name, transformer, columns it applies to)
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [9]:
# Chain preprocessing and the regressor so both are fitted and applied together
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
model_pipeline = Pipeline(pipeline_steps)

In [10]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Train the preprocessing steps and the regressor on the training split only
model_pipeline.fit(X_train, y=y_train)

In [12]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact
joblib.dump(value=model_pipeline, filename='model_pipeline.joblib')

['model_pipeline.joblib']

In [13]:
# Display the training features (pandas elides the middle rows of a long frame)
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
83475,0.32,Premium,E,SI1,61.6,58.0,4.38,4.41,2.71
160324,1.20,Premium,F,VS2,62.6,57.0,6.81,6.76,4.25
101740,1.50,Ideal,I,VS2,62.2,55.0,7.30,7.26,4.53
180341,1.67,Premium,I,SI2,61.9,59.0,7.65,7.61,4.71
48480,1.00,Good,H,VS2,63.7,60.0,6.34,6.30,4.02
...,...,...,...,...,...,...,...,...,...
119879,0.50,Very Good,E,SI1,60.2,61.0,5.11,5.15,3.09
103694,1.91,Very Good,F,SI1,62.3,62.0,7.85,7.79,4.87
131932,1.22,Premium,G,VS2,62.8,58.0,6.82,6.74,4.26
146867,0.31,Very Good,G,VVS1,61.1,56.0,4.37,4.40,2.67


In [14]:
# Read the saved pipeline back from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_pipeline = joblib.load(filename='model_pipeline.joblib')

In [15]:
# Confirm the object read back from disk is a Pipeline
pipeline_type = type(loaded_pipeline)
print(pipeline_type)

<class 'sklearn.pipeline.Pipeline'>


In [16]:
# List the step names stored in the reloaded pipeline
step_names = loaded_pipeline.named_steps.keys()
print(step_names)

dict_keys(['preprocessor', 'model'])


In [17]:
# Build a one-row frame holding a single example to score; the keys match the
# predictor column names used for training
example_record = {
    'carat': 0.3,
    'cut': 'Good',
    'color': 'G',
    'clarity': 'VS2',
    'depth': 61.5,
    'table': 57.0,
    'x': 4.0,
    'y': 4.0,
    'z': 2.5,
}
feature = pd.DataFrame([example_record])

In [19]:
# Show the example row as plain text (head returns at most the first 5 rows)
print(feature.head(n=5))

   carat   cut color clarity  depth  table    x    y    z
0    0.3  Good     G     VS2   61.5   57.0  4.0  4.0  2.5


In [20]:
# Show the dtype of each column in the example frame
column_dtypes = feature.dtypes
print(column_dtypes)

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object


In [21]:
# Inspect the preprocessing step stored inside the reloaded pipeline.
# A distinct name is used so the `preprocessor` built earlier in the notebook is
# not rebound: re-running the pipeline-construction cell after this one would
# otherwise wrap the transformer object that belongs to `loaded_pipeline`.
loaded_preprocessor = loaded_pipeline.named_steps['preprocessor']
print(loaded_preprocessor)

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat', OneHotEncoder(drop='first'),
                                 ['cut', 'color', 'clarity'])])


In [22]:
# Score the single example row with the reloaded pipeline
prediction = loaded_pipeline.predict(feature)

In [23]:
# Show the predicted value (an array with one entry per input row)
prediction

array([1085.63502283])