In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/data.csv')

In [3]:
# Preview the first rows to check the columns and their value types.
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# Remove the `id` column (it looks like a plain row counter in the preview)
# so it is not used as a feature.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='id', errors='ignore')

In [5]:
# Features: every column except the last one (the last column is `price`
# in the preview above). The selection is positional, so it relies on the
# target staying in the final position.
X = df.iloc[:,:-1]

In [6]:
# Target: the last column. The list indexer `[-1]` keeps `y` as a
# one-column DataFrame instead of collapsing it to a Series.
y = df.iloc[:,[-1]]

In [7]:
# Columns routed to the numeric pipeline (median imputation + scaling).
# Selecting numeric dtypes explicitly, rather than "anything that is not
# object", guarantees that only columns those steps can process are picked,
# whatever dtype the text columns are stored with.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
numerical_cols

['carat', 'depth', 'table', 'x', 'y', 'z']

In [8]:
# Columns routed to the categorical pipeline (mode imputation + ordinal
# encoding). Selecting "everything that is not numeric" does not depend on
# text columns being stored with the object dtype specifically.
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()
categorical_cols

['cut', 'color', 'clarity']

In [9]:
# Custom ranking for each ordinal variable.
# These lists are handed to OrdinalEncoder(categories=...), which encodes each
# value as its position in the list, so the order below is the encoding order.
# NOTE(review): the intended direction (low-to-high vs high-to-low grade) is
# not stated anywhere in the notebook - confirm it is what you want per column.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [10]:
# Numeric features: fill missing values with the column median, then scale.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent label,
# encode each label as its rank in the matching category list, then scale
# the resulting codes. The category lists are given in the order
# cut, color, clarity, which must match the order of the columns fed in.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories],
        ),
    ),
    ('scaler', StandardScaler()),
])



In [11]:
# Route each group of columns through its own pipeline. The transformer
# names become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', numerical_pipeline, numerical_cols),
        ('categorical_pipeline', categorical_pipeline, categorical_cols),
    ]
)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [13]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split, so no test-set statistics influence training.
# The original row index is carried over: without it the frames would be
# renumbered from 0 and no longer line up with y_train / y_test in any
# index-aligned pandas operation.
# NOTE: this cell overwrites X_train / X_test with the transformed frames, so
# re-run the split cell before re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [14]:
# Inspect the transformed training features (scaled numeric columns followed
# by the scaled ordinal codes).
X_train

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.844417,0.168517,-1.161714,-0.861366,-0.880279,-0.851571,0.872832,-0.937070,-0.647863
1,-0.153743,-0.847856,-0.119350,0.039220,0.071402,-0.006798,0.872832,-1.552691,0.018795
2,0.774350,0.722902,-0.640532,0.912787,0.878065,0.954496,0.872832,-1.552691,-1.314520
3,-1.060252,0.260914,-1.682896,-1.266629,-1.251887,-1.230263,0.872832,0.294171,2.018768
4,0.644849,-0.201073,-0.119350,0.822729,0.859938,0.823410,0.872832,-0.321450,-1.314520
...,...,...,...,...,...,...,...,...,...
145174,-0.822833,0.907697,-0.119350,-0.897389,-0.889342,-0.822441,-1.142722,-1.552691,0.685453
145175,-0.844417,0.076119,-1.161714,-0.888383,-0.880279,-0.866136,0.872832,-0.321450,2.018768
145176,-0.887584,0.353312,-0.640532,-0.969436,-0.952788,-0.938962,0.872832,-0.321450,2.018768
145177,-0.153743,0.168517,-1.265950,0.012202,0.035148,0.036897,0.872832,-0.321450,0.018795


In [15]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [16]:
# Candidate regressors, all with default hyper-parameters, keyed by class name.
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (LinearRegression, Lasso, Ridge, ElasticNet)
}

In [17]:
def evaluate_model(models,X_train,X_test,y_train,y_test):
    """Fit every candidate model and report the one with the highest test R^2.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``predict(X)``. Each estimator is fitted in place.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the targets the
        predictions are scored against with ``r2_score``.

    Returns
    -------
    str
        ``'Best Model Name:<name>,Best Model Score : <r2>'`` for the
        highest-scoring model. On a tie, the model listed first wins.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if not models:
        # Fail with an explicit message instead of the opaque error max()
        # would raise on an empty sequence (same exception type).
        raise ValueError('models must contain at least one estimator')

    reports = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        reports[name] = r2_score(y_test, y_pred)

    # max() keeps the first maximal entry, so ties resolve to the model that
    # appears first in `models`.
    best_model_name = max(reports, key=reports.get)
    best_model_score = reports[best_model_name]

    return (f'Best Model Name:{best_model_name},Best Model Score : {best_model_score}')



In [18]:
# Fit all candidate models and keep the summary line for the best one.
a = evaluate_model(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


In [19]:
# Show the best model's name and its R^2 on the held-out split.
a

'Best Model Name:Lasso,Best Model Score : 0.9370467381780303'