# 0 Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# 1 Importing Data

In [2]:
df = pd.read_csv('output/clean_data.csv')

# 2 Splitting Data

In [3]:
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=0)

# 3 Scaling Data

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

# 4 Training simple Model

In [6]:
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train_scaled)

# 5 Predicting

In [7]:
y_pred = model.predict(X_test_scaled)

# 6 Revert Scaling

In [8]:
y_pred = scaler.inverse_transform(y_pred)

# 6 Evaluating simple Model

In [9]:
print("Root mean squared error: %.2f" % root_mean_squared_error(y_test, y_pred))

Root mean squared error: 24714.59


# Looking for the best model

In [10]:
import itertools
import sys

In [13]:
best_rmse = None
best_combination = None

for combination in range(1, len(X_train.columns)+1):
    for subset in itertools.combinations(X_train.columns, combination):
        model = LinearRegression()
        model.fit(X_train[list(subset)], y_train)
        y_pred = model.predict(X_test[list(subset)])
        rmse = root_mean_squared_error(y_test, y_pred)

        if rmse < sys.float_info.max:
            best_rmse = rmse
            best_combination = subset

In [14]:
print("Best combination=", best_combination)
print("RMSE=", best_rmse)

Best combination= ('carat', 'clarity', 'color', 'cut', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table')
RMSE= 24714.590691104437
