In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
path_train = "train_diamants.csv"

In [3]:
df = pd.read_csv(path_train)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.55,Ideal,I,SI1,61.7,55.0,1134,5.29,5.31,3.27
1,0.94,Premium,E,SI1,62.4,56.0,4580,6.27,6.24,3.90
2,0.33,Premium,E,SI2,60.0,59.0,594,4.49,4.47,2.69
3,0.31,Ideal,D,VS2,59.4,56.0,879,4.45,4.40,2.63
4,2.01,Very Good,H,SI1,62.8,59.0,17759,7.99,8.04,5.03
...,...,...,...,...,...,...,...,...,...,...
40450,1.50,Very Good,D,SI1,60.7,62.0,11442,7.30,7.33,4.44
40451,1.51,Ideal,D,SI1,61.9,57.0,11834,7.35,7.42,4.57
40452,0.42,Ideal,G,SI2,60.9,56.0,971,4.87,4.89,2.97
40453,2.05,Premium,E,SI2,59.7,59.0,17237,8.39,8.27,4.97


In [4]:
print(df.info())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.1+ MB
None


Unnamed: 0,carat,depth,table,price,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797227,61.746219,57.451551,3938.457274,5.728667,5.73272,3.53753
std,0.47444,1.420742,2.230915,3998.851904,1.123597,1.153284,0.70956
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.69,5.71,3.53
75%,1.04,62.5,59.0,5324.5,6.54,6.53,4.03
max,5.01,78.2,95.0,18818.0,10.74,58.9,31.8


## Поработаем с данными

#### Отсортируем значения длины, ширины и глубины бриллианта (`x`>`y`>`z`)

In [5]:
df[['x', 'y', 'z']] = pd.DataFrame(
    df[['x', 'y', 'z']].apply(lambda row: sorted(row, reverse=True), axis=1).to_list(), 
    index=df.index
)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.55,Ideal,I,SI1,61.7,55.0,1134,5.31,5.29,3.27
1,0.94,Premium,E,SI1,62.4,56.0,4580,6.27,6.24,3.9
2,0.33,Premium,E,SI2,60.0,59.0,594,4.49,4.47,2.69
3,0.31,Ideal,D,VS2,59.4,56.0,879,4.45,4.4,2.63
4,2.01,Very Good,H,SI1,62.8,59.0,17759,8.04,7.99,5.03


#### Заполним нули

In [6]:
columns_to_fill = ['x', 'y', 'z']

df[columns_to_fill] = df[columns_to_fill].replace(0, np.nan)
knn_imputer = KNNImputer(n_neighbors=10)
df[columns_to_fill] = knn_imputer.fit_transform(df[columns_to_fill])

#### Исправим выбросы, установив порог и разделив все значения, его превышающие, на 10

In [7]:
threshold = 20 
df.loc[df['x'] > threshold, 'x'] /= 10

In [8]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797227,61.746219,57.451551,3938.457274,5.753756,5.707712,3.538239
std,0.47444,1.420742,2.230915,3998.851904,1.120958,1.111938,0.692549
min,0.2,43.0,43.0,326.0,3.18,3.71,1.07
25%,0.4,61.0,56.0,945.0,4.73,4.7,2.91
50%,0.7,61.8,57.0,2401.0,5.73,5.68,3.53
75%,1.04,62.5,59.0,5324.5,6.56,6.51,4.03
max,5.01,78.2,95.0,18818.0,10.74,10.54,8.06


#### попробуем заново пересчитать столбец depth, в нем тоже были ошибки

In [9]:
df = df.drop(columns=['depth'])

In [10]:
df['depth'] = 200 * df['z'] / (df['x'] + df['y'])

#### Закодируем переменные для дальнейшей обработки


In [11]:
cut_mapping = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
color_mapping = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
clarity_mapping = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

df['cut'] = df['cut'].map(cut_mapping)
df['color'] = df['color'].map(color_mapping)
df['clarity'] = df['clarity'].map(clarity_mapping)
df.head()

Unnamed: 0,carat,cut,color,clarity,table,price,x,y,z,depth
0,0.55,5,2,3,55.0,1134,5.31,5.29,3.27,61.698113
1,0.94,4,6,3,56.0,4580,6.27,6.24,3.9,62.35012
2,0.33,4,6,2,59.0,594,4.49,4.47,2.69,60.044643
3,0.31,5,7,4,56.0,879,4.45,4.4,2.63,59.435028
4,2.01,3,3,3,59.0,17759,8.04,7.99,5.03,62.75733


#### Сплитуем

In [12]:
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Масштабируем данные для дальнейшего использования в моделях

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Теперь посмотрим модели регрессии

##### Кастомная функция irq mse

In [14]:
def irq_mse(y_true, y_pred):
    delta = y_pred - y_true
    Q25 = np.quantile(delta, 0.25)
    Q75 = np.quantile(delta, 0.75)
    irq = Q75-Q25
    mask = (delta<(Q25-1.5*irq)) | (delta>(Q75+1.5*irq))
    if (mask.sum()==0):
        res=0
    else: res = -sum((delta[mask])**2) / mask.sum()
    return res

### Линейная регрессия

Это простая модель, она нечувствительна к выбросам, мультимодальности данных и плохо работает с нелинейными зависимостями, так что, она не учтет сложные нелинейные зависимости между характеристиками бриллиантов и их ценой. Зная, что нам важны в первую очередь выбросы, не будем терять время. 

### SVR

In [15]:
svr_model = SVR(kernel='rbf', C=100.0, epsilon=0.01)
svr_model.fit(X_train_scaled, y_train)

y_pred = svr_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
custom_irq_mse = irq_mse(y_test, y_pred)

print("Mean squared error (MSE):", mse)
print("R^2 score:", r2)
print("irq mse:", custom_irq_mse)

Mean squared error (MSE): 578796.3464983962
R^2 score: 0.963140793917042
irq mse: -3644306.2399879117


### Ридж-регрессия

In [16]:
ridge_model = Ridge(alpha=0.01)
ridge_model.fit(X_train_scaled, y_train)
y_pred = ridge_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
custom_irq_mse = irq_mse(y_test, y_pred)

print("Mean squared error (MSE):", mse)
print("R^2 score:", r2)
print("irq mse:", custom_irq_mse)

Mean squared error (MSE): 1432594.6221139908
R^2 score: 0.9087687738022318
irq mse: -15717414.851228017


### Лассо-регрессия

In [17]:
lasso_model = Lasso(alpha=0.01, max_iter=10000)
lasso_model.fit(X_train_scaled, y_train)
y_pred = lasso_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
custom_irq_mse = irq_mse(y_test, y_pred)

print("Mean squared error (MSE):", mse)
print("R^2 score:", r2)
print("irq mse:", custom_irq_mse)

Mean squared error (MSE): 1432592.1167711653
R^2 score: 0.9087689333487599
irq mse: -15716952.856497832


### KNN

In [18]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('poly_features', PolynomialFeatures(degree=1, interaction_only=True, include_bias=False)),
    ('select_k', SelectKBest(score_func=f_regression, k=9)),
    ('knn', KNeighborsRegressor(n_neighbors=10, weights='distance', p=1))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
custom_irq_mse = irq_mse(y_test, y_pred)

print("Mean squared error (MSE):", mse)
print("R^2 score:", r2)
print("irq mse:", custom_irq_mse)

Mean squared error (MSE): 342461.1023017981
R^2 score: 0.978191216268892
irq mse: -2160860.403923631
