In [3]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest, f_regression

In [5]:
# Locations of the training and test CSV files (relative to the working directory).
path_train, path_test = "train_diamants.csv", "test_diamants.csv"

In [6]:
# Load both data sets with pandas' default CSV settings (train first, then test).
df_train, df_test = map(pd.read_csv, (path_train, path_test))

In [7]:
# Model training block

In [8]:
def sort_dimensions_desc(frame, columns=('x', 'y', 'z')):
    """Sort the given columns within every row so values descend left to right.

    After the call the first listed column holds each row's largest value and
    the last listed column the smallest. The frame is modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing all of ``columns``; the columns must be numeric.
    columns : sequence of str
        Columns whose values are reordered row by row.

    Returns
    -------
    None
        Nothing is returned, so the notebook cell produces no output.
    """
    cols = list(columns)
    values = frame[cols].to_numpy()
    # np.sort only sorts ascending: sorting the negated values and negating the
    # result gives a descending order in one vectorized pass and keeps any NaN
    # in the last column (Python's sorted() has no defined order with NaN).
    frame[cols] = -np.sort(-values, axis=1)


# The same reordering is applied to both data sets.
sort_dimensions_desc(df_train)
sort_dimensions_desc(df_test)

In [9]:
columns_to_fill = ['x', 'y', 'z']

# Zeros in the dimension columns are treated as missing values: they are
# turned into NaN so that the imputer below replaces them.
df_train[columns_to_fill] = df_train[columns_to_fill].replace(0, np.nan)
df_test[columns_to_fill] = df_test[columns_to_fill].replace(0, np.nan)

# The imputer is fitted on the training data only and then reused for the test
# data, so both sets are filled from the same neighbours and no statistics are
# learned from the test set.
knn_imputer = KNNImputer(n_neighbors=10)
df_train[columns_to_fill] = knn_imputer.fit_transform(df_train[columns_to_fill])
df_test[columns_to_fill] = knn_imputer.transform(df_test[columns_to_fill])

In [10]:
threshold = 20

# Any x above the threshold is divided by 10.
# NOTE(review): presumably these are measurements with a misplaced decimal
# point - confirm against the raw data.
# The rule is applied to both data sets so that the test features go through
# the same preprocessing as the features the model is trained on.
for frame in (df_train, df_test):
    frame.loc[frame['x'] > threshold, 'x'] /= 10

In [11]:
def _depth_from_dimensions(frame):
    """Return 200 * z / (x + y) computed row by row from the frame's columns."""
    return 200 * frame['z'] / (frame['x'] + frame['y'])


# Discard the supplied `depth` column and rebuild it from the current x, y, z
# values; dropping first and re-adding places `depth` as the last column.
df_train = df_train.drop(columns=['depth']).assign(depth=_depth_from_dimensions)
df_test = df_test.drop(columns=['depth']).assign(depth=_depth_from_dimensions)

In [12]:
# Ordinal encodings: each category label is replaced by its 1-based position
# in the lists below.
cut_mapping = {
    label: rank
    for rank, label in enumerate(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1)
}
color_mapping = {
    label: rank
    for rank, label in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'], start=1)
}
clarity_mapping = {
    label: rank
    for rank, label in enumerate(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1)
}

ordinal_maps = {'cut': cut_mapping, 'color': color_mapping, 'clarity': clarity_mapping}

# Encode the three categorical columns in both data sets. Series.map turns any
# label that is missing from its mapping into NaN.
for frame in (df_train, df_test):
    for column, mapping in ordinal_maps.items():
        frame[column] = frame[column].map(mapping)

In [13]:
# Target: the `price` column of the training frame.
y_train = df_train['price']
# Training features: every remaining column of the training frame.
X_train = df_train.drop(columns=['price'])
# The test frame is used as the feature matrix as-is (same object, not a copy).
X_test = df_test

In [14]:
# Steps run in order: scale -> polynomial expansion -> feature selection -> KNN.
pipeline_steps = [
    # Rescale every feature to the [0, 1] range.
    ('scaler', MinMaxScaler()),
    # With degree=1 and no bias column this expansion adds no new terms.
    ('poly_features', PolynomialFeatures(include_bias=False, interaction_only=True, degree=1)),
    # Keep the 9 features with the highest univariate F-score against the target.
    ('select_k', SelectKBest(k=9, score_func=f_regression)),
    # 10 nearest neighbours, Manhattan distance (p=1), inverse-distance weights.
    ('knn', KNeighborsRegressor(p=1, weights='distance', n_neighbors=10)),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline.fit(X_train, y_train)

In [15]:
# Prediction block using the test set

In [16]:
# Predict prices for the test features with the fitted pipeline.
y_predict = pipeline.predict(X_test)

In [18]:
# Vector of predicted values y_predict obtained from the test set
# (the former `y_predict = y_predict` self-assignment was a no-op and is removed).
y_predict

array([9264.28874526, 1575.95780034, 2794.89455242, ..., 2428.10206862,
       1544.78001572, 2002.13000985])