In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Load the training table from the local pickle, then preview its first five rows.
df = pd.read_pickle(r'D:\Nik\Paper1\Data\complete_dataset.pkl')
df.head()

Unnamed: 0,Lat,Lon,Month,SSS,SST,z,c
0,-4.5883,-28.9983,7,36.036,27.405,0.0,1540.75481
1,-4.5883,-28.9983,7,36.036,27.405,1.988939,1540.785954
2,-4.5883,-28.9983,7,36.036,27.405,3.977859,1540.810581
3,-4.5883,-28.9983,7,36.036,27.405,5.96676,1540.840507
4,-4.5883,-28.9983,7,36.036,27.405,7.955641,1540.848169


In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA

# Numeric inputs: fill missing values with the column median, then standardise.
numerical_features = ['Lat', 'Lon', 'SSS', 'SST', 'z']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Month is handled as a category: fill missing values with the most frequent month,
# then one-hot encode (categories unseen during fit are encoded as all zeros).
categorical_features = ['Month']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the columns listed above are forwarded to the model.
# sparse_threshold=0 makes the combined output always dense: OneHotEncoder emits a
# sparse matrix, and PCA with a fractional n_components needs dense input, so this
# must not depend on ColumnTransformer's density heuristic.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Full pipeline: preprocessing -> PCA -> k-nearest-neighbours regressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('pca', PCA(n_components=0.90)),  # Retain 90% of variance
                           ('model', KNeighborsRegressor())])


In [3]:
# Hyper-parameter grid for the search. The 'model__' prefix targets the pipeline
# step named 'model'.
param_grid = {
    'model__n_neighbors': range(1, 6),  # Trying k from 1 to 5 (the range end is exclusive)
    # Optionally, add more parameters here, e.g., 'pca__n_components': [0.90, 0.95, 0.99]
}


In [4]:
# Separate the target column 'c' from the predictors; every other column stays in X.
y = df['c']
X = df.drop(columns='c')

In [7]:
from sklearn.model_selection import GridSearchCV

# 2-fold cross-validated search over param_grid. Candidates are ranked by negated
# MSE, so a score closer to zero is better.
# NOTE(review): the search is fitted on the full X / y, so a hold-out split drawn
# later from the same frame is not independent of the k chosen here.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=1,
)
grid_search.fit(X, y)


Fitting 2 folds for each of 5 candidates, totalling 10 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Lat',
                                                                          'Lon',
                                                                          'SSS',
                                                                          'SST',
                                                                          'z']),
   

In [8]:
# Split data into 'X' features and 'y' target label, hold out 20% of the rows,
# fit the pipeline on the remainder and score it on the hold-out.
X = df.drop('c', axis=1)
y = df['c']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test)

from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score
print(f"R^2: {r2_score(y_test, y_pred)}")
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6, while sqrt(MSE) works on every
# version and matches how the per-depth metrics are computed further down.
print(f"RMSE: {sqrt(mean_squared_error(y_test, y_pred))}")


R^2: 0.9977804124844103
RMSE: 1.0331704975478542


In [9]:
# Report the winning configuration of the already-fitted grid_search and its score
# (negated MSE, per the scoring passed to the search).
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

for label, value in (
    ("Best parameters found: ", best_parameters),
    ("Best score found: ", best_score),
):
    print(label, value)

# Best k (number of neighbors) for KNeighborsRegressor
best_k = best_parameters['model__n_neighbors']
print("Best k (number of neighbors): ", best_k)


Best parameters found:  {'model__n_neighbors': 5}
Best score found:  -5.231396878906357
Best k (number of neighbors):  5


In [None]:
# Load a second table from a local pickle. Later cells read its 'Latitude',
# 'Longitude', 'month', 'SSS', 'SST', 'Depth' and 'c_1' columns.
df1 = pd.read_pickle(r'D:\Nik\Hy\artifacts\data\2900.pkl')
# Bare name as the last expression: the frame is rendered as the cell output.
df1

In [None]:
def add_predictions(df, model=None):
    """Append model predictions to ``df`` as a ``c_knn`` column.

    The six input columns are selected from ``df`` and renamed to the feature
    names used by the training table before being handed to the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Latitude', 'Longitude', 'month', 'SSS', 'SST' and 'Depth'.
        It is modified in place: a 'c_knn' column is added or overwritten.
    model : object with a ``predict(X)`` method, optional
        Estimator used for the prediction. Defaults to the notebook-level
        ``pipeline``, which must already be fitted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'c_knn' column.

    Raises
    ------
    KeyError
        If any of the required input columns is missing from ``df``.
    """
    # Explicit source -> training-name mapping. A positional `X.columns = [...]`
    # assignment silently mislabels features if either list is reordered.
    feature_names = {
        'Latitude': 'Lat',
        'Longitude': 'Lon',
        'month': 'Month',
        'SSS': 'SSS',
        'SST': 'SST',
        'Depth': 'z',
    }
    X = df[list(feature_names)].rename(columns=feature_names)

    # The global pipeline is looked up only at call time, so the function can be
    # defined before the pipeline exists and used with any other fitted estimator.
    estimator = pipeline if model is None else model
    df['c_knn'] = estimator.predict(X)
    return df

In [None]:
# Adds the 'c_knn' column to df1 in place; the returned frame is shown as the cell output.
add_predictions(df1)

In [None]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
# Narrow df1 to the columns used for the per-depth comparison.
df_selected = df1[
    ["Depth", "Latitude", "Longitude", "c_1", "c_knn"]
]


def calculate_metrics(g):
    """Score the 'c_knn' column against the 'c_1' column for one group of rows.

    'c_1' is passed as the true values and 'c_knn' as the predictions.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single group; must contain 'c_1' and 'c_knn'.

    Returns
    -------
    pandas.Series
        Two entries: 'RMSE' (square root of the mean squared error) and 'R2'.
    """
    reference, predicted = g['c_1'], g['c_knn']
    scores = {
        'RMSE': sqrt(mean_squared_error(reference, predicted)),
        'R2': r2_score(reference, predicted),
    }
    return pd.Series(scores)

# Group by 'Depth' and compute the metrics for each depth value. Only the two columns
# that calculate_metrics reads are selected, so the grouping column itself is not
# passed into apply (pandas 2.2+ emits a DeprecationWarning when it is). The result
# is unchanged: one row per depth with 'RMSE' and 'R2' columns.
metrics_per_depth = df_selected.groupby('Depth')[['c_1', 'c_knn']].apply(calculate_metrics)

# Output location for the per-depth metrics (hard-coded, not derived from the .pkl name)
csv_file_name = r'D:\Nik\Hy\notebook\2900_knn.csv'

# Save the DataFrame to CSV; the depth values are written as the index column
metrics_per_depth.to_csv(csv_file_name)

print(f"Saved metrics to {csv_file_name}")

In [10]:
# Display the training frame as the cell output.
df

Unnamed: 0,Lat,Lon,Month,SSS,SST,z,c
0,-4.5883,-28.9983,07,36.036000,27.405000,0.000000,1540.754810
1,-4.5883,-28.9983,07,36.036000,27.405000,1.988939,1540.785954
2,-4.5883,-28.9983,07,36.036000,27.405000,3.977859,1540.810581
3,-4.5883,-28.9983,07,36.036000,27.405000,5.966760,1540.840507
4,-4.5883,-28.9983,07,36.036000,27.405000,7.955641,1540.848169
...,...,...,...,...,...,...,...
75309894,20.7500,-109.2500,11,34.307568,27.476866,2276.225098,1494.997638
75309895,-4.7500,-105.7500,04,34.715111,28.512173,722.400024,1486.226113
75309896,-16.7500,70.7500,05,34.884384,26.517609,1409.150024,1489.159069
75309897,26.7500,158.2500,07,35.064331,27.854837,2084.034912,1491.616522


In [11]:
# Write a 1% random subsample of df to a separate pickle.
# random_state pins the draw so re-running the cell exports the same rows; 42 is the
# same seed value used for train_test_split earlier in the notebook.
df.sample(frac=0.01, random_state=42).to_pickle(r'D:\Nik\Paper1\Data\complete_dataset_sample.pkl')