In [53]:
import pandas as pd

In [54]:
# Load the four competition CSVs in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train, test, extra, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/train.csv",
        "Data/test.csv",
        "Data/training_extra.csv",
        "Data/sample_submission.csv",
    )
)

In [55]:
# Stack the feature columns of train, test and extra (in that order) into one
# frame for imputation. The identifier and the target are dropped first; test
# has no "Price" column to drop. Row labels of the source frames are kept
# (no ignore_index), so labels repeat across the three parts.
merge4knn = pd.concat(
    [
        train.drop(columns=["id", "Price"]),
        test.drop(columns=["id"]),
        extra.drop(columns=["id", "Price"]),
    ]
)

In [56]:
# Preview the first rows of the combined feature frame.
merge4knn.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338


In [57]:
# Count missing values per column before any imputation.
merge4knn.isna().sum()

Brand                   132985
Material                116575
Size                     92166
Compartments                 0
Laptop Compartment      103495
Waterproof               99135
Style                   109333
Color                   140402
Weight Capacity (kg)      1885
dtype: int64

In [58]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knn_categorical_imputation(df, categorical_columns, k=5):
    """
    Fills missing categorical data using K-Nearest Neighbors (KNN) imputation.

    Columns are processed in the order given. For each column a KNN classifier
    is fitted on the rows where that column is present, using every other
    column of the frame as a feature, and its predictions replace the missing
    entries. Because the working copy is updated after each column, values
    imputed for earlier columns are used as features for later ones.

    Feature preprocessing (missing feature values are filled only inside the
    model pipeline, not in the returned frame):
      * int64/float64 columns: mean imputation, then standardization.
      * object/category columns: most-frequent imputation, then one-hot encoding.
    Columns of other dtypes are not listed in the ColumnTransformer and are
    therefore not used as features.

    Parameters:
    df (pd.DataFrame): Input dataframe with missing values. It is not modified.
    categorical_columns (list): List of categorical column names to impute.
    k (int): Number of neighbors to consider for KNN. Default is 5. It is
        capped at the number of rows with an observed value in the column
        being imputed, since KNN cannot query more neighbors than it was
        fitted on.

    Returns:
    pd.DataFrame: Copy of ``df`` with missing values in the listed columns
        filled. A column that has no observed values at all is left as is.

    Warns:
    UserWarning: If a listed column has no observed values to learn from.
    """
    import warnings  # local import: only needed for the all-missing warning

    df_filled = df.copy()

    for col in categorical_columns:
        # Compute the mask once; it selects the rows to predict and, negated,
        # the rows to train on.
        missing_mask = df_filled[col].isnull()
        if not missing_mask.any():
            continue  # Skip if no missing values

        observed_mask = ~missing_mask
        n_observed = int(observed_mask.sum())
        if n_observed == 0:
            # Nothing to train on: fitting would fail on an empty training set.
            warnings.warn(
                f"Column {col!r} has no observed values; leaving it unimputed.",
                UserWarning,
                stacklevel=2,
            )
            continue

        # Features and target
        features = df_filled.columns.drop(col)
        X_train = df_filled.loc[observed_mask, features]
        y_train = df_filled.loc[observed_mask, col]
        X_test = df_filled.loc[missing_mask, features]

        # Preprocessors for numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

        # KNN Classifier Pipeline. n_neighbors is capped so that a column with
        # fewer than k observed rows does not make predict() raise.
        knn_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', KNeighborsClassifier(n_neighbors=min(k, n_observed)))
        ])

        knn_pipeline.fit(X_train, y_train)
        predicted = knn_pipeline.predict(X_test)

        # Update the dataframe with imputed values (boolean mask, so the
        # assignment is positional with respect to the missing rows).
        df_filled.loc[missing_mask, col] = predicted

    return df_filled

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

def knn_numerical_imputation(df, numerical_columns, k=5):
    """
    Fills missing numerical data using K-Nearest Neighbors (KNN) regression.

    Columns are processed in the order given. For each column a KNN regressor
    is fitted on the rows where that column is present, using every other
    column of the frame as a feature, and its predictions replace the missing
    entries. Because the working copy is updated after each column, values
    imputed for earlier columns are used as features for later ones.

    Feature preprocessing (missing feature values are filled only inside the
    model pipeline, not in the returned frame):
      * int64/float64 columns: mean imputation, then standardization.
      * object/category columns: most-frequent imputation, then one-hot encoding.
    Columns of other dtypes are not listed in the ColumnTransformer and are
    therefore not used as features.

    Parameters:
    df (pd.DataFrame): Input dataframe with missing values. It is not modified.
    numerical_columns (list): List of numerical column names to impute.
    k (int): Number of neighbors to consider for KNN. Default is 5. It is
        capped at the number of rows with an observed value in the column
        being imputed, since KNN cannot query more neighbors than it was
        fitted on.

    Returns:
    pd.DataFrame: Copy of ``df`` with missing values in the listed columns
        filled. A column that has no observed values at all is left as is.

    Warns:
    UserWarning: If a listed column has no observed values to learn from.
    """
    import warnings  # local import: only needed for the all-missing warning

    df_filled = df.copy()

    for col in numerical_columns:
        # Compute the mask once; it selects the rows to predict and, negated,
        # the rows to train on.
        missing_mask = df_filled[col].isnull()
        if not missing_mask.any():
            continue  # Skip if no missing values

        observed_mask = ~missing_mask
        n_observed = int(observed_mask.sum())
        if n_observed == 0:
            # Nothing to train on: fitting would fail on an empty training set.
            warnings.warn(
                f"Column {col!r} has no observed values; leaving it unimputed.",
                UserWarning,
                stacklevel=2,
            )
            continue

        # Features and target
        features = df_filled.columns.drop(col)
        X_train = df_filled.loc[observed_mask, features]
        y_train = df_filled.loc[observed_mask, col]
        X_test = df_filled.loc[missing_mask, features]

        # Preprocessors for numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical features
            ('scaler', StandardScaler())  # Standardize numerical features
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical features
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

        # KNN Regressor Pipeline. n_neighbors is capped so that a column with
        # fewer than k observed rows does not make predict() raise.
        knn_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', KNeighborsRegressor(n_neighbors=min(k, n_observed)))
        ])

        knn_pipeline.fit(X_train, y_train)
        predicted = knn_pipeline.predict(X_test)

        # Update the dataframe with imputed values (boolean mask, so the
        # assignment is positional with respect to the missing rows).
        df_filled.loc[missing_mask, col] = predicted

    return df_filled

In [59]:
# Start from every column of the combined frame and strip the two numeric
# ones; what remains is the list handed to the categorical imputer.
miss_cat_cols = merge4knn.columns.tolist()
for numeric_col in ("Weight Capacity (kg)", "Compartments"):
    miss_cat_cols.remove(numeric_col)

In [60]:
# Show the columns selected for categorical imputation.
miss_cat_cols

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [61]:

# Impute every categorical column with the default number of neighbors.
filled_data = knn_categorical_imputation(
    df=merge4knn,
    categorical_columns=miss_cat_cols,
)

In [62]:
# The imputer returns a copy of its input in which only the listed column has
# changed, so rebinding to that copy yields the same frame as assigning the
# single column back.
filled_data = knn_numerical_imputation(filled_data, ["Weight Capacity (kg)"])

In [72]:
# Count the missing values left in each column after both imputation passes.
filled_data.isna().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
dtype: int64

In [69]:
# Split the imputed frame back into its three parts. The boundaries follow the
# concatenation order (train, test, extra) and are derived from the source
# frames rather than hard-coded, so the split stays correct if the input
# files change size.
n_train, n_test = len(train), len(test)
assert len(filled_data) == n_train + n_test + len(extra), (
    "filled_data does not have the combined length of train, test and extra"
)

filled_train = filled_data.iloc[:n_train].copy()
filled_test = filled_data.iloc[n_train:n_train + n_test].copy()
filled_extra = filled_data.iloc[n_train + n_test:].copy()

In [70]:
# Put the identifier and target back next to the imputed features. `assign`
# aligns the incoming Series on row labels; this works because each slice
# still carries the labels of the frame it came from (the concat did not
# reset the index).
filled_train = filled_train.assign(id=train["id"], Price=train["Price"])
filled_extra = filled_extra.assign(id=extra["id"], Price=extra["Price"])
# The test set has no target, so Price is filled with a missing-value marker.
filled_test = filled_test.assign(id=test["id"], Price=pd.NA)

In [71]:
# Persist the three imputed frames; the row index is not written.
for frame, csv_path in (
    (filled_train, "Data/filled_train.csv"),
    (filled_test, "Data/filled_test.csv"),
    (filled_extra, "Data/filled_extra.csv"),
):
    frame.to_csv(csv_path, index=False)