In [69]:
import pandas as pd
from scipy.stats import zscore, uniform
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

In [70]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = train.drop(['X1','X2','X3','X5','X8','X10'], axis=1)
test = test.drop(['X1','X2','X3','X5','X8','X10'], axis=1)

In [71]:
print(train.head())
print(test.head())

         X4        X6      X7      X9                X11     Y
0  0.016047  249.8092  OUT049  Medium  Supermarket Type1  8.23
1  0.019278   48.2692  OUT018  Medium  Supermarket Type2  6.09
2  0.016760  141.6180  OUT049  Medium  Supermarket Type1  7.65
3  0.000000  182.0950  OUT010     NaN      Grocery Store  6.60
4  0.000000   53.8614  OUT013    High  Supermarket Type1  6.90
         X4        X6      X7      X9                X11
0  0.021273  229.1326  OUT027  Medium  Supermarket Type3
1  0.027588   86.1856  OUT013    High  Supermarket Type1
2  0.111782  182.3292  OUT010     NaN      Grocery Store
3  0.029606   65.4142  OUT049  Medium  Supermarket Type1
4  0.129141  109.4886  OUT017     NaN  Supermarket Type1


In [72]:
train_cleaned = train.copy()
test_cleaned = test.copy()

# Replace missing values in the train dataset
for column in train_cleaned.columns:
    if train_cleaned[column].dtype == 'object': #string
        # Replace missing values with mode for categorical columns
        mode = train_cleaned[column].mode()[0]
        train_cleaned[column].fillna(mode, inplace=True)

    elif train_cleaned[column].dtype in ['int64', 'float64']:
        # Replace missing values with median for numerical columns
        median = train_cleaned[column].median()
        train_cleaned[column].fillna(median, inplace=True)


# Replace missing values in the test dataset
for column in test_cleaned.columns:
    if test_cleaned[column].dtype == 'object':
        # Replace missing values with mode for categorical columns
        mode = test_cleaned[column].mode()[0]
        test_cleaned[column].fillna(mode, inplace=True)

    elif test_cleaned[column].dtype in ['int64', 'float64']:
        # Replace missing values with median for numerical columns
        median = test_cleaned[column].median()
        test_cleaned[column].fillna(median, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_cleaned[column].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_cleaned[column].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [73]:
# Removing Numerical Outliers from Training Dataset
numeric_cols = train_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Check if 'X4' is in the numeric columns and exclude it
if 'X4' in numeric_cols:
    numeric_cols = numeric_cols.drop('X4')  # X4 has high skewness; Z-score method is not suitable for it

# Detect and replace outliers for the remaining numeric columns using Z-score
z_scores = train_cleaned[numeric_cols].apply(zscore)
threshold = 3
outliers = (z_scores.abs() > threshold).any(axis=1)

for column in numeric_cols:
    median = train_cleaned[column].median()
    train_cleaned.loc[outliers, column] = median

print(f"Replaced outliers in numerical columns of training data with their median values using Z-score.")

# Handle outliers in 'X4' using the IQR method
Q1 = train_cleaned['X4'].quantile(0.25)
Q3 = train_cleaned['X4'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
X4_outliers = (train_cleaned['X4'] < lower_bound) | (train_cleaned['X4'] > upper_bound)
train_cleaned.loc[X4_outliers, 'X4'] = train_cleaned['X4'].median()
print(f"Removed outliers from 'X4' using the IQR method.")

# Store the cleaned training data after outlier removal
train_cleaned_no_outliers = train_cleaned.copy()

# Repeat the same logic for the test dataset
numeric_cols_test = test_cleaned.select_dtypes(include=['float64', 'int64']).columns

if 'X4' in numeric_cols_test:
    numeric_cols_test = numeric_cols_test.drop('X4')  # Exclude X4 for the same reason as training

# Detect and replace outliers in the test dataset using Z-score
z_scores_test = test_cleaned[numeric_cols_test].apply(zscore)
outliers_test = (z_scores_test.abs() > threshold).any(axis=1)

for column in numeric_cols_test:
    median = test_cleaned[column].median()
    test_cleaned.loc[outliers_test, column] = median

print(f"Replaced outliers in numerical columns of test data with their median values using Z-score.")

# Handle outliers in 'X4' using the IQR method for the test dataset
Q1_test = test_cleaned['X4'].quantile(0.25)
Q3_test = test_cleaned['X4'].quantile(0.75)
IQR_test = Q3_test - Q1_test
lower_bound_test = Q1_test - 1.5 * IQR_test
upper_bound_test = Q3_test + 1.5 * IQR_test

X4_outliers_test = (test_cleaned['X4'] < lower_bound_test) | (test_cleaned['X4'] > upper_bound_test)
test_cleaned.loc[X4_outliers_test, 'X4'] = test_cleaned['X4'].median()

print(f"Removed outliers from 'X4' in test data using the IQR method.")

# Store the cleaned test data after outlier removal
test_cleaned_no_outliers = test_cleaned.copy()


Replaced outliers in numerical columns of training data with their median values using Z-score.
Removed outliers from 'X4' using the IQR method.
Replaced outliers in numerical columns of test data with their median values using Z-score.
Removed outliers from 'X4' in test data using the IQR method.


In [74]:
train_cleaned_no_outliers = pd.get_dummies(train_cleaned_no_outliers)
test_cleaned_no_outliers = pd.get_dummies(test_cleaned_no_outliers)

In [75]:
Train_Numerical_columns = train_cleaned_no_outliers.select_dtypes(include=['float64', 'int64']).columns.drop('Y')
Test_Numerical_columns = test_cleaned_no_outliers.select_dtypes(include=['float64', 'int64']).columns

# Standardize the data
Scaler = StandardScaler()
train_cleaned_no_outliers[Train_Numerical_columns] = Scaler.fit_transform(train_cleaned_no_outliers[Train_Numerical_columns])
test_cleaned_no_outliers[Test_Numerical_columns] = Scaler.fit_transform(test_cleaned_no_outliers[Test_Numerical_columns])

In [76]:

X = train_cleaned_no_outliers.drop('Y', axis=1)
y = train_cleaned_no_outliers['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

svr = SVR(kernel='rbf')
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ["rbf"],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5]
}

grid_search = GridSearchCV(estimator=svr,param_grid=param_grid,cv=5,scoring='neg_mean_absolute_error',verbose=3,n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)


Fitting 5 folds for each of 125 candidates, totalling 625 fits
Best Parameters: {'C': 100, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Best Score: 0.4067048716162633


In [77]:
model = SVR(C=100,epsilon=0.1, kernel= 'rbf', gamma=0.01)
model.fit(X_train, y_train)

error_prediciton = model.predict(X_test)
mae = mean_absolute_error(y_test, error_prediciton)
print(mae)

predictions = model.predict(test_cleaned_no_outliers)

output_test = pd.DataFrame({
    'row_id': range(len(predictions)),
    'Y': predictions
})

output_test.to_csv('submission.csv', index=False)


0.40923403596113983
