In [53]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import (SelectKBest, f_regression, mutual_info_regression)
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score, root_mean_squared_error)
from sklearn.model_selection import GridSearchCV, RepeatedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer




In [20]:
def preprocess_data(filename):
    df=pd.read_csv(filename, 
               names=["Id", "MSSubclass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", 
                      "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", 
                      "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", 
                      "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                      "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", 
                      "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", 
                      "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", 
                      "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
                      "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", 
                      "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", 
                      "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", 
                      "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"])

    le = LabelEncoder()
    for col in df.columns:
       df[col] = le.fit_transform(df[col])

    x = df[(list(df.columns[:-1]))]
    y = df['SalePrice']
    return x,y

    #x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2)
    #return x_train, y_train, x_test, y_test, x, y



In [59]:
def preprocess_data_KNR(filename):
    train_data = pd.read_csv(filename)

    X_train = train_data.drop('SalePrice', axis=1)
    y_train = train_data['SalePrice']
    #X_test = test_data.copy()

    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    numerical_transformer = SimpleImputer(strategy='median')
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    
    return X_train_preprocessed, y_train
    #X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
def SVM():
        # Preprocess the data
    x, y = preprocess_data("train.csv")

    # Standardize the features
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)

    # Set up the parameter grid
    param_grid = {'C': [0.1, 1, 10], 'epsilon': [0.1, 0.2, 0.3]}

    # Initialize LinearSVR
    svr_model = LinearSVR(dual='auto', random_state=42)

    # Initialize RepeatedKFold
    rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=rkf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(x_scaled, y)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print the best parameters
    print("Best Parameters:", best_params)

    # Print the best estimator
    print("Best Estimator:", best_estimator)

    # Get feature importance based on coefficients of best estimator
    feature_importance = np.abs(best_estimator.coef_)

    # Sort features by importance
    sorted_indices = np.argsort(feature_importance)[::-1]

    # Define the number of features to select
    num_selected_features = 5

    # Select the top features
    selected_feature_indices = sorted_indices[:num_selected_features]

    # Calculate RMSE using the best estimator
    y_pred = best_estimator.predict(x_scaled)
    print("Root Mean Squared Error (RMSE):", root_mean_squared_error(y, y_pred))
    print("R2 Score:", r2_score(y, y_pred))  
    print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y, y_pred) * 100)        
    # Print the indices of the selected features
    #print("Indices of selected features:", selected_feature_indices)

In [45]:
def getModelScores(model, x, y, param_grid):

    # Standardize the features
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)


    # Initialize RepeatedKFold
    rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=rkf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(x_scaled, y)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print the best parameters
    print("Best Parameters:", best_params)
    
    # Print the best estimator
    print("Best Estimator:", best_estimator)

    # Calculate RMSE using the best estimator
    y_pred = best_estimator.predict(x_scaled)
    print("Root Mean Squared Error (RMSE):", root_mean_squared_error(y, y_pred))
    print("R2 Score:", r2_score(y, y_pred))  
    print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y, y_pred) * 100)        
    # Print the indices of the selected features
    #print("Indices of selected features:", selected_feature_indices)

In [69]:
def KNN():
    # Preprocess the data
    x, y = preprocess_data_KNR("train.csv")
    X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)
    #print(y)
    # Standardize the features
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(X_train)

    # Set up the parameter grid for KNeighborsRegressor
    param_grid = {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [20, 30, 40],
        'p': [1, 2]  # Minkowski metric parameter
    }

    # Initialize KNeighborsRegressor
    knn_model = KNeighborsRegressor()

    # Initialize RepeatedKFold
    rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=rkf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print the best parameters
    print("Best Parameters:", best_params)

    # Print the best estimator
    print("Best Estimator:", best_estimator)

    # Calculate RMSE using the best estimator
    y_pred = best_estimator.predict(X_train)
    print("Root Mean Squared Error (RMSE):", root_mean_squared_error(y_test, y_pred))
    print("R2 Score:", r2_score(y_test, y_pred))  
    print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y_test, y_pred) * 100)  

In [43]:
x, y = preprocess_data("train.csv")


In [70]:
KNN()

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [49]:

svr_model = LinearSVR(dual='auto', random_state=42)
knn_model_3 = KNeighborsRegressor(n_neighbors=3)
knn_model_5 = KNeighborsRegressor(n_neighbors=5)

print("Scores For LinearSVR:")
linearSVR_params = {'C': [0.1, 1, 10], 'epsilon': [0.1, 0.2, 0.3]}

getModelScores(svr_model, x, y, linearSVR_params)

knr_params = {
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2]  # Minkowski metric parameter
    }

print("Scores For KNeighborsRegressor with 3 neighbors:")
#getModelScores(knn_model_3, x, y, knr_params)

print("Scores For KNeighborsRegressor with 5 neighbors:")
#getModelScores(knn_model_5, x, y, knr_params)

Scores For LinearSVR:




Best Parameters: {'C': 10, 'epsilon': 0.1}
Best Estimator: LinearSVR(C=10, dual='auto', epsilon=0.1, random_state=42)
Root Mean Squared Error (RMSE): 156.94936394043836
R2 Score: 0.27403256905760787
Mean Absolute Percentage Error (MAPE): 1.7628639295876115e+17
Scores For KNeighborsRegressor with 3 neighbors:
Best Parameters: {'algorithm': 'auto', 'leaf_size': 20, 'p': 1, 'weights': 'distance'}
Best Estimator: KNeighborsRegressor(leaf_size=20, n_neighbors=3, p=1, weights='distance')
Root Mean Squared Error (RMSE): 0.0
R2 Score: 1.0
Mean Absolute Percentage Error (MAPE): 0.0
Scores For KNeighborsRegressor with 5 neighbors:
Best Parameters: {'algorithm': 'auto', 'leaf_size': 20, 'p': 1, 'weights': 'distance'}
Best Estimator: KNeighborsRegressor(leaf_size=20, p=1, weights='distance')
Root Mean Squared Error (RMSE): 0.0
R2 Score: 1.0
Mean Absolute Percentage Error (MAPE): 0.0
