In [79]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import (SelectKBest, f_regression, mutual_info_regression)
from sklearn.metrics import (make_scorer, mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score, root_mean_squared_error)
from sklearn.model_selection import GridSearchCV, RepeatedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer




In [145]:
#RN just works for the Linear SVR
def preprocess_data(filename):
    df=pd.read_csv(filename, 
               names=["Id", "MSSubclass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", 
"LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", 
"Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", 
"YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
"MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", 
"BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", 
"TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", 
"LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
"KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", 
"GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", 
"WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", 
"MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"])

    le = preprocessing.LabelEncoder()
    for col in df.columns:
       df[col] = le.fit_transform(df[col])

    x = df[(list(df.columns[:-1]))]
    y = df['SalePrice']

    return x,y

In [None]:
def SVM():
        # Preprocess the data
    x, y = preprocess_data("train.csv")

    # Standardize the features
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)

    # Set up the parameter grid
    param_grid = {'C': [0.1, 1, 10], 'epsilon': [0.1, 0.2, 0.3]}

    # Initialize LinearSVR
    svr_model = LinearSVR(dual='auto', random_state=42)

    # Initialize RepeatedKFold
    rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=rkf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(x_scaled, y)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print the best parameters
    print("Best Parameters:", best_params)

    # Print the best estimator
    print("Best Estimator:", best_estimator)

    # Get feature importance based on coefficients of best estimator
    feature_importance = np.abs(best_estimator.coef_)

    # Sort features by importance
    sorted_indices = np.argsort(feature_importance)[::-1]

    # Define the number of features to select
    num_selected_features = 5

    # Select the top features
    selected_feature_indices = sorted_indices[:num_selected_features]

    # Calculate RMSE using the best estimator
    y_pred = best_estimator.predict(x_scaled)
    print("Root Mean Squared Error (RMSE):", root_mean_squared_error(y, y_pred))
    print("R2 Score:", r2_score(y, y_pred))  
    print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y, y_pred) * 100)        
    # Print the indices of the selected features
    #print("Indices of selected features:", selected_feature_indices)

In [167]:
def getLinearSVRScores(param_grid):
    model = LinearSVR(dual='auto', random_state=42)
    x,y = preprocess_data("train.csv")
    # Standardize the features
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)

    # Initialize RepeatedKFold
    rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=rkf, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(x_scaled, y)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print the best parameters
    print("Best Parameters:", best_params)
    
    # Print the best estimator
    print("Best Estimator:", best_estimator)

    # Calculate RMSE using the best estimator
    y_pred = best_estimator.predict(x_scaled)
    print("Mean Absolute Error (MAE):", mean_absolute_error(y, y_pred))
    print("Mean Squared Error (MSE):", mean_squared_error(y, y_pred))
    print("Root Mean Squared Error (RMSE):", root_mean_squared_error(y, y_pred))
    print("R2 Score:", r2_score(y, y_pred))  
    print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y, y_pred) * 100)        
    # Print the indices of the selected features
    #print("Indices of selected features:", selected_feature_indices)

In [157]:
def preprocess_data_KNN(filename):
    train_data = pd.read_csv(filename)
    X_train = train_data.drop(train_data.columns[-1], axis=1)
    y_train = train_data.columns[:-1]

    # Define preprocessing steps
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    numerical_transformer = SimpleImputer(strategy='median')
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
    return X_train, y_train, preprocessor

In [163]:
def getKNNScores(param_grid):
    train_data = pd.read_csv('train.csv')
    X_train = train_data.drop('SalePrice', axis=1)
    y_train = train_data['SalePrice']

    # Define preprocessing steps
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    numerical_transformer = SimpleImputer(strategy='median')
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Define feature selection
    feature_selection = SelectKBest(score_func=mutual_info_regression)

    # Define KNN regressor
    regressor = KNeighborsRegressor()

    # Define the pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selection),
        ('regressor', regressor)
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'feature_selection__k': [5, 8, 10, 20, 30],  # Number of top features to select
        'regressor__n_neighbors': [3, 5, 7],  # Number of neighbors for KNN
        'regressor__weights': ['uniform', 'distance'],  # Weight function used in prediction
        'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree'],  # Algorithm used to compute nearest neighbors
        'regressor__leaf_size': [20, 30, 40],  # Leaf size passed to BallTree or KDTree
        'regressor__p': [1, 2],  # Power parameter for the Minkowski metric
    }

    # Define scoring function (optional)
    scoring = {
        'MAE' : make_scorer(mean_absolute_error),
        'MSE' : make_scorer(mean_squared_error),
        'MAPE': make_scorer(mean_absolute_percentage_error),
        'RMSE': make_scorer(root_mean_squared_error),
        'R2 Score': make_scorer(r2_score)
    }

    # Define cross-validation strategy
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='RMSE', verbose=1)

    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)

    # Get best parameters and results
    best_params = grid_search.best_params_
    best_results = grid_search.cv_results_
    print(best_results)
    # Print best parameters and results
    print("Best parameters found: ", best_params)   
    print("Best MAE score: ", best_results['mean_test_MAE'][grid_search.best_index_], "with params:", grid_search.cv_results_['params'][grid_search.best_index_])
    print("Best MSE score: ", best_results['mean_test_MSE'][grid_search.best_index_], "with params:", grid_search.cv_results_['params'][grid_search.best_index_])
    print("Best MAPE score: ", best_results['mean_test_MAPE'][grid_search.best_index_], "with params:", grid_search.cv_results_['params'][grid_search.best_index_])
    print("Best RMSE score: ", grid_search.best_score_, "with params:", grid_search.cv_results_['params'][grid_search.best_index_])
    print("Best R2 score: ", best_results['mean_test_R2 Score'][grid_search.best_index_], "with params:", grid_search.cv_results_['params'][grid_search.best_index_])


In [166]:


knr_params = {
    'feature_selection__k': [10, 20, 30],  # Number of top features to select
    'regressor__n_neighbors': [3, 5, 7],  # Number of neighbors for KNN
    'regressor__weights': ['uniform', 'distance'],  # Weight function used in prediction
    'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree'],  # Algorithm used to compute nearest neighbors
    'regressor__leaf_size': [20, 30, 40],  # Leaf size passed to BallTree or KDTree
    'regressor__p': [1, 2],  # Power parameter for the Minkowski metric
    }

print("Scores For KNeighborsRegressor:")
getKNNScores( knr_params)

Scores For KNeighborsRegressor:
Fitting 15 folds for each of 540 candidates, totalling 8100 fits


KeyboardInterrupt: 

In [168]:

linearSVR_params = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'epsilon': [0.1, 0.2, 0.3],  # Epsilon in the epsilon-insensitive loss function
}

getLinearSVRScores(linearSVR_params)




Best Parameters: {'C': 10, 'epsilon': 0.1}
Best Estimator: LinearSVR(C=10, dual='auto', epsilon=0.1, random_state=42)
Mean Absolute Error (MAE): 84.56103142339627
Mean Squared Error (MSE): 24633.10284130817
Root Mean Squared Error (RMSE): 156.94936394043836
R2 Score: 0.27403256905760787
Mean Absolute Percentage Error (MAPE): 1.7628639295876115e+17
