In [38]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_regression
from sklearn.feature_selection import (SelectKBest, f_regression, mutual_info_regression)
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score, root_mean_squared_error)
from sklearn.model_selection import GridSearchCV, RepeatedKFold, train_test_split
from sklearn.pipeline import Pipeline




In [56]:

# feature selection
def select_features(x_train, y_train, x_test, k):
 #k is the number of features
 # configure to select a subset of features
 fs = SelectKBest(score_func=mutual_info_regression, k=k)
 # learn relationship from training data
 fs.fit(x_train, y_train)
 # transform train input data
 x_train_fs = fs.transform(x_train)
 # transform test input data
 x_test_fs = fs.transform(x_test)
 return x_train_fs, x_test_fs, fs
 

SyntaxError: positional argument follows keyword argument (2912874877.py, line 5)

In [54]:

def findBestFeatures(model, x, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # define the pipeline to evaluate
    fs = SelectKBest(score_func=mutual_info_regression)
    pipeline = Pipeline(steps=[('sel',fs), ('lr', model)])
    # define the grid
    grid = dict()
    grid['sel__k'] = [i for i in range(1, x.shape[1]+1)]
    # define the grid search
    search = GridSearchCV(pipeline, grid, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
    # perform the search
    results = search.fit(x, y)
    # summarize best
    print('Best RMSE: %.3f' % results.best_score_)
    print('Best Config: %s' % results.best_params_)
    # summarize all
    means = results.cv_results_['mean_test_score']
    params = results.cv_results_['params']
    for mean, param in zip(means, params):
        print(">%.3f with: %r" % (mean, param))

In [39]:
def preprocess_data(filename):
    df=pd.read_csv(filename, 
               names=["Id", "MSSubclass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", 
                      "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", 
                      "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", 
                      "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                      "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", 
                      "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", 
                      "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", 
                      "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
                      "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", 
                      "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", 
                      "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", 
                      "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"])

    le = preprocessing.LabelEncoder()
    for col in df.columns:
       df[col] = le.fit_transform(df[col])

    x = df[(list(df.columns[:-1]))]
    y = df['SalePrice']
    x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2)
    return x_train, y_train, x_test, y_test, x, y


In [None]:
def SVM(x_train, x_test, y_train, y_test):
    clf = svm.SVR()

    x_train_fs, x_test_fs, fs = select_features(x_train, y_train, x_test)
    # fit the model
    clf.fit(x_train_fs, y_train)
    # evaluate the model
    yhat = clf.predict(x_test_fs)
    # evaluate predictions
    #mae = mean_absolute_error(y_test, yhat)
    #print("MAE:", mae)
    #clf.fit(x_train, y_train)
    #y_pred = clf.predict(x_test)
    print("MSE: ", mean_squared_error(y_test, yhat))
    print("MAPE: ", mean_absolute_percentage_error(y_test, yhat))
    print("RMSE: ", root_mean_squared_error(y_test, yhat))
    print("R^2: ", r2_score(y_test, yhat))

In [55]:
x_train, y_train, x_test, y_test, x, y = preprocess_data("train.csv")
#findBestFeatures(svm.SVR(), x, y)
#x_test, y_test = preprocess_data("test.csv")
SVM(x_train, y_train, x_test, y_test)


a
Best RMSE: -162.353
Best Config: {'sel__k': 1}
>-162.353 with: {'sel__k': 1}
>-180.239 with: {'sel__k': 2}
>-175.282 with: {'sel__k': 3}
>-171.086 with: {'sel__k': 4}
>-170.513 with: {'sel__k': 5}
>-169.509 with: {'sel__k': 6}
>-168.942 with: {'sel__k': 7}
>-168.760 with: {'sel__k': 8}
>-168.442 with: {'sel__k': 9}
>-168.652 with: {'sel__k': 10}
>-168.697 with: {'sel__k': 11}
>-169.107 with: {'sel__k': 12}
>-169.888 with: {'sel__k': 13}
>-170.169 with: {'sel__k': 14}
>-170.272 with: {'sel__k': 15}
>-170.302 with: {'sel__k': 16}
>-170.545 with: {'sel__k': 17}
>-170.802 with: {'sel__k': 18}
>-171.002 with: {'sel__k': 19}
>-171.202 with: {'sel__k': 20}
>-171.274 with: {'sel__k': 21}
>-171.376 with: {'sel__k': 22}
>-171.443 with: {'sel__k': 23}
>-171.716 with: {'sel__k': 24}
>-171.855 with: {'sel__k': 25}
>-172.091 with: {'sel__k': 26}
>-173.116 with: {'sel__k': 27}
>-173.533 with: {'sel__k': 28}
>-173.880 with: {'sel__k': 29}
>-174.191 with: {'sel__k': 30}
>-174.367 with: {'sel__k': 31}