In [67]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_regression
from sklearn.feature_selection import (SelectKBest, f_regression, mutual_info_regression)
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score, root_mean_squared_error)
from sklearn.model_selection import GridSearchCV, RepeatedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer



In [54]:

# feature selection
def select_features(x_train, y_train, x_test, k):
 #k is the number of features
 # configure to select a subset of features
 fs = SelectKBest(score_func=mutual_info_regression, k=k)
 # learn relationship from training data
 fs.fit(x_train, y_train)
 # transform train input data
 x_train_fs = fs.transform(x_train)
 # transform test input data
 x_test_fs = fs.transform(x_test)
 return x_train_fs, x_test_fs, fs
 

In [55]:

def findBestFeatures(model, x, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # define the pipeline to evaluate
    fs = SelectKBest(score_func=mutual_info_regression)
    pipeline = Pipeline(steps=[('sel',fs), ('lr', model)])
    # define the grid
    grid = dict()
    grid['sel__k'] = [i for i in range(1, x.shape[1]+1)]
    # define the grid search
    search = GridSearchCV(pipeline, grid, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
    # perform the search
    results = search.fit(x, y)
    # summarize best
    print('Best RMSE: %.3f' % results.best_score_)
    print('Best Config: %s' % results.best_params_)
    # summarize all
    means = results.cv_results_['mean_test_score']
    params = results.cv_results_['params']
    for mean, param in zip(means, params):
        print(">%.3f with: %r" % (mean, param))

In [60]:
def preprocess_data(filename):
    df=pd.read_csv(filename, 
               names=["Id", "MSSubclass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", 
                      "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", 
                      "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", 
                      "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                      "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", 
                      "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", 
                      "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", 
                      "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
                      "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", 
                      "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", 
                      "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", 
                      "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"])

    le = preprocessing.LabelEncoder()
    for col in df.columns:
       df[col] = le.fit_transform(df[col])

    x = df[(list(df.columns[:-1]))]
    y = df['SalePrice']
    x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2)
    return x_train, y_train, x_test, y_test, x, y
 


<bound method NDFrame.head of Empty DataFrame
Columns: [Id, MSSubClass, MSZoning, LotFrontage, LotArea, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, KitchenQual, TotRmsAbvGrd, Functional, Fireplaces, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PavedDrive, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, PoolQC, Fence, MiscFeature, MiscVal, MoSold, YrSold, SaleType, SaleCondition]
Index: []

[0 rows 

In [61]:
def SVM(x_train, x_test, y_train, y_test):
    clf = svm.SVR()

    x_train_fs, x_test_fs, fs = select_features(x_train, y_train, x_test,1)
    # fit the model
    clf.fit(x_train_fs, y_train)
    # evaluate the model
    yhat = clf.predict(x_test_fs)
    # evaluate predictions
    #mae = mean_absolute_error(y_test, yhat)
    #print("MAE:", mae)
    #clf.fit(x_train, y_train)
    #y_pred = clf.predict(x_test)
    print("MSE: ", mean_squared_error(y_test, yhat))
    print("MAPE: ", mean_absolute_percentage_error(y_test, yhat))
    print("RMSE: ", root_mean_squared_error(y_test, yhat))
    print("R^2: ", r2_score(y_test, yhat))


In [78]:
x_train, y_train, x_test, y_test, x, y = preprocess_data("train.csv")
#findBestFeatures(svm.SVR(), x, y)
#x_test, y_test = preprocess_data("test.csv")
SVM(x_train, y_train, x_test, y_test)
x_train.head


MSE:  30175.550338920748
MAPE:  4707112249266531.0
RMSE:  173.7111117312901
R^2:  0.18275214213266755


<bound method NDFrame.head of         Id  MSSubclass  MSZoning  LotFrontage  LotArea  Street  Alley  \
695   1122           8         5           61      668       1      3   
850   1295          12         4           90     1030       1      3   
247    625           3         5           79      960       1      1   
782   1219           9         4           75      730       1      3   
1024    29           0         4           53      556       1      3   
...    ...         ...       ...          ...      ...     ...    ...   
1240   269           4         4           74      946       1      3   
1272   304           4         4          111      963       1      3   
1214   240          12         4          111       37       1      3   
474    877           4         4           10      415       1      3   
508    915           4         1           85      794       1      3   

      LotShape  LandContour  Utilities  ...  ScreenPorch  PoolArea  PoolQC  \
695          4 

In [79]:
# Load your dataset
data = pd.read_csv('train.csv')  # Update this to the path of your data file
# Display the first few rows of the DataFrame

# Assuming 'data' is your complete dataset and 'SalePrice' is your target column
X = data.drop('SalePrice', axis=1)  # Drop the target column from the feature set
y = data['SalePrice']               # Target variable

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [71]:
# Drop rows with missing values
x_train = x_train.dropna()
y_train = y_train.loc[x_train.index]  # Ensure y_train only contains rows that exist in x_train
x_test = x_test.dropna()
y_test = y_test.loc[x_test.index]  # Ensure y_test only contains rows that exist in x_test

print("x_train shape:", x_train.shape)  # Should show (some_number, number_of_features)
print("y_train shape:", y_train.shape)  # Should show (same_number_as_x_train,)
print("x_test shape:", x_test.shape)    # Should show (some_other_number, number_of_features)
print("y_test shape:", y_test.shape)    # Should show (same_number_as_x_test,)


x_train shape: (0, 80)
y_train shape: (0,)
x_test shape: (0, 80)
y_test shape: (0,)


In [73]:
categorical_features = x_train.select_dtypes(include=['object']).columns

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='constant', fill_value=0), x_train.columns[x_train.dtypes != 'object'])
    ],
    remainder='passthrough'  # handle all other columns normally
)

# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Create and train the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),  # Adjust for sparse matrix compatibility
    ('classifier', knn)
])

# Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(x_test)

# Output predictions
print("Predictions:", y_pred)

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.