In [None]:
#Importing libraries necessary for the project 
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# Load the training data from the CSV file in the working directory.
trainset = pd.read_csv("train.csv")
# Work on a frame without the 'Id' column.
dataset = trainset.drop(columns=['Id'])

 

#Defining function to handle missing features
def _fill_with_mode(values, fallback=None):
    """Fill missing entries of *values* with its mode, or *fallback* if it has none.

    When *values* has no non-missing entry (so no mode exists) and no
    *fallback* is given, the series is returned untouched.
    """
    modes = values.mode()
    fill_value = modes.iloc[0] if not modes.empty else fallback
    if fill_value is None:
        # Nothing sensible to fill with; leave the gaps for a later pass.
        return values
    return values.fillna(fill_value)


def handle_missing(features):
    """Fill the missing values of *features* and return it.

    The frame is modified in place; the returned object is the same frame
    that was passed in.

    Filling rules, applied in this order:

    * 'Functional', 'Electrical', 'KitchenQual' get fixed defaults.
    * 'Exterior1st', 'Exterior2nd', 'SaleType' get their column mode.
    * 'MSZoning' gets the mode of its 'MSSubClass' group, falling back to
      the overall 'MSZoning' mode when a group has no known value.
    * 'PoolQC' and the garage/basement categorical columns get 'None';
      the numeric garage columns get 0.
    * 'LotFrontage' gets the median of its 'Neighborhood' group.
    * Every remaining object-dtype column gets 'None' and every remaining
      int/float column gets 0.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The same frame, with missing values filled.
    """
    # Per the original author's reading of the data description, NA in
    # 'Functional' means typical ('Typ'); the other two defaults are theirs too.
    fixed_defaults = (
        ('Functional', 'Typ'),
        ('Electrical', 'SBrkr'),
        ('KitchenQual', 'TA'),
    )
    for col, default in fixed_defaults:
        features[col] = features[col].fillna(default)

    # Replace the missing values in each of these columns with their mode.
    for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
        features[col] = _fill_with_mode(features[col])

    # Mode within each MSSubClass group. A group whose MSZoning values are all
    # missing has no mode, so it falls back to the overall mode instead of
    # raising on an empty result.
    zoning_modes = features['MSZoning'].mode()
    overall_zoning = zoning_modes.iloc[0] if not zoning_modes.empty else None
    features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(
        lambda group: _fill_with_mode(group, overall_zoning)
    )

    # The data description states that NA refers to "No Pool".
    features['PoolQC'] = features['PoolQC'].fillna('None')

    # No garage = no cars in garage, so the numeric garage columns become 0.
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)

    # Categorical garage columns: missing becomes the literal string 'None'.
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
        features[col] = features[col].fillna('None')

    # NaN in these categorical basement columns means there is no basement.
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        features[col] = features[col].fillna('None')

    # Fill LotFrontage with the median LotFrontage of the row's neighborhood.
    features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(
        lambda group: group.fillna(group.median())
    )

    # No particular intuition for the remaining categorical columns, so their
    # missing values become 'None'.
    object_cols = [col for col in features.columns if features[col].dtype == object]
    for col in object_cols:
        features[col] = features[col].fillna('None')

    # Same idea for the remaining numeric columns, but with 0.
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_cols = [col for col in features.columns if features[col].dtype in numeric_dtypes]
    for col in numeric_cols:
        features[col] = features[col].fillna(0)

    return features
#%%
#Defining Function to preprocess data
def preprocess(dataset):
    """Fill missing values in *dataset* and integer-encode its text columns.

    Missing values are handled by ``handle_missing``. Afterwards every
    column whose dtype compares equal to 'object' is replaced by the
    integer codes produced by a freshly fitted ``LabelEncoder``.

    Returns the processed frame.
    """
    # Replace missing values first so the encoders never see NaN.
    dataset = handle_missing(dataset)

    # Snapshot the categorical column names before any column is re-typed.
    text_columns = []
    for name in dataset.columns:
        if dataset[name].dtypes == 'object':
            text_columns.append(name)

    # One independent encoder per column; the fitted encoders are not kept.
    from sklearn.preprocessing import LabelEncoder
    for name in text_columns:
        dataset[name] = LabelEncoder().fit_transform(dataset[name])

    return dataset
#%%
# Preprocessing data
dataset = preprocess(dataset)

# Generating feature and output datasets: the last column is taken as the
# target, every other column as a feature.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking the log(1 + x) transform of the target
y = np.log1p(y)

# Splitting the dataset into a training set (70%) and a test set (30%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=310)

# Support Vector Regressor
from sklearn.svm import SVR
svr = SVR(C=20, epsilon=0.008, gamma=0.0003)    # Initialising SVR object
svr.fit(X_train, y_train)                       # Fitting on the training split

# RMSE is computed as sqrt(MSE) rather than through the `squared=False`
# keyword of mean_squared_error, which newer scikit-learn releases no longer
# accept; the square root works on every version.
y_pred = svr.predict(X_test)                    # Predictions on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE on testing half of train dataset:', rmse)

# NOTE: X also contains the rows the model was fitted on, so this figure is
# not an out-of-sample estimate.
y_pred1 = svr.predict(X)                        # Predictions on the entire data
rmse1 = np.sqrt(mean_squared_error(y, y_pred1))
print('RMSE on entire train dataset:', rmse1)

In [1]:
import pandas as pd
