In [72]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC,SVR
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import joblib
import requests
from io import StringIO
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import numpy as np

In [54]:
# Load the raw housing listings from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Bengaluru_House_Data.csv')

In [55]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [56]:
# Show the raw DataFrame as the cell output to inspect its columns and rows.
data

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [57]:
# Preprocessing steps

# Drop the 'society' column: it is missing in 5502 rows, far more than any
# other column. errors='ignore' keeps this cell re-runnable -- `data` is
# rebound here, so on a second run the column is already gone and a plain
# drop would raise KeyError.
data = data.drop(columns='society', errors='ignore')

# Impute missing values: the most frequent value for the categorical
# 'location', the median for the numeric 'bath' and 'balcony' columns.
data['location'] = data['location'].fillna(data['location'].mode()[0])
data['bath'] = data['bath'].fillna(data['bath'].median())
data['balcony'] = data['balcony'].fillna(data['balcony'].median())

# Handle the 'size' column: fill gaps with the most frequent label, then take
# the leading number of labels such as "2 BHK" / "4 Bedroom" as the bedroom
# count. Building the column from Python ints via apply yields an int64 column.
data['size'] = data['size'].fillna(data['size'].mode()[0])
data['bedrooms'] = data['size'].apply(lambda x: int(x.split(' ')[0]))

# Handling the 'total_sqft' column
# Converting ranges to a single number by averaging them
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        A plain number such as ``"1056"``, or a range written as
        ``"low - high"`` such as ``"2100 - 2850"``. Values that are already
        numeric (for example when the cell is run a second time) are accepted
        too.

    Returns
    -------
    float or None
        The parsed number, the midpoint of a range, or ``None`` when the entry
        cannot be parsed (for example ``"34.46Sq. Meter"``), so that the
        caller can drop those rows.
    """
    if not isinstance(x, str):
        # Non-string input has no .split(); convert it directly and map
        # anything unconvertible (such as None) to None instead of crashing.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Not a clean "low - high" range (e.g. a unit suffix on one side);
            # fall through and try to parse the entry as a plain number.
            pass

    try:
        return float(x)
    except ValueError:
        return None

# Parse every 'total_sqft' entry to a float; unparseable entries become missing.
sqft_numeric = data['total_sqft'].apply(convert_sqft_to_num)
data['total_sqft'] = sqft_numeric

# Discard every row that still contains a missing value.
data = data.dropna(axis=0, how='any')

# Preview the first rows of the cleaned frame.
data.head(5)


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bedrooms
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [58]:
# Feature matrix: every column except the target 'price'.
X = data.drop(columns='price')

In [59]:
# Target vector: the 'price' column.
y = data.loc[:, 'price']

In [60]:
# Names of the object-dtype feature columns; these get one-hot encoded later.
object_features = X.select_dtypes(include='object')
categorical_column = object_features.columns

In [61]:
# Show which feature columns were selected as categorical (object dtype).
categorical_column

Index(['area_type', 'availability', 'location', 'size'], dtype='object')

In [62]:
# Names of the int64/float64 feature columns; these get standardised later.
numeric_dtypes = ['int64', 'float64']
numerical_column = X.select_dtypes(include=numeric_dtypes).columns

In [63]:
# Show which feature columns were selected as numerical (int64 / float64).
numerical_column

Index(['total_sqft', 'bath', 'balcony', 'bedrooms'], dtype='object')

In [75]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), numerical_column)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_column)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [76]:
# Chain the column preprocessing with a support-vector regressor
# (SVR with its default settings) so both are fitted and applied together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [77]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [78]:
# Fit the preprocessing + SVR pipeline on the training split.
pipeline.fit(X=X_train, y=y_train)

In [79]:
# Display the held-out feature rows (the rendered output is truncated by pandas).
X_test

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,bedrooms
4634,Super built-up Area,Ready To Move,Kalyan nagar,3 BHK,1800.0,3.0,2.0,3
10625,Plot Area,Ready To Move,Vivek Nagar,2 Bedroom,1600.0,2.0,1.0,2
4259,Built-up Area,Ready To Move,EPIP Zone,2 BHK,1125.0,2.0,1.0,2
6014,Super built-up Area,18-Aug,Iblur Village,3 BHK,1920.0,3.0,2.0,3
9995,Super built-up Area,Ready To Move,Hoodi,3 BHK,1715.0,2.0,2.0,3
...,...,...,...,...,...,...,...,...
4675,Super built-up Area,Ready To Move,Marathahalli,3 BHK,1595.0,3.0,2.0,3
9979,Super built-up Area,20-Dec,Old Madras Road,2 BHK,1165.0,2.0,1.0,2
356,Super built-up Area,Ready To Move,Banashankari3rd stage bigbazar,3 BHK,1762.0,3.0,1.0,3
9940,Built-up Area,Ready To Move,Gollahalli,2 BHK,1050.0,2.0,1.0,2


In [80]:
# Predict prices for the held-out test features with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [81]:
# Root mean squared error on the test split: the square root puts the error
# back into the units of the target.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(test_mse)

In [82]:
# Show the test-set RMSE computed in the previous cell.
rmse

142.8063921556827

In [83]:
# Q2. You have built an SVM regression model and are trying to decide between using MSE or R-squared as
# your evaluation metric. Which metric would be more appropriate if your goal is to predict the actual price
# of a house as accurately as possible?

When choosing between Mean Squared Error (MSE) and R-squared as an evaluation metric for an SVM regression model, especially in the context of predicting house prices, it's important to consider what each metric primarily measures and what your specific goal is.

1. **Mean Squared Error (MSE):**
   - **Definition**: MSE calculates the average of the squared differences between the predicted and actual values. It directly measures the average magnitude of the prediction errors.
   - **Interpretation**: The lower the MSE, the better the model's performance. A value of 0 indicates perfect predictions.
   - **Usefulness**: MSE is useful when you want to give more weight to larger errors (since it squares the errors). Note that MSE itself is expressed in squared units of the dependent variable; taking its square root (RMSE) gives a metric in the same units as the dependent variable.

2. **R-squared (Coefficient of Determination):**
   - **Definition**: R-squared measures the proportion of the variance in the dependent variable that is predictable from the independent variables. It is a relative measure of fit.
   - **Interpretation**: The closer R-squared is to 1, the better the model explains the variability of the response data around its mean. However, it doesn't necessarily imply accurate predictions.
   - **Usefulness**: R-squared is useful for comparing the fit of different models and for understanding how well the model as a whole explains the variability of the data.

### Choosing the Metric for House Price Prediction

- If your goal is to **predict the actual price of a house as accurately as possible**, MSE (or RMSE, which is the square root of MSE) would generally be more appropriate. This is because MSE directly measures how far your model's predictions are from the actual values, which aligns well with your goal of accurate predictions.

- R-squared, while informative about the model's explanatory power, doesn't directly convey how far off the predictions are in terms of the actual house prices. A high R-squared model can still have significant predictive errors.

### Conclusion

For the specific goal of accurate price prediction, prioritize MSE or RMSE. Both directly measure the size of the prediction errors, and RMSE in particular is expressed in the units of the target variable (house prices), which is crucial for practical, real-world applications like house price prediction.

In [84]:
# Q3. You have a dataset with a significant number of outliers and are trying to select an appropriate
# regression metric to use with your SVM model. Which metric would be the most appropriate in this
# scenario?

When dealing with a dataset that contains a significant number of outliers, the choice of an appropriate regression metric becomes crucial, especially for an SVM model. The key is to choose a metric that is robust against the influence of outliers. Here are the common regression metrics, with a focus on their sensitivity to outliers:

1. **Mean Squared Error (MSE) / Root Mean Squared Error (RMSE):**
   - These metrics are sensitive to outliers since they square the errors before averaging. Therefore, large deviations (outliers) have a disproportionately large effect on the final metric.

2. **Mean Absolute Error (MAE):**
   - MAE calculates the average absolute difference between the predicted values and the actual values. It treats all deviations from the true values equally and is less sensitive to outliers compared to MSE or RMSE.

3. **R-squared (Coefficient of Determination):**
   - While R-squared is a popular metric for evaluating the overall fit of the model, it is computed from squared residuals, so it is also sensitive to outliers. It is more about explaining the variability in the data than about providing a robust measure of error.

4. **Median Absolute Error:**
   - Similar to MAE but uses the median of the absolute differences. It's even more robust against outliers than MAE since the median is unaffected by extreme values.

### Most Appropriate Metric for Outlier-Rich Data

Given that your dataset contains a significant number of outliers, **Mean Absolute Error (MAE)** or **Median Absolute Error** would be more appropriate metrics for your SVM model. Both are robust to outliers:

- **MAE** is generally preferred if you want a metric that is easy to interpret and more commonly used, as it gives an average error.
  
- **Median Absolute Error** can be used if your data contains very extreme outliers, as it completely ignores the influence of these extreme values.

### Conclusion

In summary, for a dataset with many outliers, MAE or Median Absolute Error are advisable choices because of their robustness to the influence of outliers, which will give you a more reliable measure of your SVM model's performance.

In [85]:
# Q4. You have built an SVM regression model using a polynomial kernel and are trying to select the best
# metric to evaluate its performance. You have calculated both MSE and RMSE and found that both values
# are very close. Which metric should you choose to use in this case?

When choosing between Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) for evaluating the performance of your SVM regression model with a polynomial kernel, and you find that both values are very close, the decision should be based on the specific aspects of these metrics that are most relevant to your analysis and interpretation needs.

1. **Mean Squared Error (MSE):**
   - **Definition**: MSE is the average of the squares of the errors between the predicted and actual values.
   - **Interpretation**: It gives more weight to larger errors due to squaring.
   - **Unit**: The unit of MSE is the square of the unit of the target variable.

2. **Root Mean Squared Error (RMSE):**
   - **Definition**: RMSE is the square root of MSE.
   - **Interpretation**: RMSE is more interpretable in the context of the target variable since it's in the same units as the target variable.
   - **Unit**: The unit of RMSE is the same as the unit of the target variable.

### Choosing Between MSE and RMSE

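- Since RMSE is simply the square root of MSE, the two values can only be very close when MSE is near 1 (or near 0), i.e. when the typical prediction error is about one unit of the target variable or smaller. Their closeness is a consequence of the scale of the errors, not a sign that one metric is better than the other.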

- **Interpretability**: RMSE is often preferred because it is in the same units as the target variable, making it more interpretable. For instance, if you are predicting house prices in dollars, RMSE will also be in dollars, which makes it easier to understand the magnitude of the errors.

- **Sensitivity to Larger Errors**: Both metrics penalize larger errors more heavily, because the errors are squared before averaging. Taking the square root does not change this: RMSE is a monotonic transformation of MSE, so the two metrics always rank models in the same order.

### Conclusion

In your case, where both MSE and RMSE are very close, **RMSE** would typically be the more preferable choice due to its interpretability in the same units as the target variable. This ease of interpretation can be particularly valuable in communicating your model's performance to stakeholders who might not be familiar with the technical aspects of these metrics.

In [86]:
# Q5. You are comparing the performance of different SVM regression models using different kernels (linear,
# polynomial, and RBF) and are trying to select the best evaluation metric. Which metric would be most
# appropriate if your goal is to measure how well the model explains the variance in the target variable?

When the goal is to measure how well a model explains the variance in the target variable, the most appropriate evaluation metric is typically the **R-squared (Coefficient of Determination)**. Here's why it's suitable for comparing different SVM regression models with various kernels (linear, polynomial, and RBF):

1. **R-squared (Coefficient of Determination):**
   - **Definition**: R-squared is a statistical measure that represents the proportion of the variance for the dependent variable that's explained by the independent variables in a regression model.
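   - **Interpretation**: An R-squared value of 1 indicates that the regression model perfectly explains all the variability in the target variable. Conversely, a value of 0 indicates that the model explains none of the variability. On held-out data R-squared can even be negative, which means the model predicts worse than always predicting the mean of the target.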
   - **Usefulness**: It provides a measure of how well observed outcomes are replicated by the model, based on the proportion of total variation of outcomes explained by the model.

### Why R-squared is Suitable for Your Goal:

- **Consistency Across Different Models**: R-squared offers a consistent basis for comparison across models with different kernels. It directly measures the proportion of the variance in the dependent variable that is predictable from the independent variables.

- **Interpretability**: It is easy to understand and interpret, even for those who are not statisticians. This makes it useful for presenting your findings to a broader audience.

- **Comparative Evaluation**: When you want to compare the performance of models not just in terms of error minimization but in terms of their ability to explain the variance in the data, R-squared is more informative.

### Limitations:

- **Non-Error Metric**: Unlike MSE or MAE, R-squared is not an error metric per se. It doesn’t tell you how much error your model is making in terms of the units of the target variable.

- **Sensitive to Overfitting**: In some cases, a high R-squared value can be misleading, especially if the model is overfitting.

### Conclusion:

For the purpose of assessing how well different SVM regression models (with different kernels) explain the variance in your target variable, R-squared is the most suitable metric. However, it's often beneficial to look at R-squared in conjunction with other performance metrics like MSE or MAE to get a more comprehensive understanding of model performance.