<a href="https://colab.research.google.com/github/manekgnath/Machine-Learning/blob/main/ML_project_2_Linear_Regression_Ames_House_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [None]:
# Read the tab-separated Ames housing file from the Colab working directory
data = pd.read_csv("/content/AmesHousing.tsv", sep="\t")

In [None]:
# Remove identifier columns and the columns with too many missing values
columns_to_discard = [
    'Order', 'PID',
    'Alley', 'Pool QC', 'Fence', 'Misc Feature', 'Fireplace Qu',
]
data = data.drop(columns=columns_to_discard)

In [None]:
# Summarise the remaining columns: name, non-null count and dtype of each
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 75 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MS SubClass      2930 non-null   int64  
 1   MS Zoning        2930 non-null   object 
 2   Lot Frontage     2440 non-null   float64
 3   Lot Area         2930 non-null   int64  
 4   Street           2930 non-null   object 
 5   Lot Shape        2930 non-null   object 
 6   Land Contour     2930 non-null   object 
 7   Utilities        2930 non-null   object 
 8   Lot Config       2930 non-null   object 
 9   Land Slope       2930 non-null   object 
 10  Neighborhood     2930 non-null   object 
 11  Condition 1      2930 non-null   object 
 12  Condition 2      2930 non-null   object 
 13  Bldg Type        2930 non-null   object 
 14  House Style      2930 non-null   object 
 15  Overall Qual     2930 non-null   int64  
 16  Overall Cond     2930 non-null   int64  
 17  Year Built    

In [None]:
# Column groups used by the preprocessing cells.
# `numerical_columns` is mean-imputed, `categorical_columns` is mode-imputed
# and then one-hot encoded. Columns named in neither list are left untouched
# by those steps.
numerical_columns = [
    'Lot Frontage', 'Lot Area',
    'Year Built', 'Year Remod/Add',
    'Mas Vnr Area',
    'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
    'Garage Area',
    'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
    'Pool Area', 'Misc Val',
    'SalePrice',
]
categorical_columns = [
    'MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities',
    'Lot Config', 'Land Slope', 'Neighborhood',
    'Condition 1', 'Condition 2',
    'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl',
    'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Exter Qual', 'Exter Cond', 'Foundation',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating', 'Heating QC', 'Central Air', 'Electrical',
    'Kitchen Qual', 'Functional',
    'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Paved Drive',
    'Sale Type', 'Sale Condition',
]

In [None]:
# Handling missing values
#
# Only predictor columns are imputed. 'SalePrice' is the regression target:
# filling a missing target with the column mean would fabricate a label, so it
# is excluded here and any row lacking it is removed by the later dropna().
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split, so test rows influence the fill values.
feature_numeric_columns = [col for col in numerical_columns if col != 'SalePrice']

# Numeric predictors: replace missing entries with the column mean
num_imputer = SimpleImputer(strategy='mean')
data[feature_numeric_columns] = num_imputer.fit_transform(data[feature_numeric_columns])

# Categorical predictors: replace missing entries with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_columns] = cat_imputer.fit_transform(data[categorical_columns])

In [None]:
# Encoding categorical variables
#
# sparse_output=False requests a dense array so the result can be wrapped in a
# DataFrame directly.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cat = pd.DataFrame(
    encoder.fit_transform(data[categorical_columns]),
    # Name every dummy column after its source feature and category instead
    # of the default integer labels 0..N-1, so model coefficients can be
    # traced back to a feature.
    columns=encoder.get_feature_names_out(categorical_columns),
    # Carry over the row labels of `data` so a column-wise concat aligns
    # row by row.
    index=data.index,
)

In [None]:
# Give the one-hot frame the same row labels as `data`

encoded_cat = encoded_cat.set_axis(data.index, axis=0)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts

non_categorical = data.drop(columns=categorical_columns)
data = pd.concat([non_categorical, encoded_cat], axis=1)

In [None]:
# Total number of missing cells still present across the whole frame
data.isna().sum().sum()

164

In [None]:
# Discard every row that still contains at least one missing value
data = data.dropna()

In [None]:
# Split the frame into the target series and the predictor matrix

y = data['SalePrice']
X = data.drop(columns='SalePrice')

In [None]:
# Make every column label a string
# (presumably because scikit-learn rejects frames with mixed-type column names)
X = X.rename(columns=str)

In [None]:
# Numeric predictor columns to be standardised (the target is not included)
numerical_columns2 = [
    'Lot Frontage', 'Lot Area',
    'Year Built', 'Year Remod/Add',
    'Mas Vnr Area',
    'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
    'Garage Area',
    'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
    'Pool Area', 'Misc Val',
]

In [None]:
# Scaling numerical features
#
# The scaler must learn its mean/std from training rows only; fitting it on
# every row would leak test-set statistics into the features the model is
# trained on. The split itself happens in a later cell, so the training row
# positions are derived here with the SAME test_size and random_state: for an
# equal number of rows train_test_split then selects the same positions.
# Keep these two arguments in sync with the split cell.
train_pos, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(X.iloc[train_pos][numerical_columns2])

# Apply the training-set statistics to all rows (train and test alike)
X[numerical_columns2] = scaler.transform(X[numerical_columns2])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a linear regression model on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# R^2 on the training split itself (not a held-out estimate)
r2_score(y_train, model.predict(X_train))

0.9229249690786273

In [None]:
# Predict the target for the held-out test rows

y_pred = model.predict(X=X_test)


In [None]:
# Score the test-set predictions: R^2, mean squared error, mean absolute error

r2, mse, mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)


In [None]:

# Report the three test-set metrics, one per line

print(
    f"R^2 Score: {r2}",
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    sep="\n",
)

R^2 Score: 0.8047880721686262
Mean Squared Error (MSE): 1355962639.8759801
Mean Absolute Error (MAE): 17068.45041732838


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:

# Re-create the 80/20 train/test split with the fixed seed 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Ridge (L2-regularised) linear model; alpha sets the penalty strength
model = Ridge(alpha=1.0)
model.fit(X=X_train, y=y_train)

# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

# Score the predictions: R^2, mean squared error, mean absolute error
r2, mse, mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

# Report the three metrics, one per line
print(
    f"R^2 Score: {r2}",
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    sep="\n",
)


R^2 Score: 0.8178006010985873
Mean Squared Error (MSE): 1265576241.4865634
Mean Absolute Error (MAE): 16471.258909584623
