In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

First, we import the data

In [2]:
# Load the dataset
data = pd.read_csv(r'C:\Users\paul zhan\Desktop\house_price\Housing.csv')

Let's display the data.

In [3]:
print(data)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

Now, we will process the data and see if there are missing value.

In [4]:
# Check for missing values
missing_values = data.isnull()  # or data.isna()

# Count the missing values in each column
missing_count = missing_values.sum()

# Display the count of missing values for each column
print(missing_count)

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


I found out that the linear regression dont take none numerical value so we are going to cconvert yes and no into binary.

In [5]:
# Define a mapping for 'yes' and 'no' to 1 and 0
mapping = {'yes': 1, 'no': 0}

# Apply the mapping to specific columns
columns_to_convert = ['mainroad', 'guestroom', 'basement','hotwaterheating','airconditioning', 'prefarea']  # List the columns you want to convert

data[columns_to_convert] = data[columns_to_convert].applymap(mapping.get)

In [6]:
print(data)

        price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0    13300000  7420         4          2        3         1          0   
1    12250000  8960         4          4        4         1          0   
2    12250000  9960         3          2        2         1          0   
3    12215000  7500         4          2        2         1          0   
4    11410000  7420         4          1        2         1          1   
..        ...   ...       ...        ...      ...       ...        ...   
540   1820000  3000         2          1        1         1          0   
541   1767150  2400         3          1        1         0          0   
542   1750000  3620         2          1        1         1          0   
543   1750000  2910         3          1        1         0          0   
544   1750000  3850         3          1        2         1          0   

     basement  hotwaterheating  airconditioning  parking  prefarea  \
0           0                0           

we can see that the data contains no missing value.

let's split the training set and the testing set.  what we want is house price therefore the target Y will be price the rest will be in X since it might be useful information for the training

In [25]:
# Separate features (X) and target (y)
X = data[["area", "bedrooms", "bathrooms", "stories", "mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "parking", "prefarea"]]
y = data['price']

In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a linear regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)
print(X_test)

# Make predictions on the test data
y_pred = model.predict(X_test)


# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


     area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
316  5900         4          2        2         0          0         1   
77   6500         3          2        3         1          0         0   
360  4040         2          1        1         1          0         0   
90   5000         3          1        2         1          0         0   
493  3960         3          1        1         1          0         0   
..    ...       ...        ...      ...       ...        ...       ...   
15   6000         4          1        2         1          0         1   
357  6930         4          1        2         0          0         0   
39   6000         4          2        4         1          0         0   
54   6000         3          2        2         1          1         0   
155  6100         3          2        1         1          0         1   

     hotwaterheating  airconditioning  parking  prefarea  
316                0                0        1      

Now, lets make a prediction for house price. 

In [9]:
data_for_pred = pd.read_csv(r"C:\Users\paul zhan\Desktop\house_dataset_pred.csv")

In [10]:
print(data_for_pred)

    area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0   1140         4          2        3      yes       yes       no   
1   3000         4          4        4      yes        no       no   
2   9960         3          2        2      yes        no      yes   
3   4000         4          2        2       no        no      yes   
4  16000         4          1        2      yes       yes      yes   
5  23000         3          3        1      yes        no      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        0      yes        furnished  
1              no             yes        3       no        furnished  
2             yes              no        1      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
5              no             yes        2      yes   semi-furnished  


In [11]:
# Define a mapping for 'yes' and 'no' to 1 and 0
mapping = {'yes': 1, 'no': 0}

# Apply the mapping to specific columns
columns_to_convert = ['mainroad', 'guestroom', 'basement','hotwaterheating','airconditioning', 'prefarea']  # List the columns you want to convert

data_for_pred[columns_to_convert] = data_for_pred[columns_to_convert].applymap(mapping.get)

In [12]:
print(data_for_pred)

    area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
0   1140         4          2        3         1          1         0   
1   3000         4          4        4         1          0         0   
2   9960         3          2        2         1          0         1   
3   4000         4          2        2         0          0         1   
4  16000         4          1        2         1          1         1   
5  23000         3          3        1         1          0         1   

   hotwaterheating  airconditioning  parking  prefarea furnishingstatus  
0                0                1        0         1        furnished  
1                0                1        3         0        furnished  
2                1                0        1         1   semi-furnished  
3                0                1        3         1        furnished  
4                0                1        2         0        furnished  
5                0                1        2

In [41]:
X_pred = data_for_pred[["area", "bedrooms", "bathrooms", "stories", "mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "parking", "prefarea"]]
print(X_pred)

    area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
0   1140         4          2        3         1          1         0   
1   3000         4          4        4         1          0         0   
2   9960         3          2        2         1          0         1   
3   4000         4          2        2         0          0         1   
4  16000         4          1        2         1          1         1   
5  23000         3          3        1         1          0         1   

   hotwaterheating  airconditioning  parking  prefarea  
0                0                1        0         1  
1                0                1        3         0  
2                1                0        1         1  
3                0                1        3         1  
4                0                1        2         0  
5                0                1        2         1  


In [43]:
prediction = model.predict(X_pred)
print(prediction)

[ 6113365.28326472  9063003.93293962  8047270.19177733  6887358.49033546
  8402611.83930149 12173025.74073119]
