In [1]:
!pip install pandas numpy matplotlib scikit-learn





In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

print("Libraries Imported Successfully! ✅")


Libraries Imported Successfully! ✅


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data
num_samples = 1000

sqft = np.random.randint(500, 5000, num_samples)  # House size in square feet
bedrooms = np.random.randint(1, 6, num_samples)  # Number of bedrooms
bathrooms = np.random.randint(1, 4, num_samples)  # Number of bathrooms
age = np.random.randint(0, 100, num_samples)  # Age of the house

# Categorical data for location
locations = ['Downtown', 'Suburb', 'Countryside']
location = np.random.choice(locations, num_samples)

# Define a basic price formula with some randomness
price = (sqft * 300) + (bedrooms * 10000) + (bathrooms * 5000) - (age * 200) + np.random.randint(-50000, 50000, num_samples)

# Create DataFrame
df = pd.DataFrame({
    'sqft': sqft,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'age': age,
    'location': location,
    'price': price
})

# Display first few rows
df.head()


Unnamed: 0,sqft,bedrooms,bathrooms,age,location,price
0,1360,2,1,31,Countryside,466820
1,4272,3,3,66,Countryside,1288468
2,3592,1,2,66,Downtown,1100127
3,966,1,2,27,Downtown,345010
4,4926,2,1,43,Suburb,1514653


In [4]:
# Check dataset information
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sqft       1000 non-null   int32 
 1   bedrooms   1000 non-null   int32 
 2   bathrooms  1000 non-null   int32 
 3   age        1000 non-null   int32 
 4   location   1000 non-null   object
 5   price      1000 non-null   int32 
dtypes: int32(5), object(1)
memory usage: 27.5+ KB


In [5]:
df.describe()


Unnamed: 0,sqft,bedrooms,bathrooms,age,price
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2815.422,2.99,1.973,49.713,873971.8
std,1255.514921,1.427564,0.820332,28.533328,378230.7
min,503.0,1.0,1.0,0.0,153292.0
25%,1749.5,2.0,1.0,25.0,547036.5
50%,2862.5,3.0,2.0,50.0,880084.0
75%,3849.5,4.0,3.0,74.0,1186257.0
max,4999.0,5.0,3.0,99.0,1573378.0


In [7]:
df.isnull().sum()

sqft         0
bedrooms     0
bathrooms    0
age          0
location     0
price        0
dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

# Select columns that need scaling (all numeric features)
features_to_scale = ['sqft', 'bedrooms', 'bathrooms', 'age']

# Fit and transform the scaler to the selected features
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Check the first few rows to verify
df.head()


Unnamed: 0,sqft,bedrooms,bathrooms,age,location,price
0,-1.159803,-0.693836,-1.186699,-0.656158,Countryside,466820
1,1.160724,0.007008,1.252559,0.571092,Countryside,1288468
2,0.618843,-1.394681,0.03293,0.571092,Downtown,1100127
3,-1.473776,-1.394681,0.03293,-0.796415,Downtown,345010
4,1.681887,-0.693836,-1.186699,-0.235386,Suburb,1514653


In [9]:
# Calculate the IQR for relevant columns
Q1 = df[features_to_scale].quantile(0.25)
Q3 = df[features_to_scale].quantile(0.75)
IQR = Q3 - Q1

# Identify outliers using the IQR rule
outliers = ((df[features_to_scale] < (Q1 - 1.5 * IQR)) | (df[features_to_scale] > (Q3 + 1.5 * IQR)))

# Check how many outliers are present
outliers_count = outliers.sum()

outliers_count


sqft         0
bedrooms     0
bathrooms    0
age          0
dtype: int64

In [10]:
# Log transform the 'price' feature if it's skewed
df['price'] = df['price'].apply(lambda x: np.log(x))

# Verify the change
df.head()


Unnamed: 0,sqft,bedrooms,bathrooms,age,location,price
0,-1.159803,-0.693836,-1.186699,-0.656158,Countryside,13.053699
1,1.160724,0.007008,1.252559,0.571092,Countryside,14.068964
2,0.618843,-1.394681,0.03293,0.571092,Downtown,13.910936
3,-1.473776,-1.394681,0.03293,-0.796415,Downtown,12.751329
4,1.681887,-0.693836,-1.186699,-0.235386,Suburb,14.230697


In [11]:
# Apply one-hot encoding to the 'location' column
df = pd.get_dummies(df, columns=['location'], drop_first=True)

# Check the first few rows to ensure the encoding worked
df.head()


Unnamed: 0,sqft,bedrooms,bathrooms,age,price,location_Downtown,location_Suburb
0,-1.159803,-0.693836,-1.186699,-0.656158,13.053699,False,False
1,1.160724,0.007008,1.252559,0.571092,14.068964,False,False
2,0.618843,-1.394681,0.03293,0.571092,13.910936,True,False
3,-1.473776,-1.394681,0.03293,-0.796415,12.751329,True,False
4,1.681887,-0.693836,-1.186699,-0.235386,14.230697,False,True


In [12]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop('price', axis=1)  # Drop 'price' column to create features
y = df['price']  # 'price' is the target variable

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the train and test sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (800, 6)
X_test shape: (200, 6)
y_train shape: (800,)
y_test shape: (200,)


In [13]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Check the predictions
print(f"Predictions on test data: {y_pred[:5]}")


Predictions on test data: [14.04351937 13.3255599  14.2998671  14.41648852 13.84173334]


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate R-Squared (R²) score
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")


Mean Absolute Error (MAE): 0.11516222988718675
Mean Squared Error (MSE): 0.02195296716011054
R-Squared (R²): 0.9260408732124527
