In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
from sklearn.impute import SimpleImputer
matplotlib.rcParams["figure.figsize"] = (20,10)



import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # Make sure to import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [32]:
train_data = pd.read_csv('train_.csv')
test_data = pd.read_csv('test_.csv')
avg_rent_data = pd.read_csv('avg_rent_.csv')
dist_from_city_data = pd.read_csv('dist_from_city_centre_.csv')

train_data.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [33]:
# Drop unnecessary columns
train_data.drop(['society', 'availability'], axis=1, inplace=True)
test_data.drop(['society', 'availability'], axis=1, inplace=True)

In [34]:
train_data.head()
test_data.head()

Unnamed: 0,ID,area_type,location,size,total_sqft,bath,balcony
0,0,Super built-up Area,Chamrajpet,2 BHK,650,1.0,1.0
1,1,Super built-up Area,7th Phase JP Nagar,3 BHK,1370,2.0,1.0
2,2,Super built-up Area,Whitefield,3 BHK,1725,3.0,2.0
3,3,Built-up Area,Jalahalli,2 BHK,1000,2.0,0.0
4,4,Plot Area,TC Palaya,1 Bedroom,1350,1.0,0.0


In [35]:
print(train_data.isnull().sum())
print(test_data.isnull().sum())
print(avg_rent_data.isnull().sum())
print(dist_from_city_data.isnull().sum())

ID              0
area_type       0
location        1
size           14
total_sqft      0
bath           65
balcony       504
price           0
dtype: int64
ID              0
area_type       0
location        0
size            2
total_sqft      0
bath            8
balcony       105
dtype: int64
location         0
avg_2bhk_rent    0
dtype: int64
location          0
dist_from_city    0
dtype: int64


In [36]:
# Handle missing values in train and test datasets
train_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='ffill', inplace=True)

# 2. Feature Engineering:

In [37]:
# Merge additional datasets with train and test datasets
train_data = train_data.merge(avg_rent_data, on='location', how='left')
train_data = train_data.merge(dist_from_city_data, on='location', how='left')

In [38]:
test_data = test_data.merge(avg_rent_data, on='location', how='left')
test_data= test_data.merge(dist_from_city_data, on='location', how='left')

In [39]:
train_data.head()
test_data.head()

Unnamed: 0,ID,area_type,location,size,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city
0,0,Super built-up Area,Chamrajpet,2 BHK,650,1.0,1.0,15875.0,6.7
1,1,Super built-up Area,7th Phase JP Nagar,3 BHK,1370,2.0,1.0,,11.0
2,2,Super built-up Area,Whitefield,3 BHK,1725,3.0,2.0,14981.0,17.3
3,3,Built-up Area,Jalahalli,2 BHK,1000,2.0,0.0,11000.0,16.6
4,4,Plot Area,TC Palaya,1 Bedroom,1350,1.0,0.0,,12.2


In [40]:
import pandas as pd

# Assuming you have already loaded the train_data and test_data DataFrames

# Convert 'total_sqft' column to numeric, coerce errors to NaN
train_data['total_sqft'] = pd.to_numeric(train_data['total_sqft'], errors='coerce')
test_data['total_sqft'] = pd.to_numeric(test_data['total_sqft'], errors='coerce')

# Drop rows with NaN values in 'total_sqft' column
train_data.dropna(subset=['total_sqft'], inplace=True)
test_data.dropna(subset=['total_sqft'], inplace=True)

# Assuming 'avg_2bhk_rent' represents the average rent for a 2 BHK apartment
# Perform feature engineering: calculate price per square foot
train_data['price_per_sqft'] = train_data['avg_2bhk_rent'] / train_data['total_sqft']
test_data['price_per_sqft'] = test_data['avg_2bhk_rent'] / test_data['total_sqft']

# Now you can use train_data and test_data with the newly added 'price_per_sqft' feature


In [41]:
train_data.head()
test_data.head()

Unnamed: 0,ID,area_type,location,size,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,price_per_sqft
0,0,Super built-up Area,Chamrajpet,2 BHK,650.0,1.0,1.0,15875.0,6.7,24.423077
1,1,Super built-up Area,7th Phase JP Nagar,3 BHK,1370.0,2.0,1.0,,11.0,
2,2,Super built-up Area,Whitefield,3 BHK,1725.0,3.0,2.0,14981.0,17.3,8.684638
3,3,Built-up Area,Jalahalli,2 BHK,1000.0,2.0,0.0,11000.0,16.6,11.0
4,4,Plot Area,TC Palaya,1 Bedroom,1350.0,1.0,0.0,,12.2,


In [42]:
# Handle missing values
train_data.fillna(0, inplace=True)  # Fill missing values with 0 for simplicity
test_data.fillna(0, inplace=True)

In [43]:
train_data.head()
test_data.head()

Unnamed: 0,ID,area_type,location,size,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,price_per_sqft
0,0,Super built-up Area,Chamrajpet,2 BHK,650.0,1.0,1.0,15875.0,6.7,24.423077
1,1,Super built-up Area,7th Phase JP Nagar,3 BHK,1370.0,2.0,1.0,0.0,11.0,0.0
2,2,Super built-up Area,Whitefield,3 BHK,1725.0,3.0,2.0,14981.0,17.3,8.684638
3,3,Built-up Area,Jalahalli,2 BHK,1000.0,2.0,0.0,11000.0,16.6,11.0
4,4,Plot Area,TC Palaya,1 Bedroom,1350.0,1.0,0.0,0.0,12.2,0.0


In [44]:
# Convert categorical variables to numerical using one-hot encoding
train_data = pd.get_dummies(train_data, columns=['area_type', 'location', 'size'])
test_data = pd.get_dummies(test_data, columns=['area_type', 'location', 'size'])

In [45]:
train_data.head()
test_data.head()

Unnamed: 0,ID,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,price_per_sqft,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,...,size_5 BHK,size_5 Bedroom,size_6 BHK,size_6 Bedroom,size_7 BHK,size_7 Bedroom,size_8 BHK,size_8 Bedroom,size_9 BHK,size_9 Bedroom
0,0,650.0,1.0,1.0,15875.0,6.7,24.423077,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1370.0,2.0,1.0,0.0,11.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1725.0,3.0,2.0,14981.0,17.3,8.684638,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1000.0,2.0,0.0,11000.0,16.6,11.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1350.0,1.0,0.0,0.0,12.2,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Ensure the columns in the test dataset match those in the training dataset
missing_cols = set(X_train.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0

  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[c

  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[c

In [47]:
# Reorder columns to match the order in the training dataset
test_data = test_data[X_train.columns]

In [48]:
# Splitting into features and target variable
X = train_data.drop('price', axis=1)
y = train_data['price']

In [49]:
train_data.head()
test_data.head()

Unnamed: 0,ID,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,price_per_sqft,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,...,size_5 BHK,size_5 Bedroom,size_6 BHK,size_6 Bedroom,size_7 BHK,size_7 Bedroom,size_8 BHK,size_8 Bedroom,size_9 BHK,size_9 Bedroom
0,0,650.0,1.0,1.0,15875.0,6.7,24.423077,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1370.0,2.0,1.0,0.0,11.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1725.0,3.0,2.0,14981.0,17.3,8.684638,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1000.0,2.0,0.0,11000.0,16.6,11.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1350.0,1.0,0.0,0.0,12.2,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Splitting the dataset into the Training set and Test set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)


In [51]:
train_data.head()
test_data.head()

Unnamed: 0,ID,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city,price_per_sqft,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,...,size_5 BHK,size_5 Bedroom,size_6 BHK,size_6 Bedroom,size_7 BHK,size_7 Bedroom,size_8 BHK,size_8 Bedroom,size_9 BHK,size_9 Bedroom
0,0,650.0,1.0,1.0,15875.0,6.7,24.423077,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1370.0,2.0,1.0,0.0,11.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1725.0,3.0,2.0,14981.0,17.3,8.684638,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1000.0,2.0,0.0,11000.0,16.6,11.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1350.0,1.0,0.0,0.0,12.2,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Model Training
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [53]:
# Model Evaluation
y_pred = regressor.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE: {rmse}')

RMSE: 92.19224997475933


In [54]:
# Make predictions on the preprocessed test dataset
test_predictions = regressor.predict(test_data)

# Creating submission file
submission_df = pd.DataFrame({'ID': test_data.index, 'price': test_predictions})
submission_df.to_csv('submission.csv', index=False)