<a href="https://colab.research.google.com/github/mdrakibhossain091b1/DATA-RESEARCH-/blob/main/Colab%20Code/Data_Preprocessing_and_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
# data load
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [59]:
# load housing data
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Dataset/Housing2.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
# How many houses are there?
number_of_house = len(df)
print(f"Number of houses: {number_of_house}")
print(f"\nFirst 3 houses: {df.head(3)}")
# Show all column names
print(f"\nColumns:\n\n{df.columns}")
# Show data types of columns
print("\nDataType:\n\n", df.dtypes)

Number of houses: 545

First 3 houses:       price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  

Columns:

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

DataType:

 price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories      

In [66]:
#Convert Yes/No to 1/0
# Check what's in our data
print("Current mainroad values:\n", df['mainroad'].head())
# Models need numbers, not text!
# We need: yes → 1, no → 0

label_encoder = LabelEncoder()
df['mainroad'] = label_encoder.fit_transform(df['mainroad'])
print("\nAfter Label Encoding 'mainroad':")
print(df['mainroad'])

df['guestroom'] = label_encoder.fit_transform(df['guestroom'])
print("\nAfter Label Encoding 'guestroom':")
print(df['guestroom'])

df['basement'] = label_encoder.fit_transform(df['basement'])
print("\nAfter Label Encoding 'basement':")
print(df['basement'])

df['hotwaterheating'] = label_encoder.fit_transform(df['hotwaterheating'])
print("\nAfter Label Encoding 'hotwaterheatingt':")
print(df['hotwaterheating'])

df['airconditioning'] = label_encoder.fit_transform(df['airconditioning'])
print("\nAfter Label Encoding 'airconditioning':")
print(df['airconditioning'])

df['prefarea'] = label_encoder.fit_transform(df['prefarea'])
print("\nAfter Label Encoding 'prefarea':")
print(df['prefarea'])

furnishingstatus_encoded = pd.get_dummies(df['furnishingstatus'], prefix = 'furnishingstatus')

df= pd.concat([df.drop('furnishingstatus', axis=1), furnishingstatus_encoded], axis=1)

print("\nDataType:\n\n", df.dtypes)


Current mainroad values:
 0    1
1    1
2    1
3    1
4    1
Name: mainroad, dtype: int64

After Label Encoding 'mainroad':
0      1
1      1
2      1
3      1
4      1
      ..
540    1
541    0
542    1
543    0
544    1
Name: mainroad, Length: 545, dtype: int64

After Label Encoding 'guestroom':
0      0
1      0
2      0
3      0
4      1
      ..
540    0
541    0
542    0
543    0
544    0
Name: guestroom, Length: 545, dtype: int64

After Label Encoding 'basement':
0      0
1      0
2      1
3      1
4      1
      ..
540    1
541    0
542    0
543    0
544    0
Name: basement, Length: 545, dtype: int64

After Label Encoding 'hotwaterheatingt':
0      0
1      0
2      0
3      0
4      0
      ..
540    0
541    0
542    0
543    0
544    0
Name: hotwaterheating, Length: 545, dtype: int64

After Label Encoding 'airconditioning':
0      1
1      1
2      0
3      1
4      1
      ..
540    0
541    0
542    0
543    0
544    0
Name: airconditioning, Length: 545, dtype: int64

Aft

In [84]:


# Split into Train and Test
# Split - 80% train, 20% test
x = df.drop('price', axis=1)
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size= 0.2,
    random_state=20
)
print(f"\nWe will train with: {len(x_train)} houses")
print(f"We will test with: {len(x_test)} houses")


We will train with: 436 houses
We will test with: 109 houses


In [88]:
#Apply StandardScaler
# Area is huge (1650-13300), Bedrooms is small (1-6)!

scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_test_scaler = scaler.transform(x_test)
print("\nAfter scaling:")
print("All features now have similar range!")
print(f"Example - First house's mainroad before scaling: {x_train.values[0][6]}")
print(f"Example - First house's mainroad after scaling: {x_train_scaler[0][6]:.2f}")


After scaling:
All features now have similar range!
Example - First house's mainroad before scaling: 0
Example - First house's mainroad after scaling: -0.75


In [100]:
#train and test model

model = LogisticRegression(max_iter=100)
model.fit(x_train_scaler, y_train)
print("✅ Model trained!")
# Check accuracy on training data
train_score= model.score(x_train_scaler, y_train)
print(f'Training accuracy {train_score:.1%}')
# Check accuracy on test data
test_score = model.score(x_test_scaler, y_test)
print(f'Test accuracy {test_score:.1%}')

✅ Model trained!
Training accuracy 47.2%
Test accuracy 1.8%


In [103]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
     model,
    x_train_scaler,
    y_train,
    cv= 5
)

print("5 mini-test scores:")
for i in range(5):
    print(f"  Test {i+1}: {scores[i]:.1%}")

print(f"\nAverage: {scores.mean():.1%}")
print(f"This means our model is {scores.mean():.1%} accurate!")



5 mini-test scores:
  Test 1: 3.4%
  Test 2: 3.4%
  Test 3: 2.3%
  Test 4: 2.3%
  Test 5: 1.1%

Average: 2.5%
This means our model is 2.5% accurate!


In [107]:
import joblib

# Save the model
joblib.dump(model, 'my_model.pkl')
print("✅ Model saved as 'my_model.pkl'")

# Save the scaler
joblib.dump(scaler, 'my_scaler.pkl')
print("✅ Scaler saved as 'my_scaler.pkl'")

# Save scaled data as CSV
pd.DataFrame(x_test_scaler).to_csv('x_test_scaler.csv', index=False)
print("✅ Scaled data saved as 'train_data_scaled.csv'")

✅ Model saved as 'my_model.pkl'
✅ Scaler saved as 'my_scaler.pkl'
✅ Scaled data saved as 'train_data_scaled.csv'


In [118]:
# Predict a House Price
# New house details:
# area=5000, bedrooms=3, bathrooms=2, stories=2,
# mainroad=yes, guestroom=no, basement=yes,
# hotwaterheating=no, airconditioning=yes,
# parking=2, prefarea=yes, furnishingstatus=furnished
#your code here
# Load saved model and scaler
model = joblib.load('my_model.pkl')
scaler = joblib.load('my_scaler.pkl')

new_data = pd.DataFrame({
    'area': [5000],
    'bedrooms': [3],
    'bathrooms': [2],
    'stories': [2],
    'mainroad': [1],
    'guestroom': [0],
    'basement': [1],
    'hotwaterheating': [0],
    'airconditioning': [1],
    'parking': [2],
    'prefarea': [1],
    'furnishingstatus_furnished': [1],
    'furnishingstatus_semi-furnished': [0],
    'furnishingstatus_unfurnished': [0]
})

new_house_scaler = scaler.transform(new_data)
prediction = model.predict(new_house_scaler)

print(prediction)


New House information:
[12215000]
