<a href="https://colab.research.google.com/github/makhmudov-khondamir/Machine-Learning/blob/main/Tashkent_house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine Learning**
# Determining the price of houses in Tashkent.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression

# Load data
df = pd.read_csv('https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv')

# Convert 'size' and 'price' columns to numeric.
# errors='coerce' turns every unparseable entry into NaN, which covers the
# placeholder strings 'Площадьземли:1сот' (in 'size') and 'Договорная' (in 'price').
# Plain assignment is used instead of `df[col].replace(..., inplace=True)`:
# that chained in-place form is deprecated by pandas and does not modify `df`
# when Copy-on-Write is enabled.
df['size'] = pd.to_numeric(df['size'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Fill NaN values in 'price' column based on the mean 'price' of each 'district'
# NOTE(review): this imputes the *target* using district means computed over the
# whole dataset before the train/test split, so test rows influence the values
# and some test targets are synthetic. Consider dropping rows with a missing
# price instead - confirm with the notebook owner before changing row counts.
df['price'] = df.groupby('district')['price'].transform(lambda x: x.fillna(x.mean()))

# Split the data into training and testing sets (80% / 20%, fixed seed)
train_set, test_set = train_test_split(df, random_state=10, test_size=0.2)

In [None]:
# Feature groups: each list is routed to its own preprocessing branch below
numeric = ['rooms', 'size', 'level', 'max_levels']
categorical = ['location', 'district']

# Numeric branch: fill missing values with the column median, then standardize
pipelineNUM = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen during fit are ignored
pipelineCAT = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Apply each branch to its own column group and concatenate the results
fullpipeline = ColumnTransformer(transformers=[
    ('num', pipelineNUM, numeric),
    ('cat', pipelineCAT, categorical),
])

# Training split: separate the target from the features, then fit the
# preprocessing on the training features only
y_train = train_set['price']
x_train = train_set.drop(columns='price')
x_prepared_train = fullpipeline.fit_transform(x_train)

# Train a Linear Regression model on the prepared training features
LR_model = LinearRegression()
LR_model.fit(x_prepared_train, y_train)

# Test split: reuse the already-fitted preprocessing (transform only, no refit)
y_test = test_set['price']
x_test = test_set.drop(columns='price')
x_prepared_test = fullpipeline.transform(x_test)

# Predict prices for the held-out rows
y_predicted = LR_model.predict(x_prepared_test)

# Side-by-side comparison; the target is passed as a plain array, so the
# resulting frame gets a fresh positional index rather than y_test's index
pd.DataFrame({'Predicted Price': y_predicted, 'Actual Price': y_test.to_numpy()})



Unnamed: 0,Predicted Price,Actual Price
0,82560.243882,56000.0
1,87198.278795,68000.0
2,19357.769936,33000.0
3,37623.741574,42000.0
4,48608.474649,51000.0
...,...,...
1508,36685.021191,60000.0
1509,-32671.330526,26000.0
1510,29633.132813,31000.0
1511,109759.613988,40000.0


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on the same prepared features as the linear model.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All reproduces the same forest and the same metrics.
RF_model = RandomForestRegressor(random_state=10)
RF_model.fit(x_prepared_train, y_train)

# Predictions for the held-out rows
rfpredicted = RF_model.predict(x_prepared_test)

# Compare predictions with actual values; passing y_test as a Series keeps
# the original row labels as the index of the displayed frame
pd.DataFrame({'Prognoz': rfpredicted, 'Real baxosi': y_test})

Unnamed: 0,Prognoz,Real baxosi
3928,57695.286450,56000.0
413,68440.833333,68000.0
3582,42134.926667,33000.0
1393,36779.202381,42000.0
1372,58970.000000,51000.0
...,...,...
3104,52372.283333,60000.0
7516,25836.656667,26000.0
621,31649.750000,31000.0
5779,36964.890000,40000.0


In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree on the same prepared features as the other models.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All reproduces the same tree and the same metrics.
DT_model = DecisionTreeRegressor(random_state=10)
DT_model.fit(x_prepared_train, y_train)

# Predictions for the held-out rows
dtpredicted = DT_model.predict(x_prepared_test)

# Compare predictions with actual values; passing y_test as a Series keeps
# the original row labels as the index of the displayed frame
pd.DataFrame({'Prognoz': dtpredicted, 'Real baxosi': y_test})

Unnamed: 0,Prognoz,Real baxosi
3928,63000.0,56000.0
413,68500.0,68000.0
3582,47500.0,33000.0
1393,37000.0,42000.0
1372,65000.0,51000.0
...,...,...
3104,57000.0,60000.0
7516,27000.0,26000.0
621,31000.0,31000.0
5779,42000.0,40000.0


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Mean Absolute Error (MAE) of each model on the held-out test set
maeLR = mean_absolute_error(y_test, y_predicted)
maeRF = mean_absolute_error(y_test, rfpredicted)
maeDT = mean_absolute_error(y_test, dtpredicted)
print('Mean Absolute Error (MAE):')
print(f'maeLR: {maeLR}')
print(f'maeRF: {maeRF}')
print(f'maeDT: {maeDT}')

# Mean Squared Error (MSE) of each model; kept under the original names
mseLR = mean_squared_error(y_test, y_predicted)
mseRF = mean_squared_error(y_test, rfpredicted)
mseDT = mean_squared_error(y_test, dtpredicted)

# Root Mean Squared Error (RMSE) = sqrt(MSE), expressed in the same units as
# the price. The printed values are RMSE, so they are labelled as such
# (previously they were printed under "mse..." labels).
rmseLR = np.sqrt(mseLR)
rmseRF = np.sqrt(mseRF)
rmseDT = np.sqrt(mseDT)
print('\nRoot Mean Squared Error (RMSE):')
print(f'rmseLR: {rmseLR}')
print(f'rmseRF: {rmseRF}')
print(f'rmseDT: {rmseDT}')

Mean Absolute Error (MAE):
maeLR: 38323.037556416915
maeRF: 22424.959562278054
maeDT: 23266.99697471776

Mean Squared error (RMSE):
mseLR: 405882.58045243565
mseRF: 399995.5439662531
mseDT: 400207.9560915857


**CROSS VALIDATION**

In [None]:
# Full dataset (train + test rows together) for cross-validation
y = df['price']
x = df.drop(columns='price')

# Note: this refits the shared `fullpipeline` on the full dataset, replacing
# the state it learned from the training split earlier in the notebook
prepared_x = fullpipeline.fit_transform(x)

In [None]:
from sklearn.model_selection import cross_val_score
def _cv_neg_mse(model, n_folds=5):
    """Cross-validate `model` on the raw features and return per-fold negative MSE.

    The preprocessing transformer and the model are wrapped in one Pipeline and
    cross-validated on the raw `x`, so the imputer, scaler and encoder are fit
    on each fold's training part only. Cross-validating on features that were
    preprocessed with statistics from the full dataset would leak information
    from the validation folds into training.

    Parameters
    ----------
    model : scikit-learn regressor
        Estimator to evaluate.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    Array of negative mean squared errors, one per fold.
    """
    fold_pipeline = Pipeline([('preprocess', fullpipeline), ('model', model)])
    return cross_val_score(fold_pipeline, x, y, scoring='neg_mean_squared_error', cv=n_folds)


# Scores are negative MSE (scikit-learn maximizes scores), so negate before the sqrt
cv_scoresLR = _cv_neg_mse(LR_model)  # LinearRegression
cv_rmse_scoresLR = np.sqrt(-cv_scoresLR)
cv_scoresRF = _cv_neg_mse(RF_model)  # RandomForest
cv_rmse_scoresRF = np.sqrt(-cv_scoresRF)
cv_scoresDT = _cv_neg_mse(DT_model)  # DecisionTree
cv_rmse_scoresDT = np.sqrt(-cv_scoresDT)

In [None]:
def print_cv_results(model_name, scores):
    """Print per-fold RMSE values followed by their mean and standard deviation.

    Parameters
    ----------
    model_name : str
        Label shown in the report header.
    scores : sequence of float
        RMSE value of each cross-validation fold, in fold order.

    Returns
    -------
    None. The report is written to standard output.
    """
    report = [f"Cross Validation: {model_name}"]
    # Folds are numbered from 1 in the report
    report.extend(
        f"Fold {fold_no}: RMSE = {fold_rmse}"
        for fold_no, fold_rmse in enumerate(scores, start=1)
    )
    report.append(f"Mean RMSE: {np.mean(scores)}")
    # The trailing "\n" leaves a blank line between consecutive model reports
    report.append(f"Standard Deviation of RMSE: {np.std(scores)}\n")
    print("\n".join(report))

# Report the cross-validation RMSE summary of every model, in a fixed order
for model_label, rmse_scores in (
    ('LinearRegression', cv_rmse_scoresLR),
    ('RandomForest', cv_rmse_scoresRF),
    ('DecisionTree', cv_rmse_scoresDT),
):
    print_cv_results(model_label, rmse_scores)

Cross Validation: LinearRegression
Fold 1: RMSE = 77907.4338273655
Fold 2: RMSE = 140644.7912161016
Fold 3: RMSE = 87685.60505290533
Fold 4: RMSE = 1342785.7230644955
Fold 5: RMSE = 472523.6459957785
Mean RMSE: 424309.43983132933
Standard Deviation of RMSE: 481601.26273015153

Cross Validation: RandomForest
Fold 1: RMSE = 48754.67770011511
Fold 2: RMSE = 139996.38134152372
Fold 3: RMSE = 48244.81159036987
Fold 4: RMSE = 1348723.3462644832
Fold 5: RMSE = 468597.17653864395
Mean RMSE: 410863.27868702717
Standard Deviation of RMSE: 493740.49398106674

Cross Validation: DecisionTree
Fold 1: RMSE = 45522.01150630155
Fold 2: RMSE = 173437.5213717937
Fold 3: RMSE = 64064.559186490216
Fold 4: RMSE = 1392945.8463434528
Fold 5: RMSE = 468609.69571015245
Mean RMSE: 428915.9268236382
Standard Deviation of RMSE: 505233.7897328459

