In [1]:
# Loading data libs for data preprocessesing and I/O functions
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score


russianRealEstate2021Dataset = pd.read_csv('Russia_Real_Estate_2021_main.csv', sep=r',|;', engine='python')
russianRealEstate2021Dataset.head()

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,geo_lat,geo_lon,building_type,object_type,postal_code,street_id,id_region,house_id
0,2021-12-31,6439970,17,24,3,77.59,0.0,47.25679,39.71233,0,2,344068.0,,61,
1,2021-12-31,2994720,6,24,-1,29.36,0.0,47.25679,39.71233,0,2,344068.0,,61,
2,2021-12-31,3971520,24,24,1,41.37,0.0,47.25679,39.71233,0,2,344068.0,,61,
3,2021-12-31,3910500,5,24,1,43.45,0.0,47.25679,39.71233,0,2,344068.0,,61,
4,2021-12-31,6956950,11,24,3,76.45,0.0,47.25679,39.71233,0,2,344068.0,,61,


In [2]:
russianRealEstate2021Dataset.drop_duplicates()

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,geo_lat,geo_lon,building_type,object_type,postal_code,street_id,id_region,house_id
0,2021-12-31,6439970,17,24,3,77.59,0.0,47.256790,39.712330,0,2,344068.0,,61,
1,2021-12-31,2994720,6,24,-1,29.36,0.0,47.256790,39.712330,0,2,344068.0,,61,
2,2021-12-31,3971520,24,24,1,41.37,0.0,47.256790,39.712330,0,2,344068.0,,61,
3,2021-12-31,3910500,5,24,1,43.45,0.0,47.256790,39.712330,0,2,344068.0,,61,
4,2021-12-31,6956950,11,24,3,76.45,0.0,47.256790,39.712330,0,2,344068.0,,61,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2021-12-31,6350000,3,5,3,85.00,12.0,53.331843,83.783791,0,0,656043.0,556952.5,22,1.172190e+06
996,2021-12-31,6099000,4,9,3,65.00,0.0,56.041539,92.753133,0,0,660030.0,581436.0,24,8.570030e+05
997,2021-12-31,2490000,1,10,2,56.90,0.0,55.169949,61.519210,0,0,454079.0,274414.0,74,1.820769e+06
998,2021-12-31,850000,2,2,2,37.00,5.0,55.946206,43.088179,0,0,606101.0,190983.0,52,9.583290e+05


In [3]:
russianRealEstate2021Dataset['area'] = russianRealEstate2021Dataset['area'].astype(int)
russianRealEstate2021Dataset['geo_lat'] = russianRealEstate2021Dataset['geo_lat'].astype(int)
russianRealEstate2021Dataset['geo_lon'] = russianRealEstate2021Dataset['geo_lon'].astype(int)
russianRealEstate2021Dataset['postal_code'] = russianRealEstate2021Dataset['postal_code'].astype(int)

In [4]:
# Using existing values in the Russian Real Estate Dataset to fill the missing values
# To ignore any errors like - TypeError: Cannot interpolate with all object-dtype columns in the DataFrame. Try setting at least one column to a numeric dtype.
try:
    russianRealEstate2021Dataset.interpolate(method ='linear', limit_direction ='forward', inplace=True)
except ValueError:
    print("Please ignore this error!")
except TypeError:
    print("Please ignore this error!")

In [5]:
# Removing the house_id column, because it is of no use in our prediction
# del russianRealEstate2021Dataset['house_id'] #OR
russianRealEstate2021Dataset.drop('house_id', inplace=True, axis=1, errors='ignore')
russianRealEstate2021Dataset.drop('street_id', inplace=True, axis=1, errors='ignore')
russianRealEstate2021Dataset.drop('id_region', inplace=True, axis=1, errors='ignore')
russianRealEstate2021Dataset.drop('date', inplace=True, axis=1, errors='ignore')

russianRealEstate2021DatasetSplitOne = np.array_split(russianRealEstate2021Dataset, 2)[0]
russianRealEstate2021DatasetSplitTwo = np.array_split(russianRealEstate2021Dataset, 2)[1]

print("R2 Score:", r2_score(russianRealEstate2021DatasetSplitOne, russianRealEstate2021DatasetSplitTwo))

R2 Score: -1.1924001152773016


In [6]:
# Split the data into train and test
trainingData, testingData = train_test_split(russianRealEstate2021Dataset, test_size=0.2, random_state=1)
print("Training dataset size:", len(trainingData))
print("Test dataset size:", len(testingData))


Training dataset size: 800
Test dataset size: 200


In [7]:
quartile1 = np.percentile(trainingData['rooms'], 25, interpolation = 'midpoint')
quartile3 = np.percentile(trainingData['rooms'], 75, interpolation = 'midpoint')
iqr = quartile3 - quartile1
 

In [8]:
# Upper bound
upperBound = quartile3+1.5*iqr
arrayOfUpperBound=np.array(trainingData['rooms'] >= upperBound)

In [9]:
# Lower bound
lowerBound=quartile1-1.5*iqr
arrayOfLowerBound=np.array(trainingData['rooms'] <= lowerBound)

In [10]:
# Removing the outliers, since data was not needed to drop from lower end this ommiteds
trainingData = trainingData[trainingData['rooms'] > arrayOfUpperBound]
trainingData.shape


testBuildingData = [5, 7, 3, 93.0, 12.0, 55.8821949, 37.264095, 0, 143442.0]

linearRegressionModel = linear_model.LinearRegression()
logisticRegressionModel = LogisticRegression()

linearRegressionModel.fit(trainingData[['level', 'levels', 'rooms', 'area', 'kitchen_area', 'geo_lat', 'geo_lon', 'building_type', 'postal_code']], trainingData['price'])
logisticRegressionModel.fit(trainingData[['level', 'levels', 'rooms', 'area', 'kitchen_area', 'geo_lat', 'geo_lon', 'building_type', 'postal_code']], trainingData['price'])

print("\n Coefficient: ", linearRegressionModel.coef_)
print("\n Intercept: ", linearRegressionModel.intercept_)


 Coefficient:  [ 6.57140382e+04  1.11952166e+05 -3.16118710e+06  2.74300898e+05
 -9.49072566e+04  7.52959329e+04  1.44345929e+04 -2.77254623e+05
 -1.14704361e+01]

 Intercept:  -3161516.4334746115


In [11]:
# All columns for dataset
# price	level	levels	rooms	area	 kitchen_area	geo_lat	geo_lon	building_type	object_type	postal_code
# Collected from Test Dataset(Sample Data): 5, 7, 3, 93.0, 12.0, 55.8821949, 37.264095, 0, 143442.0
# Linear Regression: 15899434
# Logistic Regression: 2950000
print("\n Original  Price for house is: 11000000")
print("\n Price for house using Linear Regression:", np.floor(linearRegressionModel.predict([testBuildingData])))
print("\n Price for house using Logistic Regression:", np.floor(logisticRegressionModel.predict([testBuildingData])))

print("\nPrediction Model has been successfully finished!")



 Original  Price for house is: 11000000

 Price for house using Linear Regression: [15938505.]

 Price for house using Logistic Regression: [2950000.]

Prediction Model has been successfully finished!


