In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

First, we read our raw data.

In [3]:
data = pd.read_csv(r'BakuApartmentData.csv')
data

Unnamed: 0.1,Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
0,0,284000,Azadlıq Prospekti m.,3,140.0,12-May,1,1,1,1
1,1,355000,Şah İsmayıl Xətai m.,3,135.0,19/20,1,1,1,1
2,2,755000,Səbail r.,4,210.0,18-Jul,1,1,1,1
3,3,245000,Elmlər Akademiyası m.,3,86.0,10-Aug,1,1,1,1
4,4,350000,Elmlər Akademiyası m.,4,174.0,15-Dec,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
39297,39297,85500,9-cu mikrorayon q.,2,45.0,9-Sep,0,0,1,0
39298,39298,102500,Memar Əcəmi m.,2,48.0,5-Mar,0,0,1,0
39299,39299,143500,İnşaatçılar m.,3,65.0,5-Mar,0,0,1,0
39300,39300,172000,Elmlər Akademiyası m.,4,90.0,5-Mar,0,0,1,0


Now, we look at its describtion to start our preprocessing.

In [7]:
data.describe(include="all")

Unnamed: 0.1,Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
count,39302.0,39302.0,39302,39302.0,39302.0,39302,39302.0,39302.0,39302.0,39302.0
unique,,,111,,,370,,,,
top,,,İnşaatçılar m.,,,5-May,,,,
freq,,,2834,,,1143,,,,
mean,19650.5,232232.3,,2.813648,106.039311,,0.75589,0.839016,0.768256,0.337947
std,11345.654476,182775.4,,0.91488,59.856534,,0.429564,0.367521,0.421952,0.473016
min,0.0,9600.0,,1.0,12.0,,0.0,0.0,0.0,0.0
25%,9825.25,135000.0,,2.0,65.0,,1.0,1.0,1.0,0.0
50%,19650.5,187000.0,,3.0,94.0,,1.0,1.0,1.0,0.0
75%,29475.75,277000.0,,3.0,130.0,,1.0,1.0,1.0,1.0


Here, we have useless column ["Unnamed: 0"]

In [10]:
data = data.drop(["Unnamed: 0"], axis=1)
data

Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
0,284000,Azadlıq Prospekti m.,3,140.0,12-May,1,1,1,1
1,355000,Şah İsmayıl Xətai m.,3,135.0,19/20,1,1,1,1
2,755000,Səbail r.,4,210.0,18-Jul,1,1,1,1
3,245000,Elmlər Akademiyası m.,3,86.0,10-Aug,1,1,1,1
4,350000,Elmlər Akademiyası m.,4,174.0,15-Dec,1,1,1,1
...,...,...,...,...,...,...,...,...,...
39297,85500,9-cu mikrorayon q.,2,45.0,9-Sep,0,0,1,0
39298,102500,Memar Əcəmi m.,2,48.0,5-Mar,0,0,1,0
39299,143500,İnşaatçılar m.,3,65.0,5-Mar,0,0,1,0
39300,172000,Elmlər Akademiyası m.,4,90.0,5-Mar,0,0,1,0


We see that in our "floor" column, we have meaningless dates. So, we need to keep only days as "floor"

In [13]:
data['floor'] = pd.to_datetime(data['floor'], format='%d-%b', errors='coerce')
data['floor'] = data['floor'].dt.day

Let's check if we have NaN values in our dataset.

In [18]:
data.isnull().sum()

price                  0
location               0
rooms                  0
square                 0
floor               8439
new_building           0
has_repair             0
has_bill_of_sale       0
has_mortgage           0
dtype: int64

We can either delete those rows or just fill with mean.

In [21]:
data["floor"] = data["floor"].fillna(data["floor"].mean())

In [23]:
data.isnull().sum()

price               0
location            0
rooms               0
square              0
floor               0
new_building        0
has_repair          0
has_bill_of_sale    0
has_mortgage        0
dtype: int64

So we have no NaN values in our dataset.

In [26]:
data

Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
0,284000,Azadlıq Prospekti m.,3,140.0,12.000000,1,1,1,1
1,355000,Şah İsmayıl Xətai m.,3,135.0,12.590643,1,1,1,1
2,755000,Səbail r.,4,210.0,18.000000,1,1,1,1
3,245000,Elmlər Akademiyası m.,3,86.0,10.000000,1,1,1,1
4,350000,Elmlər Akademiyası m.,4,174.0,15.000000,1,1,1,1
...,...,...,...,...,...,...,...,...,...
39297,85500,9-cu mikrorayon q.,2,45.0,9.000000,0,0,1,0
39298,102500,Memar Əcəmi m.,2,48.0,5.000000,0,0,1,0
39299,143500,İnşaatçılar m.,3,65.0,5.000000,0,0,1,0
39300,172000,Elmlər Akademiyası m.,4,90.0,5.000000,0,0,1,0


We can observe that we have float values in our "floor" column

In [29]:
data["floor"] = data["floor"].astype(int)

In [31]:
data

Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
0,284000,Azadlıq Prospekti m.,3,140.0,12,1,1,1,1
1,355000,Şah İsmayıl Xətai m.,3,135.0,12,1,1,1,1
2,755000,Səbail r.,4,210.0,18,1,1,1,1
3,245000,Elmlər Akademiyası m.,3,86.0,10,1,1,1,1
4,350000,Elmlər Akademiyası m.,4,174.0,15,1,1,1,1
...,...,...,...,...,...,...,...,...,...
39297,85500,9-cu mikrorayon q.,2,45.0,9,0,0,1,0
39298,102500,Memar Əcəmi m.,2,48.0,5,0,0,1,0
39299,143500,İnşaatçılar m.,3,65.0,5,0,0,1,0
39300,172000,Elmlər Akademiyası m.,4,90.0,5,0,0,1,0


In [33]:
data.describe(include="all")

Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
count,39302.0,39302,39302.0,39302.0,39302.0,39302.0,39302.0,39302.0,39302.0
unique,,111,,,,,,,
top,,İnşaatçılar m.,,,,,,,
freq,,2834,,,,,,,
mean,232232.3,,2.813648,106.039311,12.463819,0.75589,0.839016,0.768256,0.337947
std,182775.4,,0.91488,59.856534,4.672904,0.429564,0.367521,0.421952,0.473016
min,9600.0,,1.0,12.0,1.0,0.0,0.0,0.0,0.0
25%,135000.0,,2.0,65.0,9.0,1.0,1.0,1.0,0.0
50%,187000.0,,3.0,94.0,12.0,1.0,1.0,1.0,0.0
75%,277000.0,,3.0,130.0,16.0,1.0,1.0,1.0,1.0


We see that we lots of unique values in our "location" column. But we can keep only METRO in columns.

In [60]:
data_rq = data[~data['location'].str.endswith('m.')]
data_m = data[data['location'].str.endswith('m.')]

In [58]:
# data_rq.describe(include="all")
data_rq['location'].unique()

array(['Səbail r.', 'Ağ şəhər q.', 'Yasamal r.', 'Nərimanov r.',
       'Yeni Yasamal q.', '7-ci mikrorayon q.', 'Binəqədi r.', 'Bayıl q.',
       'Xətai r.', 'Nəsimi r.', 'Binəqədi q.', '9-cu mikrorayon q.',
       'Əhmədli q.', 'Bakıxanov q.', 'Qaraçuxur q.', 'Badamdar q.',
       'Sabunçu r.', 'Həzi Aslanov q.', 'Suraxanı r.', 'Nizami r.',
       'Hövsan q.', 'Masazır q.', '8-ci mikrorayon q.',
       '4-cü mikrorayon q.', 'Yeni Günəşli q.', 'Yasamal q.',
       '1-ci mikrorayon q.', 'Məmmədli q.', 'Kubinka q.', 'Nardaran q.',
       'Mehdiabad q.', 'Lökbatan q.', 'Biləcəri q.', 'Köhnə Günəşli q.',
       'Kürdəxanı q.', '8-ci kilometr q.', 'Yeni Ramana q.',
       'Ceyranbatan q.', 'Abşeron r.', 'Zığ q.', 'Buzovna q.', 'Çiçək q.',
       'Massiv D q.', 'Ramana q.', 'Günəşli q.', 'Sahil q.', 'Zabrat q.',
       'Massiv A q.', 'Saray q.', 'Sulutəpə q.', '28 May q.',
       'M.Ə.Rəsulzadə q.', 'Xəzər r.', 'Şıxov q.', 'Xutor q.',
       'Massiv V q.', '6-cı mikrorayon q.', 'Novxanı q.'

In [62]:
data_m['location'].unique()

array(['Azadlıq Prospekti m.', 'Şah İsmayıl Xətai m.',
       'Elmlər Akademiyası m.', 'Nizami m.', 'İnşaatçılar m.',
       'Qara Qarayev m.', 'Həzi Aslanov m.', 'Əhmədli m.', 'Koroğlu m.',
       'Memar Əcəmi m.', 'Gənclik m.', '8 Noyabr m.', '20 Yanvar m.',
       'Nəsimi m.', '28 May m.', 'Avtovağzal m.', 'Nəriman Nərimanov m.',
       'Neftçilər m.', 'Xalqlar Dostluğu m.', 'İçəri Şəhər m.',
       'Dərnəgül m.', 'Sahil m.', 'Bakmil m.', 'Xocəsən m.', 'Ulduz m.'],
      dtype=object)

In [64]:
data_m.describe(include="all")

Unnamed: 0,price,location,rooms,square,floor,new_building,has_repair,has_bill_of_sale,has_mortgage
count,26825.0,26825,26825.0,26825.0,26825.0,26825.0,26825.0,26825.0,26825.0
unique,,25,,,,,,,
top,,İnşaatçılar m.,,,,,,,
freq,,2834,,,,,,,
mean,240584.9,,2.825983,107.532675,12.671351,0.749413,0.850475,0.781733,0.337223
std,187132.2,,0.910588,61.109333,4.661885,0.433359,0.356612,0.413077,0.47277
min,9600.0,,1.0,12.0,1.0,0.0,0.0,0.0,0.0
25%,141000.0,,2.0,66.0,9.0,0.0,1.0,1.0,0.0
50%,195000.0,,3.0,96.0,12.0,1.0,1.0,1.0,0.0
75%,283000.0,,3.0,132.0,16.0,1.0,1.0,1.0,1.0
