# Import data

In [68]:
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype

df = pd.read_csv('./csv_files/properties.csv', low_memory=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75876 entries, 0 to 75875
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   source              75876 non-null  int64  
 1   hyperlink           59908 non-null  object 
 2   locality            75874 non-null  object 
 3   postcode            68464 non-null  float64
 4   house_is            75876 non-null  object 
 5   property_subtype    75876 non-null  object 
 6   price               74405 non-null  object 
 7   sale                25810 non-null  object 
 8   rooms_number        74502 non-null  object 
 9   area                72349 non-null  object 
 10  kitchen_has         69763 non-null  object 
 11  furnished           69438 non-null  object 
 12  open_fire           70005 non-null  object 
 13  terrace             61138 non-null  object 
 14  terrace_area        64583 non-null  object 
 15  garden              67421 non-null  object 
 16  gard

# Check if there are columns with mixed data types ==> NO

In [69]:
# print data type of each column to check if there are
# any mixed ones, turns out that there are none
def is_mixed(col):
    return infer_dtype(col)

df.apply(is_mixed)

# ==> there are no columns with 'mixed' part of the inferred datatype

source                 integer
hyperlink               string
locality                string
postcode              floating
house_is                string
property_subtype        string
price                   string
sale                    string
rooms_number            string
area                    string
kitchen_has            boolean
furnished              boolean
open_fire              boolean
terrace                 string
terrace_area            string
garden                  string
garden_area             string
land_surface            string
land_plot_surface       string
facades_number          string
swimming_pool_has      boolean
building_state          string
dtype: object

# Remove leading and trailing spaces from column names

In [70]:
df.columns = [x.strip(' ') for x in df.columns.values]

# Find which kind of empties there are ==> there are only NaNs

In [71]:
# are there any empty strings? ==> no
print(np.where(df.applymap(lambda x: x == '')))

# are there any NaNs? ==> yes
np.where(pd.isnull(df))

(array([], dtype=int64), array([], dtype=int64))


(array([    0,     1,     2, ..., 75875, 75875, 75875]),
 array([ 7,  7,  7, ..., 19, 20, 21]))

# Display the percent of NaNs per column

In [72]:
# display the percent of NaNs per column
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'%_missing': percent_missing})
missing_value_df = missing_value_df.sort_values('%_missing', ascending = False)
missing_value_df

Unnamed: 0,%_missing
sale,65.983974
building_state,41.092045
hyperlink,21.044863
garden_area,20.736465
terrace,19.423797
land_plot_surface,16.698297
land_surface,15.643945
terrace_area,14.883494
facades_number,12.495387
swimming_pool_has,11.986662


# Put 'unknown' in place of NaN for everything else than int64 and float64 columns
## Please run this after converting numerical columns like price and facades from string to integer.  
Even a value of NaN might help predict the price, so to avoid the correlation algorithm skipping it?, and because NaN is not allowed, we replace it.

In [73]:
# replace all NaNs in strings with 'unknown'
df_nanfilled = df.select_dtypes(exclude=['int64','float64']).replace(np.nan, 'unknown')
df.update(df_nanfilled)
print(df.info())

# replace all 'None'/'none' strings with uknown
df_nonefilled = df.select_dtypes(exclude=['int64','float64']).replace('none', 'unknown')
df.update(df_nonefilled)
df_nonefilled = df.select_dtypes(exclude=['int64','float64']).replace('None', 'unknown')
df.update(df_nonefilled)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75876 entries, 0 to 75875
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             75876 non-null  int64  
 1   hyperlink          75876 non-null  object 
 2   locality           75876 non-null  object 
 3   postcode           68464 non-null  float64
 4   house_is           75876 non-null  object 
 5   property_subtype   75876 non-null  object 
 6   price              75876 non-null  object 
 7   sale               75876 non-null  object 
 8   rooms_number       75876 non-null  object 
 9   area               75876 non-null  object 
 10  kitchen_has        75876 non-null  object 
 11  furnished          75876 non-null  object 
 12  open_fire          75876 non-null  object 
 13  terrace            75876 non-null  object 
 14  terrace_area       75876 non-null  object 
 15  garden             75876 non-null  object 
 16  garden_area        758

# Remove leading and trailing spaces of every element

In [74]:
# remove leading and trailing spaces and newline characters from values if they are a string
df = df.applymap(lambda x: x.strip() if type(x)==str else x)

# Drop postcode column, because postcode is more completely available in 'locality'
### first we fix 'locality' column to carry just postcode or 'unknown' (stripping sporadic address parts)

In [75]:
import re

df.drop('postcode', axis = 1, inplace = True)

# write a function that returns the cleaned postcode from elements
# containing the address
def clean_locality(locality): 
    # Search for the presence of a 4 digit number (starts with 1-9)
    if re.search('[1-9]\d{3}', locality):
        # get the number
        return re.findall("[1-9]\d{3}", locality)[0]
    else: 
        # if no postcode is inside insert 'unknown' 
        return 'unknown'
          
# Updated locality column
df['locality'] = df['locality'].apply(clean_locality)

# Remove duplicates
### should execute after fixing columns
### should execute after removing non-property detail or incomplete columns: source and hyperlink

In [76]:
# drop columns 
df.drop(['source', 'hyperlink', 'sale'], axis = 1, inplace = True)

# drop 100% duplicate rows
lenght_before = len(df)
df.drop_duplicates(ignore_index = True, inplace = True)
dropped = len(df) - lenght_before
print(f'Dropped: {dropped}')

Dropped: -23730


# Print unique values per column

In [77]:
uniques = pd.DataFrame()
for col in df:
    col_uniques = pd.DataFrame({f'{col}_value': df[f'{col}'].value_counts().index,
                                f'{col}_count': df[f'{col}'].value_counts().values})
    uniques = pd.concat([uniques, col_uniques], axis = 1)

uniques.head(50)


Unnamed: 0,locality_value,locality_count,house_is_value,house_is_count,property_subtype_value,property_subtype_count,price_value,price_count,rooms_number_value,rooms_number_count,...,land_surface_value,land_surface_count,land_plot_surface_value,land_plot_surface_count,facades_number_value,facades_number_count,swimming_pool_has_value,swimming_pool_has_count,building_state_value,building_state_count
0,unknown,23768.0,False,16305.0,HOUSE,15224.0,unknown,679.0,3,11722.0,...,unknown,21376.0,unknown,16960,0,17138.0,False,43906.0,unknown,26334.0
1,8300,1169.0,True,16013.0,APARTMENT,10053.0,295000,468.0,2,11340.0,...,0,14467.0,0,1072,unknown,11827.0,unknown,6015.0,AS_NEW,11043.0
2,1180,942.0,TRUE,9463.0,house,3711.0,199000,453.0,4,5994.0,...,150,126.0,0.0,822,2,7209.0,True,2225.0,GOOD,7693.0
3,1000,731.0,Yes,4972.0,apartment,3646.0,299000,449.0,3.0,3879.0,...,100,112.0,Yes,468,4,4173.0,,,TO_BE_DONE_UP,2057.0
4,1050,686.0,No,4734.0,VILLA,3258.0,275000,435.0,1,3575.0,...,120,107.0,100,302,3,3367.0,,,TO_RENOVATE,1781.0
5,9000,570.0,FALSE,659.0,APARTMENT_BLOCK,1909.0,249000,432.0,5,3115.0,...,200,97.0,90,275,2.0,3041.0,,,JUST_RENOVATED,1621.0
6,8400,457.0,,,MIXED_USE_BUILDING,1708.0,225000,421.0,4.0,2346.0,...,300,92.0,80,257,4.0,3002.0,,,Not specified,626.0
7,4000,336.0,,,Apartment,1242.0,395000,402.0,6,1870.0,...,110,88.0,70,251,3.0,2147.0,,,old,475.0
8,1150,298.0,,,DUPLEX,900.0,195000,362.0,2.0,1495.0,...,160,85.0,120,236,1,200.0,,,New,393.0
9,1200,292.0,,,PENTHOUSE,865.0,325000,349.0,5.0,1115.0,...,1000,83.0,110,218,1.0,39.0,,,TO_RESTORE,123.0
