Adding columns

In [55]:
import pandas as pd

In [56]:
# Path to the housing dataset, relative to the notebook's working directory
file_path = 'housing.csv'

# Read the CSV into the DataFrame that the cells below build on
data = pd.read_csv(filepath_or_buffer=file_path)

In [57]:
# Add the sale price per square metre of land.
# Rows with a missing Price, or with a missing / zero / negative Landsize, get NaN
# instead of an infinite or negative ratio.
# Vectorised on purpose: `where` replaces every Landsize that is not > 0 (NaN
# included, since NaN > 0 is False) with NaN, and the division then propagates
# NaN for those rows. This avoids a slow row-by-row `apply(axis=1)` lambda and
# always yields a float column.
data['Land_price_per_m2'] = (
    data['Price'] / data['Landsize'].where(data['Landsize'] > 0)
)

# Display the updated dataset's first few rows to confirm the addition
data[['Price', 'Landsize', 'Land_price_per_m2']].head()

Unnamed: 0,Price,Landsize,Land_price_per_m2
0,1480000.0,202.0,7326.732673
1,1035000.0,156.0,6634.615385
2,1465000.0,134.0,10932.835821
3,850000.0,94.0,9042.553191
4,1600000.0,120.0,13333.333333


In [58]:
# Add the sale price per square metre of building area.
# Rows with a missing Price, or with a missing / zero / negative BuildingArea,
# get NaN instead of an infinite or negative ratio.
# Vectorised on purpose: `where` replaces every BuildingArea that is not > 0
# (NaN included, since NaN > 0 is False) with NaN, and the division then
# propagates NaN for those rows. This avoids a slow row-by-row `apply(axis=1)`
# lambda and always yields a float column.
data['Building_price_per_m2'] = (
    data['Price'] / data['BuildingArea'].where(data['BuildingArea'] > 0)
)

# Display the updated dataset's first few rows to confirm the addition
data[['Price', 'BuildingArea', 'Building_price_per_m2']].head()

Unnamed: 0,Price,BuildingArea,Building_price_per_m2
0,1480000.0,,
1,1035000.0,79.0,13101.265823
2,1465000.0,150.0,9766.666667
3,850000.0,,
4,1600000.0,142.0,11267.605634


In [70]:
# Add the sale price per square metre of combined building area and land size.
# The ratio is only computed when Price is present and BOTH BuildingArea and
# Landsize are present and strictly positive; every other row gets NaN.
# Vectorised on purpose: each area is first masked with `where` (anything that
# is not > 0, NaN included, becomes NaN); NaN then propagates through the sum
# and the division, so no row-by-row `apply(axis=1)` lambda is needed.
data['Building_and_land_price_per_m2'] = data['Price'] / (
    data['BuildingArea'].where(data['BuildingArea'] > 0)
    + data['Landsize'].where(data['Landsize'] > 0)
)

# Display the updated dataset's first few rows to confirm the addition
data[['Price', 'BuildingArea', 'Landsize', 'Building_and_land_price_per_m2']].head()

Unnamed: 0,Price,BuildingArea,Landsize,Building_and_land_price_per_m2
0,1480000.0,,202.0,
1,1035000.0,79.0,156.0,4404.255319
2,1465000.0,150.0,134.0,5158.450704
3,850000.0,,94.0,
4,1600000.0,142.0,120.0,6106.870229


Check the correlation between building price per m2 and land price per m2

In [77]:
# Columns taking part in the two correlation matrices below
pair_columns = ['Building_price_per_m2', 'Land_price_per_m2']
all_price_columns = pair_columns + ['Building_and_land_price_per_m2']

# Work only on rows that actually have a building price per m2
filtered_data = data.dropna(subset=['Building_price_per_m2'])

# First matrix: building price per m2 against land price per m2
correlation = filtered_data.loc[:, pair_columns].corr()
print(correlation)

# Second matrix: the same two columns plus the combined building-and-land price per m2.
# Note that `correlation` is rebound here, so afterwards it holds the 3x3 matrix.
correlation = filtered_data.loc[:, all_price_columns].corr()
print('\n\n', correlation)

                       Building_price_per_m2  Land_price_per_m2
Building_price_per_m2               1.000000           0.019125
Land_price_per_m2                   0.019125           1.000000


                                 Building_price_per_m2  Land_price_per_m2  \
Building_price_per_m2                        1.000000           0.019125   
Land_price_per_m2                            0.019125           1.000000   
Building_and_land_price_per_m2               0.129224           0.420268   

                                Building_and_land_price_per_m2  
Building_price_per_m2                                 0.129224  
Land_price_per_m2                                     0.420268  
Building_and_land_price_per_m2                        1.000000  


In [60]:
# Parse the 'Date' column as day/month/four-digit-year and store it back as datetime
parsed_dates = pd.to_datetime(data['Date'], format='%d/%m/%Y')
data['Date'] = parsed_dates

# Preview the converted column
data['Date'].head()

0   2016-12-03
1   2016-02-04
2   2017-03-04
3   2017-03-04
4   2016-06-04
Name: Date, dtype: datetime64[ns]

Check whether the YearBuilt float values, aside from the NaN records, are all whole numbers (i.e. floats with a .0 decimal part):

In [61]:
# Ignore missing years, then test each remaining value for a zero fractional part
known_years = data['YearBuilt'].dropna()
is_whole_year = known_years.apply(lambda year: year.is_integer())
all_integers = is_whole_year.all()

print(f"All 'YearBuilt' values are integers with .0 floating point: {all_integers}")


All 'YearBuilt' values are integers with .0 floating point: True


In [62]:
# Switch 'YearBuilt' to pandas' nullable integer dtype so missing years stay <NA>
# instead of forcing the whole column to float
data['YearBuilt'] = data['YearBuilt'].astype(pd.Int64Dtype())

# Preview the converted column
data['YearBuilt'].head()

0    <NA>
1    1900
2    1900
3    <NA>
4    2014
Name: YearBuilt, dtype: Int64

Likewise for Propertycount, check that we are not about to convert values that have a decimal part:

In [63]:
# Test every 'Propertycount' value for a zero fractional part.
# Note: no dropna() is applied here, so a missing value (NaN) would make the
# check report False, because NaN is not a whole number.
is_whole_count = data['Propertycount'].apply(lambda count: count.is_integer())
all_integers_property_count = is_whole_count.all()

print(f"All 'Propertycount' values are integers with .0 floating point: {all_integers_property_count}")

All 'Propertycount' values are integers with .0 floating point: True


In [64]:
# Store 'Propertycount' with pandas' nullable integer dtype
data['Propertycount'] = data['Propertycount'].astype(pd.Int64Dtype())

# Preview the converted column
data['Propertycount'].head()


0    4019
1    4019
2    4019
3    4019
4    4019
Name: Propertycount, dtype: Int64

(TODO: I still have to work out what Propertycount actually represents...)

Next: the dummy variables for the regions :)

In [65]:
# One-hot encode 'Regionname'. drop_first=True omits the first category's column,
# so that region is represented by all remaining dummies being False.
region_names = data['Regionname']
region_dummies = pd.get_dummies(data=region_names, drop_first=True)

# Preview the dummy variables
region_dummies.head()

Unnamed: 0,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,False,True,False,False,False,False,False
1,False,True,False,False,False,False,False
2,False,True,False,False,False,False,False
3,False,True,False,False,False,False,False
4,False,True,False,False,False,False,False


In [66]:
# Map each raw region name to an identifier-style 'is_<Region>' column name
region_column_names = {
    'Eastern Victoria': 'is_Eastern_Victoria',
    'Northern Metropolitan': 'is_Northern_Metropolitan',
    'Northern Victoria': 'is_Northern_Victoria',
    'South-Eastern Metropolitan': 'is_South_Eastern_Metropolitan',
    'Southern Metropolitan': 'is_Southern_Metropolitan',
    'Western Metropolitan': 'is_Western_Metropolitan',
    'Western Victoria': 'is_Western_Victoria',
}

# Labels missing from the frame are ignored by rename, so re-running this cell
# after the columns have already been renamed leaves them unchanged.
region_dummies = region_dummies.rename(columns=region_column_names)

# Preview the renamed dummy variables
region_dummies.head()

Unnamed: 0,is_Eastern_Victoria,is_Northern_Metropolitan,is_Northern_Victoria,is_South_Eastern_Metropolitan,is_Southern_Metropolitan,is_Western_Metropolitan,is_Western_Victoria
0,False,True,False,False,False,False,False
1,False,True,False,False,False,False,False
2,False,True,False,False,False,False,False
3,False,True,False,False,False,False,False
4,False,True,False,False,False,False,False


In [67]:
# Attach the region dummy columns to the main dataframe.
# Any dummy columns already present in `data` (e.g. from a previous run of this
# cell) are dropped first, so re-running the cell does not append duplicates.
# On a first run nothing matches and errors='ignore' makes the drop a no-op.
data = pd.concat(
    [data.drop(columns=region_dummies.columns, errors='ignore'), region_dummies],
    axis=1,
)

# Print the column summary of the combined dataframe.
# DataFrame.info() writes its report to stdout and returns None, so `data_info`
# is always None; the name is kept only in case another cell refers to it.
data_info = data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Suburb                         13580 non-null  object        
 1   Address                        13580 non-null  object        
 2   Rooms                          13580 non-null  int64         
 3   Type                           13580 non-null  object        
 4   Price                          13580 non-null  float64       
 5   Method                         13580 non-null  object        
 6   SellerG                        13580 non-null  object        
 7   Date                           13580 non-null  datetime64[ns]
 8   Distance                       13580 non-null  float64       
 9   Postcode                       13580 non-null  float64       
 10  Bedroom2                       13580 non-null  float64       
 11  Bathroom       