In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
# Never truncate long cell contents (addresses, URLs) when frames are displayed.
pd.set_option('display.max_colwidth', None)

# Load the listings dataset into the working frame.
listing_path = r'C:\Users\Odafaz\Desktop\Daft Project\listing.csv'
df = pd.read_csv(listing_path)

# Questions about the data

- Are there any obvious patterns or anomalies?
- How many rows and columns does the dataset have?
- What are the data types of the columns? Are they appropriate for the intended analysis?
- Are there any missing values in the dataset? If so, how prevalent are they and how should they be handled?
- Are there any duplicated records in the dataset?
- Are there any outliers or extreme values that need to be addressed?

In [37]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date Entered/Renewed,Views,Type,Rent,Bedroom,Bathroom,Available From,Furnished,Lease,BER Rating,Address,Distance From City Centre,Latitude,Longitude,Region,Url,Input Date,Distance Category
0,0,2023-07-03,2177,House,4200.0,5,4,Immediately,No,No Minimum,E2,"18 Park View, Castleknock, Dublin 15",5.936479,53.371088,-6.34543,Dublin 15,https://www.daft.ie/for-rent/house-18-park-view-castleknock-dublin-15/5283975,04/07/2023,5 - 7 km
1,1,2023-07-03,1657,Apartment,1950.0,2,1,Immediately,No,No Minimum,,"Kiltipper Gate, Tallaght, Dublin 24",10.873323,53.270194,-6.371307,Dublin 24,https://www.daft.ie/for-rent/apartment-kiltipper-gate-tallaght-dublin-24/5320239,04/07/2023,10 - 15 km
2,2,2023-07-03,4773,House,2200.0,3,2,Immediately,No,No Minimum,C3,"Finn Eber Fort, Finglas, Dublin 11",5.047329,53.386601,-6.299074,Dublin 11,https://www.daft.ie/for-rent/house-finn-eber-fort-finglas-dublin-11/5306164,04/07/2023,5 - 7 km
3,3,2023-07-03,11103,House,2900.0,3,3,Immediately,Yes,Minimum 1 Year,D1,"1 Barnwell Lane, Hansfield",12.685229,53.391471,-6.441591,,https://www.daft.ie/for-rent/house-1-barnwell-lane-hansfield/4701295,04/07/2023,10 - 15 km
4,4,2023-07-03,2453,Studio,1400.0,0,0,Immediately,Yes,Minimum 1 Year,BER Exempt,"4 Annesley Bridge Road, Fairview, Dublin 3",2.600064,53.361324,-6.238628,Dublin 3,https://www.daft.ie/for-rent/studio-apartment-4-annesley-bridge-road-fairview-dublin-3/5325391,04/07/2023,1 - 3 km


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,Views,Rent,Bedroom,Bathroom,Distance From City Centre,Latitude,Longitude
count,744.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0
mean,371.59543,6850.614247,2993.403226,2.032258,1.622312,5.51902,53.336921,-6.254506
std,215.068355,14173.30038,2469.668834,1.331483,1.1904,4.639276,0.048366,0.069496
min,0.0,0.0,866.666667,0.0,0.0,0.138261,53.228331,-6.498807
25%,185.75,1352.5,1889.75,1.0,1.0,2.004283,53.313794,-6.276744
50%,371.5,2534.0,2400.0,2.0,1.0,3.539725,53.335072,-6.249125
75%,557.25,5443.75,3200.0,3.0,2.0,8.598259,53.355302,-6.226484
max,744.0,151297.0,30000.0,13.0,11.0,30.182664,53.613816,-6.075379


In [39]:
# (number of rows, number of columns)
(len(df.index), len(df.columns))

(744, 19)

In [40]:
# Inspect the dtype pandas inferred for every column. In the recorded output the
# date-like columns ('Date Entered/Renewed', 'Input Date') are plain `object`.
df.dtypes

Unnamed: 0                     int64
Date Entered/Renewed          object
Views                          int64
Type                          object
Rent                         float64
Bedroom                        int64
Bathroom                       int64
Available From                object
Furnished                     object
Lease                         object
BER Rating                    object
Address                       object
Distance From City Centre    float64
Latitude                     float64
Longitude                    float64
Region                        object
Url                           object
Input Date                    object
Distance Category             object
dtype: object

In [41]:
# Number of missing values per column.
df.isnull().sum(axis=0)

Unnamed: 0                     0
Date Entered/Renewed           0
Views                          0
Type                           0
Rent                           0
Bedroom                        0
Bathroom                       0
Available From                 0
Furnished                      0
Lease                          0
BER Rating                    50
Address                        0
Distance From City Centre      0
Latitude                       0
Longitude                      0
Region                       163
Url                            0
Input Date                     0
Distance Category              0
dtype: int64

### Region NaN

I don't see a major problem with Region having 163 NaN values. The Region column is derived from Address with a split(): it is filled only when the text after the last comma has the form "Dublin <number>". In some cases the address does not contain that information, so Region stays empty.

### BER Rating NaN

BER stands for Building Energy Rating. It goes from A to G (best to worst).

BER Rating is not a mandatory field for listing a rental on Daft.ie, so it's understandable that some people do not disclose this information. I would assume people tend not to display it if their BER Rating is very low.


In [42]:
# Addresses of the rows whose 'Region' is missing
region_NaN = df.loc[df['Region'].isnull()]
region_NaN.loc[:, 'Address']

3                                                      1 Barnwell Lane, Hansfield
5                                                         Kilbarrack Road, Raheny
6                                             Millrace Green, Saggart, Co. Dublin
10                                                          Flat 9, 47 Grove Park
17                          Meadow Court, Stillorgan Park, Stillorgan, Co. Dublin
                                          ...                                    
725             Large 3 Bedroom, Occu Adamstown, Adamstown, Adamstown, Co. Dublin
729    2 Bedroom Apartment , Marietta Woods, Castle Park Road, Dalkey, Co. Dublin
730      2 bedroom apartment, Hali, Cherrywood, Dublin 18, Cherrywood, Co. Dublin
731       Studio Apartment , Marietta Woods, Castle Park Road, Dalkey, Co. Dublin
732      1 bedroom apartment, Hali, Cherrywood, Dublin 18, Cherrywood, Co. Dublin
Name: Address, Length: 163, dtype: object

In [43]:
# Listings that have no 'BER Rating' recorded
ber_NaN = df.loc[df['BER Rating'].isnull()]
ber_NaN.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date Entered/Renewed,Views,Type,Rent,Bedroom,Bathroom,Available From,Furnished,Lease,BER Rating,Address,Distance From City Centre,Latitude,Longitude,Region,Url,Input Date,Distance Category
1,1,2023-07-03,1657,Apartment,1950.0,2,1,Immediately,No,No Minimum,,"Kiltipper Gate, Tallaght, Dublin 24",10.873323,53.270194,-6.371307,Dublin 24,https://www.daft.ie/for-rent/apartment-kiltipper-gate-tallaght-dublin-24/5320239,04/07/2023,10 - 15 km
9,9,2023-07-03,3617,House,1953.0,3,3,Immediately,No,No Minimum,,"Monastery Heath, Clondalkin, Dublin 22",8.102033,53.321493,-6.382107,Dublin 22,https://www.daft.ie/for-rent/house-monastery-heath-clondalkin-dublin-22/5313925,04/07/2023,7 - 10 km
13,13,2023-07-03,2135,Apartment,1800.0,2,1,Immediately,No,No Minimum,,"Saul Road, Crumlin, Dublin 12",3.136372,53.325495,-6.300524,Dublin 12,https://www.daft.ie/for-rent/apartment-saul-road-crumlin-dublin-12/5319229,04/07/2023,3 - 5 km
14,14,2023-07-03,757,House,2300.0,3,1,Immediately,No,No Minimum,,"Sundale Close, Tallaght, Dublin 24",11.833933,53.280218,-6.407489,Dublin 24,https://www.daft.ie/for-rent/house-sundale-close-tallaght-dublin-24/5320229,04/07/2023,10 - 15 km
156,156,2023-07-03,3338,Apartment,1350.0,1,1,Immediately,No,No Minimum,,"Ballyfermot Parade, Ballyfermot, Dublin 10",5.754168,53.340095,-6.353186,Dublin 10,https://www.daft.ie/for-rent/flat-ballyfermot-parade-ballyfermot-dublin-10/5314520,04/07/2023,5 - 7 km


# Anomalies

- Drop any property that is not a studio and has 0 bedrooms or 0 bathrooms.
- This can happen when the person who created the listing didn't fill out the information properly on the website.

In [44]:
# Non-studio listings that report zero bedrooms or zero bathrooms
not_studio = df['Type'] != "Studio"
missing_room = (df['Bedroom'] == 0) | (df['Bathroom'] == 0)
df_anomalies = df[not_studio & missing_room]
df_anomalies

Unnamed: 0.1,Unnamed: 0,Date Entered/Renewed,Views,Type,Rent,Bedroom,Bathroom,Available From,Furnished,Lease,BER Rating,Address,Distance From City Centre,Latitude,Longitude,Region,Url,Input Date,Distance Category
198,198,2023-07-03,1632,House,30000.0,0,1,Immediately,No,Minimum 1 Year,,"Commercial Unit North Strand Rd, Corner Of North S, Dublin 17",2.505682,53.360637,-6.239497,Dublin 17,https://www.daft.ie/for-rent/house-commercial-unit-north-strand-rd-corner-of-north-s-dublin-17/4737256,04/07/2023,1 - 3 km


In [45]:
# Drop the anomalous listings collected in `df_anomalies`.
# errors="ignore" skips labels that are no longer present, so re-running this
# cell after the rows were already removed does not raise a KeyError.
df = df.drop(index=df_anomalies.index, errors="ignore")

# Outliers

- I would say every property listed for over €5000/month with fewer than 3 bedrooms is an outlier, as very few people can realistically afford it: Dublin's average net monthly salary is around €2900.
- The purpose of this project is to analyze and predict rent prices for the average person. Therefore, I will remove these outliers.

In [46]:
# Listings priced above €5000/month that have at most two bedrooms
over_budget = df['Rent'] > 5000
few_bedrooms = df['Bedroom'] <= 2
df_outliers = df[over_budget & few_bedrooms]
df_outliers

Unnamed: 0.1,Unnamed: 0,Date Entered/Renewed,Views,Type,Rent,Bedroom,Bathroom,Available From,Furnished,Lease,BER Rating,Address,Distance From City Centre,Latitude,Longitude,Region,Url,Input Date,Distance Category
125,125,2023-07-04,2504,Apartment,5500.0,2,2,Immediately,Yes,Minimum 1 Year,A2,"The Barrington, Lansdowne Place, Dublin 4",2.680511,53.332788,-6.233008,Dublin 4,https://www.daft.ie/for-rent/apartment-the-barrington-lansdowne-place-dublin-4/4728426,04/07/2023,1 - 3 km
374,374,2023-07-03,2531,Apartment,8500.0,2,2,Immediately,Yes,Minimum 6 Months,A2,"Apartment 12, The Nicholson, Lansdowne Place, Ballsbridge, Dublin 4",2.697586,53.333041,-6.232452,Dublin 4,https://www.daft.ie/for-rent/apartment-apartment-12-the-nicholson-lansdowne-place-ballsbridge-dublin-4/5287863,04/07/2023,1 - 3 km
411,411,2023-07-03,280,Studio,18000.0,0,0,Immediately,Yes,No Minimum,D2,"8 Apartments - 38/39 Abbey Street Upper, Dublin 1",0.344344,53.347773,-6.263867,Dublin 1,https://www.daft.ie/for-rent/studio-apartment-8-apartments-38-39-abbey-street-upper-dublin-1/5311063,04/07/2023,< 1 km
524,524,2023-07-03,3211,Apartment,7500.0,2,2,Immediately,Yes,Minimum 1 Year,A2,"Apartment 34, The Nicholson, Lansdowne Place, Ballsbridge, Dublin 4",2.70623,53.33316,-6.232184,Dublin 4,https://www.daft.ie/for-rent/apartment-apartment-34-the-nicholson-lansdowne-place-ballsbridge-dublin-4/4718646,04/07/2023,1 - 3 km
526,526,2023-07-03,3108,Apartment,8000.0,2,2,Immediately,Yes,Minimum 1 Year,A2,"Apartment 11, The Nicholson, Lansdowne Place, Ballsbridge, Dublin 4",2.697586,53.333041,-6.232452,Dublin 4,https://www.daft.ie/for-rent/apartment-apartment-11-the-nicholson-lansdowne-place-ballsbridge-dublin-4/4718605,04/07/2023,1 - 3 km
614,614,2023-06-30,5033,Apartment,13500.0,2,3,Immediately,Yes,Minimum 1 Year,A3,"Apartment 2201, The Hailing Station, Capital Dock Residence, Dublin 2",2.399817,53.345258,-6.231223,Dublin 2,https://www.daft.ie/for-rent/apartment-apartment-2201-the-hailing-station-capital-dock-residence-dublin-2/4718076,04/07/2023,1 - 3 km


# Correlation

- Bedroom and Bathroom have a high correlation with Rent (0.68 and 0.74 respectively), which makes a lot of sense.
- Distance From City Centre has a weak negative correlation with Rent (-0.088), which is consistent with property prices tending to decrease the further you are from the city centre.

In [47]:
# Pairwise Pearson correlation between the numeric columns.
# NOTE(review): this cell runs before the outlier rows are dropped, so those
# listings are still included in the coefficients — confirm that is intended.
correlation = df.corr(method='pearson', numeric_only=True)
correlation

Unnamed: 0.1,Unnamed: 0,Views,Rent,Bedroom,Bathroom,Distance From City Centre,Latitude,Longitude
Unnamed: 0,1.0,0.227618,-0.002323,-0.034175,0.02606,0.088732,0.00293,0.001699
Views,0.227618,1.0,-0.048637,-0.112823,-0.055764,-0.016652,0.007589,-0.055141
Rent,-0.002323,-0.048637,1.0,0.680906,0.737396,-0.087661,-0.060155,0.153049
Bedroom,-0.034175,-0.112823,0.680906,1.0,0.837708,0.166231,0.0247,0.105547
Bathroom,0.02606,-0.055764,0.737396,0.837708,1.0,0.14846,-0.00374,0.119248
Distance From City Centre,0.088732,-0.016652,-0.087661,0.166231,0.14846,1.0,0.145261,0.157712
Latitude,0.00293,0.007589,-0.060155,0.0247,-0.00374,0.145261,1.0,0.023826
Longitude,0.001699,-0.055141,0.153049,0.105547,0.119248,0.157712,0.023826,1.0


In [48]:
# Drop the outlier listings collected in `df_outliers`.
# errors="ignore" skips labels that are no longer present, so re-running this
# cell after the rows were already removed does not raise a KeyError.
df = df.drop(index=df_outliers.index, errors="ignore")

In [49]:
# Persist the cleaned listings.
# index=False keeps the row index out of the file: writing it would add one more
# unnamed column, which comes back as 'Unnamed: 0'-style columns on the next read_csv.
df.to_csv(
    r'C:\Users\Odafaz\Desktop\Daft Project\listing_final.csv',
    encoding='utf-8',
    index=False,
)