In [None]:
'''
for colab
# connect to google drive
from google.colab import drive
drive.mount('/content/drive')
'''

Mounted at /content/drive


In [16]:
import pandas as pd
import zipfile

In [2]:
CSV_PATH = 'data/heights.csv'

In [3]:
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9


In [4]:
"""
  - setting a quantile threshold
  - 0.95, 0.05 is typical, but it could be different based on the problem and the data
"""
upper_threshold = df['height'].quantile(0.95)
lower_threshold = df['height'].quantile(0.05)

In [5]:
# now find all outliers in the set
df[df['height']>upper_threshold]

Unnamed: 0,name,height
9,imran,14.5


In [6]:
df[df['height']<lower_threshold]

Unnamed: 0,name,height
12,yoseph,1.2


In [None]:
"""
  Example of using domain knowledge would be know what is the possible height
  for people, using the tallest and smallest living people as the thresholds
"""

In [7]:
# remove the outliers
# had to use & not and
df[(df['height'] > lower_threshold) & (df['height'] < upper_threshold)]

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9
5,khusbu,5.4
6,dmitry,6.2
7,selena,6.5
8,john,7.1
10,jose,6.1


In [8]:
# Now it's time to move to a more complex data set
CSV_PATH = 'data/bhp.csv'
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [9]:
# look at the size of  the data
df.shape

(13200, 7)

In [10]:
# describe is a great way to look at the quantiles and more
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


In [11]:
"""
  - Domain knowledge may not always come from yourself
  - You would could work in a team with someone with more knowledge that you and
    that would guide your choices
"""
lower_threshold, upper_threshold = df['price_per_sqft'].quantile([0.001, 0.999])
upper_threshold, lower_threshold

(50959.36200000098, 1366.184)

In [12]:
df[df['price_per_sqft'] > upper_threshold]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
345,other,3 Bedroom,11.0,3.0,74.0,3,672727
1005,other,1 BHK,15.0,1.0,30.0,1,200000
1106,other,5 Bedroom,24.0,2.0,150.0,5,625000
4044,Sarjapur Road,4 Bedroom,1.0,4.0,120.0,4,12000000
4924,other,7 BHK,5.0,7.0,115.0,7,2300000
5911,Mysore Road,1 Bedroom,45.0,1.0,23.0,1,51111
6356,Bommenahalli,4 Bedroom,2940.0,3.0,2250.0,4,76530
7012,other,1 BHK,650.0,1.0,500.0,1,76923
7575,other,1 BHK,425.0,1.0,750.0,1,176470
7799,other,4 BHK,2000.0,3.0,1063.0,4,53150


In [13]:
df[df['price_per_sqft'] < lower_threshold]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
665,Yelahanka,3 BHK,35000.0,3.0,130.0,3,371
798,other,4 Bedroom,10961.0,4.0,80.0,4,729
1867,other,3 Bedroom,52272.0,2.0,140.0,3,267
2392,other,4 Bedroom,2000.0,3.0,25.0,4,1250
3934,other,1 BHK,1500.0,1.0,19.5,1,1300
5343,other,9 BHK,42000.0,8.0,175.0,9,416
5417,Ulsoor,4 BHK,36000.0,4.0,450.0,4,1250
5597,JP Nagar,2 BHK,1100.0,1.0,15.0,2,1363
7166,Yelahanka,1 Bedroom,26136.0,1.0,150.0,1,573
7862,JP Nagar,3 BHK,20000.0,3.0,175.0,3,875


In [14]:
df_no_outliers = df[(df['price_per_sqft'] < upper_threshold) &
 (df['price_per_sqft'] > lower_threshold)]
df_no_outliers.shape

(13172, 7)

In [None]:
# Use sample to get random entries
df_no_outliers.sample(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
11991,Thanisandra,4 Bedroom,3671.0,4.0,220.0,4,5992
11518,Kammasandra,2 BHK,982.0,2.0,25.53,2,2599
3608,Sarjapur Road,2 BHK,984.0,2.0,45.91,2,4665
2727,KR Puram,4 Bedroom,600.0,5.0,75.0,4,12500
11166,Electronic City Phase II,2 BHK,829.0,2.0,22.8,2,2750


Airbnb Problem

In [17]:
ZIP_PATH = "data/airbnb.zip"
DESTINATION_DIR = "data/"

try:
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(DESTINATION_DIR)
    print(f"Success.")
except FileNotFoundError:
    print(f"'{ZIP_PATH}' not found.")
except zipfile.BadZipFile:
    print(f"'{ZIP_PATH}' is not a valid ZIP file.")
except Exception as e:
    print(f"Unexpected error: {e}")

Success.


In [18]:
CSV_PATH = "data/new_york_listings_2024.csv"
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
0,1312228,Rental unit in Brooklyn · ★5.0 · 1 bedroom,7130382,Walter,Brooklyn,Clinton Hill,40.68371,-73.96461,Private room,55.0,...,2015-12-20,0.03,1,0,0,No License,5.0,1,1,Not specified
1,45277537,Rental unit in New York · ★4.67 · 2 bedrooms ·...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.76661,-73.9881,Entire home/apt,144.0,...,2023-05-01,0.24,139,364,2,No License,4.67,2,1,1
2,971353993633883038,Rental unit in New York · ★4.17 · 1 bedroom · ...,528871354,Joshua,Manhattan,Chelsea,40.750764,-73.994605,Entire home/apt,187.0,...,2023-12-18,1.67,1,343,6,Exempt,4.17,1,2,1
3,3857863,Rental unit in New York · ★4.64 · 1 bedroom · ...,19902271,John And Catherine,Manhattan,Washington Heights,40.8356,-73.9425,Private room,120.0,...,2023-09-17,1.38,2,363,12,No License,4.64,1,1,1
4,40896611,Condo in New York · ★4.91 · Studio · 1 bed · 1...,61391963,Stay With Vibe,Manhattan,Murray Hill,40.75112,-73.9786,Entire home/apt,85.0,...,2023-12-03,0.24,133,335,3,No License,4.91,Studio,1,1


In [19]:
df.shape

(20758, 22)

In [20]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,beds
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,3.034044e+17,174931600.0,40.726798,-73.939161,187.776616,28.558435,42.642596,1.25791,18.844108,205.990317,10.852105,1.723721
std,3.901216e+17,172554100.0,0.060294,0.061403,1022.797208,33.536518,73.561654,1.904661,70.910834,135.087768,21.357071,1.212272
min,2595.0,1678.0,40.500314,-74.24984,10.0,1.0,1.0,0.01,1.0,0.0,0.0,1.0
25%,27088080.0,20417380.0,40.68415,-73.98071,80.0,30.0,4.0,0.21,1.0,87.0,1.0,1.0
50%,49930030.0,108727100.0,40.72282,-73.949587,125.0,30.0,14.0,0.65,2.0,215.0,3.0,1.0
75%,7.216019e+17,314410200.0,40.763098,-73.91746,199.0,30.0,49.0,1.8,5.0,353.0,15.0,2.0
max,1.054376e+18,550403500.0,40.911147,-73.71365,100000.0,1250.0,1865.0,75.49,713.0,365.0,1075.0,42.0


In [21]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license', 'rating',
       'bedrooms', 'beds', 'baths'],
      dtype='object')

In [35]:
upper_threshold = df['price'].quantile(.99)
df_filtered = df[(df['price'] > 100) & (df['price'] < upper_threshold)]
df_filtered.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
1,45277537,Rental unit in New York · ★4.67 · 2 bedrooms ·...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.76661,-73.9881,Entire home/apt,144.0,...,2023-05-01,0.24,139,364,2,No License,4.67,2,1,1
2,971353993633883038,Rental unit in New York · ★4.17 · 1 bedroom · ...,528871354,Joshua,Manhattan,Chelsea,40.750764,-73.994605,Entire home/apt,187.0,...,2023-12-18,1.67,1,343,6,Exempt,4.17,1,2,1
3,3857863,Rental unit in New York · ★4.64 · 1 bedroom · ...,19902271,John And Catherine,Manhattan,Washington Heights,40.8356,-73.9425,Private room,120.0,...,2023-09-17,1.38,2,363,12,No License,4.64,1,1,1
5,49584983,Rental unit in New York · ★5.0 · 1 bedroom · 1...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.75995,-73.99296,Entire home/apt,115.0,...,2023-07-29,0.16,139,276,2,No License,5.0,1,1,1
6,45457047,Rental unit in New York · ★4.33 · Studio · 1 b...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.76737,-73.98787,Entire home/apt,105.0,...,2022-08-31,0.1,139,364,0,No License,4.33,Studio,1,1


In [37]:
df_filtered.shape

(12548, 22)

In [39]:
df_filtered.sample(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
3037,942635903441753168,Rental unit in New York · ★4.63 · 1 bedroom · ...,401202937,Eliza,Manhattan,Midtown,40.746203,-73.988347,Private room,149.0,...,2023-12-21,1.79,55,128,8,Exempt,4.63,1,4,1
17244,41934934,Home in New York · ★5.0 · 1 bedroom · 1 bed · ...,36793116,Mighty Tree Properties,Manhattan,Midtown,40.76493,-73.97814,Entire home/apt,192.0,...,2023-11-15,0.25,6,326,3,No License,5.0,1,1,1
5315,22345676,Rental unit in Brooklyn · ★4.59 · Studio · 1 b...,2921109,Alice,Brooklyn,Bedford-Stuyvesant,40.68957,-73.92968,Entire home/apt,112.0,...,2020-07-17,0.65,1,90,0,No License,4.59,Studio,1,1
18569,35026161,Rental unit in New York · ★4.71 · Studio · 2 b...,113723310,MyNyHousing,Manhattan,Upper West Side,40.78939,-73.97218,Entire home/apt,150.0,...,2023-04-30,0.27,60,335,3,No License,4.71,Studio,2,1
14849,871933212421118004,Home in Queens · ★4.67 · 1 bedroom · 2 beds · ...,42814202,Irina,Queens,Fresh Meadows,40.743466,-73.78837,Private room,111.0,...,2023-10-11,1.08,6,364,9,No License,4.67,1,2,1
11570,23650568,Rental unit in Queens · ★4.89 · 2 bedrooms · 4...,24771677,Noelle,Queens,Bayside,40.76383,-73.77133,Entire home/apt,145.0,...,2022-08-19,1.05,3,92,0,No License,4.89,2,4,1
17383,36943029,Rental unit in Queens · ★4.95 · 4 bedrooms · 7...,75268330,Arel Concepts,Queens,Astoria,40.7642,-73.90611,Entire home/apt,475.0,...,2023-11-12,3.6,1,0,54,No License,4.95,4,7,2
13172,607202437461817180,Rental unit in Brooklyn · ★4.76 · 1 bedroom · ...,398614581,Hamza,Brooklyn,Fort Greene,40.68735,-73.975107,Private room,115.0,...,2023-12-05,2.65,4,363,26,No License,4.76,1,1,1
9586,31311422,Rental unit in New York · ★4.88 · 1 bedroom · ...,234357462,Marina,Manhattan,West Village,40.73412,-74.00506,Entire home/apt,196.0,...,2023-11-19,0.8,1,333,4,No License,4.88,1,2,1
2323,3530517,Loft in New York · ★4.95 · 2 bedrooms · 2 beds...,17773625,Josée,Manhattan,Tribeca,40.71922,-74.00276,Entire home/apt,990.0,...,2023-12-06,1.24,1,362,18,No License,4.95,2,2,2
