### **FEATURE ENGINEERING FOR AIRBNB SEASONALITY ANALYSIS**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

LOAD DATA AND SETUP

In [2]:
df = pd.read_csv('cleaned_data_for_analysis.csv')

DATE COMPONENTS

In [3]:
df['last_review'] = pd.to_datetime(df['last_review'],errors='coerce')

In [4]:
if pd.api.types.is_datetime64_any_dtype(df['last_review']):
    print(" last_review is datetime")
    print(f"From: {df['last_review'].min().date()}")
    print(f"To: {df['last_review'].max().date()}")
else:
    print("last_review is NOT datetime")

 last_review is datetime
From: 2012-08-25
To: 2025-06-26


In [5]:
# RANGE OF KEY COLUMNS

key_cols = ['price','minimum_nights','availability_365','reviews_per_month']
for col in key_cols:
    if col in df.columns:
        print(f"- {col}: {df[col].min():.2f} to {df[col].max():.2f}")

- price: 50.00 to 1200.00
- minimum_nights: 1.00 to 365.00
- availability_365: 0.00 to 365.00
- reviews_per_month: 0.00 to 25.23


In [6]:
print(f"\n📊 SAMPLE DATA:")
df[['name', 'last_review', 'price', 'reviews_per_month', 'neighbourhood_group']].head()


📊 SAMPLE DATA:


Unnamed: 0,name,last_review,price,reviews_per_month,neighbourhood_group
0,Clean & quiet apt home by the park,2021-10-19,966.0,0.21,Brooklyn
1,Skylit Midtown Castle,2022-05-21,142.0,0.38,Manhattan
2,Entire Apt: Spacious Studio/Loft by central park,2018-11-19,204.0,0.1,Manhattan
3,Large Furnished Room Near B'way,2019-06-24,1018.0,3.47,Manhattan
4,Beautiful 1br on Upper West Side,2019-06-22,606.0,0.43,Manhattan


In [7]:
print(F"ALL COLUMNS ({len(df.columns)}): ")
for i, col in enumerate(df.columns,1):
    print(f" {i:2d}.{col}")

ALL COLUMNS (27): 
  1.id
  2.name
  3.host_id
  4.host_identity_verified
  5.host_name
  6.neighbourhood_group
  7.neighbourhood
  8.lat
  9.long
 10.country
 11.country_code
 12.instant_bookable
 13.cancellation_policy
 14.room_type
 15.construction_year
 16.price
 17.service_fee
 18.minimum_nights
 19.number_of_reviews
 20.last_review
 21.reviews_per_month
 22.review_rate_number
 23.calculated_host_listings_count
 24.availability_365
 25.house_rules
 26.license
 27.missing_location


In [8]:
print(f"\n✨ KEY COLUMNS FOR SEASONALITY ANALYSIS: ")

# define important categories

columns_importance = {
    'Critical': [
        'last_review',              # Time series base
        'reviews_per_month',        # Demand 
        'price'                     # Revenue
    ],
    'High Priority': [
        'number_of_reviews',        # Total Demand
        'availability_365',         # Supply metric
        'minimum_nights',           # Booking patterns
        'neighbourhood_group',      # Location segmentation
        'room_type'                 # Property segmentation
    ],
    'Medium Priority': [
        'service_fee',              # Additional pricing
        'calculated_host_listings_count',   # Host type indicator
        'lat','long'                # Geospatial analysis
    ]
}

for category, cols in columns_importance.items():
    print(f"\n{category}:")
    for col in cols:
        if col in df.columns:
            print(f"{col} found")
        else:
            print(f"{col} not available")


✨ KEY COLUMNS FOR SEASONALITY ANALYSIS: 

Critical:
last_review found
reviews_per_month found
price found

High Priority:
number_of_reviews found
availability_365 found
minimum_nights found
neighbourhood_group found
room_type found

Medium Priority:
service_fee found
calculated_host_listings_count found
lat found
long found


In [9]:
# Analysis columns

analysis_columns =[]
for category, cols in columns_importance.items():
    analysis_columns.extend([col for col in cols if col in df.columns])
print(f"\nTotal columns for analysis: {len(analysis_columns)}")
print(f"    {analysis_columns}")


Total columns for analysis: 12
    ['last_review', 'reviews_per_month', 'price', 'number_of_reviews', 'availability_365', 'minimum_nights', 'neighbourhood_group', 'room_type', 'service_fee', 'calculated_host_listings_count', 'lat', 'long']


##### **TIME-BASED FEATURES**

In [10]:
print("=" * 30)
print("EXTRACTING DATE FEATURES")
print("=" * 30)

# Extracting date components from last_review
df['year'] = df['last_review'].dt.year
df['month'] = df['last_review'].dt.month
df['day'] = df['last_review'].dt.day
df['day_of_week'] = df['last_review'].dt.day_of_week
df['day_name'] = df['last_review'].dt.day_name()
df['quarter'] = df['last_review'].dt.quarter
df['week_of_year'] = df['last_review'].dt.isocalendar().week

# Sample data
df[['last_review','year','month','quarter','day_name']].head(10)

EXTRACTING DATE FEATURES


Unnamed: 0,last_review,year,month,quarter,day_name
0,2021-10-19,2021,10,4,Tuesday
1,2022-05-21,2022,5,2,Saturday
2,2018-11-19,2018,11,4,Monday
3,2019-06-24,2019,6,2,Monday
4,2019-06-22,2019,6,2,Saturday
5,2019-06-23,2019,6,2,Sunday
6,2019-06-24,2019,6,2,Monday
7,2018-10-31,2018,10,4,Wednesday
8,2019-06-29,2019,6,2,Saturday
9,2019-06-28,2019,6,2,Friday


##### **SEASONALITY INDICATORS**

In [11]:
print("=" * 30)
print("CREATING SEASON CATEGORIES")
print("=" * 30)

def assign_season(month):
    if month in [12,1,2]:
        return 'Winter'
    elif month in [3,4,5]:
        return 'Summer'
    elif month in [6,7,8]:
        return 'Summer'
    else :  # month in 9.10,11
        return 'Fall'

df['season'] = df['month'].apply(assign_season)

print("season categories created: ")
print(df['season'].value_counts().sort_index())
print('\n')
print("SAMPLE WITH SEASONS: ")
df[['last_review','month','season','reviews_per_month']].head(10)

CREATING SEASON CATEGORIES
season categories created: 
season
Fall       6055
Summer    32647
Winter    11828
Name: count, dtype: int64


SAMPLE WITH SEASONS: 


Unnamed: 0,last_review,month,season,reviews_per_month
0,2021-10-19,10,Fall,0.21
1,2022-05-21,5,Summer,0.38
2,2018-11-19,11,Fall,0.1
3,2019-06-24,6,Summer,3.47
4,2019-06-22,6,Summer,0.43
5,2019-06-23,6,Summer,1.5
6,2019-06-24,6,Summer,1.34
7,2018-10-31,10,Fall,0.22
8,2019-06-29,6,Summer,1.2
9,2019-06-28,6,Summer,1.72


CREATING WEEKEND / WEEKDAY FLAG

In [13]:
print("="*30)
print("CREATING WEEKEND/WEEKDAY FLAGS")
print("="*30)

df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
print("weekend flag created")
print(f"Weekday Reviews: {(df['is_weekend']==0).sum():,}")
print(f"Weekend Reviews: {(df['is_weekend']==1).sum():,}")

CREATING WEEKEND/WEEKDAY FLAGS
weekend flag created
Weekday Reviews: 29,555
Weekend Reviews: 20,975


In [14]:
print("\nDISTRIBUTION")
weekend_distribution = df.groupby('is_weekend')['reviews_per_month'].mean()
print(f"-> Average reviews/month on weekdays: {weekend_distribution[0]:.2f}")
print(f"-> Average reviews/month on weekends: {weekend_distribution[1]:.2f}")


DISTRIBUTION
-> Average reviews/month on weekdays: 1.33
-> Average reviews/month on weekends: 1.49


In [18]:
print("="*30)
print("SEASONALITY SUMMARY")
print("="*30)

new_features=['year','month','day','quarter','day_of_week','day_name',
              'week_of_year','season','is_weekend']

print(f"\nNEW FEATURES CREATED: {len(new_features)}")
for i, feature in enumerate(new_features,1):
    print(f"{i}. {feature}")

print(f"\n UPDATED DATASET: ")
print(f" -> Total columns: {len(df.columns)}")
print(f" -> New Date Features: {len(new_features)}")

# All columns with new features
print("\n SAMPLE WITH NEW FEATURES: ")
display_cols = ['last_review', 'year', 'month', 'quarter', 'season', 
                'day_name', 'is_weekend', 'price', 'reviews_per_month']
df[display_cols].head()

SEASONALITY SUMMARY

NEW FEATURES CREATED: 9
1. year
2. month
3. day
4. quarter
5. day_of_week
6. day_name
7. week_of_year
8. season
9. is_weekend

 UPDATED DATASET: 
 -> Total columns: 36
 -> New Date Features: 9

 SAMPLE WITH NEW FEATURES: 


Unnamed: 0,last_review,year,month,quarter,season,day_name,is_weekend,price,reviews_per_month
0,2021-10-19,2021,10,4,Fall,Tuesday,0,966.0,0.21
1,2022-05-21,2022,5,2,Summer,Saturday,1,142.0,0.38
2,2018-11-19,2018,11,4,Fall,Monday,0,204.0,0.1
3,2019-06-24,2019,6,2,Summer,Monday,0,1018.0,3.47
4,2019-06-22,2019,6,2,Summer,Saturday,1,606.0,0.43


DEMAND INDICATORS

PRICE CATEGORIES

BOOKING PATTERNS