# Nashville

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Every input file for this notebook lives in one directory; keeping that
# directory in a single constant means the data location can be changed by
# editing one line instead of seven.
DATA_DIR = 'usa/Nashville'

# Tabular CSV inputs.
calendar = pd.read_csv(f'{DATA_DIR}/calendar.csv')
listings = pd.read_csv(f'{DATA_DIR}/listings.csv')
listings_detailed = pd.read_csv(f'{DATA_DIR}/listings_detailed.csv')
neighbor = pd.read_csv(f'{DATA_DIR}/neighbourhoods.csv')
# The GeoJSON file is parsed with the generic JSON reader, so the result is a
# plain DataFrame rather than a geometry-aware object.
neighbor_json = pd.read_json(f'{DATA_DIR}/neighbourhoods.geojson')
review = pd.read_csv(f'{DATA_DIR}/reviews.csv')
review_detailed = pd.read_csv(f'{DATA_DIR}/reviews_detailed.csv')

In [3]:
# Map each display label to the frame loaded above. neighbor_json is included
# even though it came from a GeoJSON file: .shape only describes the frame
# produced by the JSON reader, not the nested structure inside it.
loaded_frames = {
    "calendar": calendar,
    "listings": listings,
    "listings_detailed": listings_detailed,
    "neighbor": neighbor,
    "neighbor_json": neighbor_json,
    "review": review,
    "review_detailed": review_detailed,
}

# (label, shape) pairs, in the same order as the mapping above.
data_list = [(label, frame.shape) for label, frame in loaded_frames.items()]

for name, shape in data_list:
    print(f"The shape of '{name}' dataframe is: {shape}")

The shape of 'calendar' dataframe is: (3119707, 7)
The shape of 'listings' dataframe is: (8548, 18)
The shape of 'listings_detailed' dataframe is: (8548, 75)
The shape of 'neighbor' dataframe is: (35, 2)
The shape of 'neighbor_json' dataframe is: (35, 2)
The shape of 'review' dataframe is: (541576, 2)
The shape of 'review_detailed' dataframe is: (541576, 6)


In [4]:
# Function to perform EDA
def perform_eda(df):
    """Print a quick exploratory summary of ``df``.

    The report contains, in order: the frame's shape, the per-column count of
    null values, the names of the numeric columns, and the names of the
    object-dtype columns. When the numeric selection is non-empty, the
    per-column mean, standard deviation and min/max range are printed too.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Rows, Columns:", df.shape)
    print("Missing Values:\n", df.isnull().sum())

    # Split the columns by dtype: numpy numeric types versus plain objects.
    numeric_part = df.select_dtypes(include=[np.number])
    object_part = df.select_dtypes(include=[object])

    print("Numerical Features:", numeric_part.columns.tolist())
    print("Categorical Features:", object_part.columns.tolist())

    # Nothing numeric to describe: skip the statistics section entirely.
    if numeric_part.empty:
        return

    print("Mean of Numerical Features:\n", numeric_part.mean())
    print("Standard Deviation of Numerical Features:\n", numeric_part.std())

    for col in numeric_part.columns:
        lowest = numeric_part[col].min()
        highest = numeric_part[col].max()
        print(f"Range of {col}: {lowest} - {highest}")




In [5]:
# Frames to run through the generic EDA helper, keyed by display label.
# neighbor_json is deliberately left out: it comes from a GeoJSON file, so its
# EDA will be different and is not covered by this general function.
dataframes = dict(
    calendar=calendar,
    listings=listings,
    listings_detailed=listings_detailed,
    neighbor=neighbor,
    review=review,
    review_detailed=review_detailed,
)

# Visual divider printed after each frame's report.
section_rule = '==================' * 10

for name, df in dataframes.items():
    print(f"EDA for {name}:")
    perform_eda(df)
    print(section_rule)

EDA for calendar:
Rows, Columns: (3119707, 7)
Missing Values:
 listing_id        0
date              0
available         0
price             0
adjusted_price    0
minimum_nights    1
maximum_nights    1
dtype: int64
Numerical Features: ['listing_id', 'minimum_nights', 'maximum_nights']
Categorical Features: ['date', 'available', 'price', 'adjusted_price']
Mean of Numerical Features:
 listing_id        2.780580e+17
minimum_nights    6.559823e+00
maximum_nights    1.242534e+06
dtype: float64
Standard Deviation of Numerical Features:
 listing_id        3.490456e+17
minimum_nights    1.778989e+01
maximum_nights    5.162566e+07
dtype: float64
Range of listing_id: 6422 - 849420841551486570
Range of minimum_nights: 1.0 - 999.0
Range of maximum_nights: 1.0 - 2147483647.0
EDA for listings:
Rows, Columns: (8548, 18)
Missing Values:
 id                                   0
name                                 0
host_id                              0
host_name                            0
neighbour