In [1]:
#Importing our dataset into VS code
import pandas as pd
import openpyxl
import ast
file_path = '/Users/nasase/Documents/GitHub/ADS_Projects/Airbnb Dataset_FINAL.xlsx'

df = pd.read_excel(file_path)

print(df.head())

      ID                                               NAME  \
0   5456          Walk to 6th, Rainey St and Convention Ctr   
1   5769                                     NW Austin Room   
2   6448  Secluded Studio @ Zilker - King Bed, Bright & ...   
3   8502                            Woodland Studio Lodging   
4  13035      Historic house in highly walkable East Austin   

                                  NEIGHBOUR OVERVIEW Host Location  \
0  My neighborhood is ideally located if you want...    Austin, TX   
1  Quiet neighborhood with lots of trees and good...    Austin, TX   
2  The neighborhood is fun and funky (but quiet)!...    Austin, TX   
3                                                NaN    Austin, TX   
4  East Cesar Chavez is a gentrifying urban area ...    Austin, TX   

  Host Neighbourhood         Property Type        Room Type  Accommodates  \
0      East Downtown     Entire guesthouse  Entire home/apt             3   
1  SW Williamson Co.  Private room in home    

In [2]:
#Check column header names
print(df.columns)

Index(['ID', 'NAME', 'NEIGHBOUR OVERVIEW', 'Host Location',
       'Host Neighbourhood', 'Property Type', 'Room Type', 'Accommodates',
       'Bathrooms', 'Bedrooms', 'Beds', 'Amenities', 'Price',
       'Number of Reviews', 'Review Scores Rating'],
      dtype='object')


In [3]:
#Checking for Duplicates 
duplicates = df[df.duplicated(subset=['ID', 'NAME', 'Host Location', 'Host Neighbourhood'], keep=False)]
#Used these four columns to ensure there are no duplicate entries. 

if not duplicates.empty: 
    print(f"Found {duplicates.shape[0]} duplicates")
    print(duplicates)
else:
    print("No duplicates found")

No duplicates found


In [4]:
print(df.dtypes)
#We can see that each column is set to a proper data type so no issues will occur later in the project. 

ID                        int64
NAME                     object
NEIGHBOUR OVERVIEW       object
Host Location            object
Host Neighbourhood       object
Property Type            object
Room Type                object
Accommodates              int64
Bathrooms               float64
Bedrooms                float64
Beds                    float64
Amenities                object
Price                     int64
Number of Reviews         int64
Review Scores Rating    float64
dtype: object


In [5]:
print(df['Amenities'].sample(10))

4370    ["Hot water", "Extra pillows and blankets", "B...
570     ["Hot water", "Outdoor dining area", "Luggage ...
3543            ["Exterior security cameras on property"]
4360    ["Free dryer \u2013 In building", "Hot water",...
5057    ["Hot water", "Private backyard \u2013 Not ful...
5902    ["Hot water", "Outdoor dining area", "Extra pi...
847     ["Hot water", "Ceiling fan", "Safe", "Extra pi...
3971    ["Pool", "Hot water", "Ceiling fan", "Extra pi...
4912    ["Hot water", "Clothing storage", "Extra pillo...
2547    ["Outdoor kitchen", "Hot water", "Outdoor dini...
Name: Amenities, dtype: object


In [6]:
#Convert Amenities into usable list
import ast
df['Amenities'] = df['Amenities'].apply(lambda x: ast.literal_eval(x) if isinstance (x, str) else [])

print(df['Amenities'].head())



0    [Hot water, Extra pillows and blankets, Backya...
1    [Hot water, Outdoor dining area, Ceiling fan, ...
2    [Free dryer – In building, Hot water, Outdoor ...
3                                                   []
4    [Hot water, Ceiling fan, Extra pillows and bla...
Name: Amenities, dtype: object


In [7]:
#Getting Amenity counts
all_amenities = [amenity for sublist in df['Amenities'] if isinstance(sublist, list) for amenity in sublist]
amenities_count = pd.Series(all_amenities).value_counts().reset_index()
amenities_count.columns = ['Neighborhood', 'Count']

pd.set_option('display.max_rows', None)
print(amenities_count)


                                           Neighborhood  Count
0                                           Smoke alarm   7248
1                                               Kitchen   6937
2                                            Essentials   6449
3                                                  Wifi   6415
4                                             Hot water   6399
5                                 Dishes and silverware   6314
6                                 Carbon monoxide alarm   6168
7                                            Hair dryer   6158
8                                               Hangers   6074
9                                        Cooking basics   6054
10                                    Fire extinguisher   6034
11                             Free parking on premises   5876
12                                            Microwave   5813
13                                        Self check-in   5808
14                                                 Iron

In [8]:
#Getting Neighborhood Counts
neighborhood_count= df['Host Neighbourhood'].value_counts().reset_index()
neighborhood_count.columns = ['Host Neighourhood', 'Count']

print(neighborhood_count)

                          Host Neighourhood  Count
0                              South Austin    376
1                             East Downtown    354
2                                    Zilker    212
3                                 Hyde Park    198
4                                     Holly    193
5                               South Lamar    191
6                   East Riverside - Oltorf    186
7                           Downtown Austin    154
8                            Travis Heights    130
9                                    Dawson    128
10                            Bouldin Creek    121
11                      Central East Austin    120
12                               North Loop    118
13                 Greater South River City    117
14                              Clarksville    113
15                           Central Austin    112
16                                  Govalle    110
17                             Barton Hills    106
18                        East 