In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("Airbnb_data.csv")

In [5]:
df.shape

(74111, 29)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

In [9]:
# Missing values summary
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

bathrooms                   200
first_review              15864
host_has_profile_pic        188
host_identity_verified      188
host_response_rate        18299
host_since                  188
last_review               15827
neighbourhood              6872
review_scores_rating      16722
thumbnail_url              8216
zipcode                     968
bedrooms                     91
beds                        131
dtype: int64


In [13]:
# Drop specified columns
columns_to_drop = ['id', 'name', 'thumbnail_url','host_since','first_review', 'last_review', 'host_has_profile_pic', 'host_identity_verified']
df = df.drop(columns=columns_to_drop, axis=1)

# Verify the columns are removed
print(df.columns)

Index(['log_price', 'property_type', 'room_type', 'amenities', 'accommodates',
       'bathrooms', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
       'description', 'host_response_rate', 'instant_bookable', 'latitude',
       'longitude', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   log_price             74111 non-null  float64
 1   property_type         74111 non-null  object 
 2   room_type             74111 non-null  object 
 3   amenities             74111 non-null  object 
 4   accommodates          74111 non-null  int64  
 5   bathrooms             73911 non-null  float64
 6   bed_type              74111 non-null  object 
 7   cancellation_policy   74111 non-null  object 
 8   cleaning_fee          74111 non-null  bool   
 9   city                  74111 non-null  object 
 10  description           74111 non-null  object 
 11  host_response_rate    55812 non-null  object 
 12  instant_bookable      74111 non-null  object 
 13  latitude              74111 non-null  float64
 14  longitude             74111 non-null  float64
 15  neighbourhood      

In [17]:
df.head()

Unnamed: 0,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,...,host_response_rate,instant_bookable,latitude,longitude,neighbourhood,number_of_reviews,review_scores_rating,zipcode,bedrooms,beds
0,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,NYC,...,,f,40.696524,-73.991617,Brooklyn Heights,2,100.0,11201,1.0,1.0
1,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,NYC,...,100%,t,40.766115,-73.98904,Hell's Kitchen,6,93.0,10019,3.0,3.0
2,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,NYC,...,100%,t,40.80811,-73.943756,Harlem,10,92.0,10027,1.0,3.0
3,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,SF,...,,f,37.772004,-122.431619,Lower Haight,0,,94117,2.0,2.0
4,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,DC,...,100%,t,38.925627,-77.034596,Columbia Heights,4,40.0,20009,0.0,1.0


In [38]:
# Fill missing values
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms'].median())
df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].median())

In [68]:
df['host_response_rate'].value_counts()

host_response_rate
100%    43254
90%      2277
80%      1113
0%        883
50%       611
        ...  
39%         1
15%         1
31%         1
21%         1
6%          1
Name: count, Length: 80, dtype: int64

In [46]:
# Fill categorical columns with mode
categorical_cols = ['neighbourhood', 'zipcode', 'host_has_profile_pic', 'host_identity_verified']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [50]:
# Drop rows with excessive missing data
df = df.dropna(subset=['first_review', 'last_review'])

In [54]:
# Convert boolean-like columns
df['cleaning_fee'] = df['cleaning_fee'].astype(int)
df['instant_bookable'] = df['instant_bookable'].apply(lambda x: 1 if x == 't' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaning_fee'] = df['cleaning_fee'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['instant_bookable'] = df['instant_bookable'].apply(lambda x: 1 if x == 't' else 0)


In [11]:
df['host_since']

0        26-03-2012
1        19-06-2017
2        25-10-2016
3        19-04-2015
4        01-03-2015
            ...    
74106    24-03-2013
74107    03-05-2016
74108    05-01-2012
74109    17-09-2017
74110    26-11-2012
Name: host_since, Length: 74111, dtype: object

In [1]:
#all columns name