# Airbnb 

In [193]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [136]:
# Accessing the data
df = pd.read_csv('airbnb.csv')

In [137]:
# DataFrame first look
df.head()

Unnamed: 0,host_is_superhost,cancellation_policy,instant_bookable,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,bedrooms_na,bathrooms_na,beds_na,review_scores_rating_na,review_scores_accuracy_na,review_scores_cleanliness_na,review_scores_checkin_na,review_scores_communication_na,review_scores_location_na,review_scores_value_na
0,t,moderate,t,1.0,Western Addition,37.76931,-122.43386,Apartment,Entire home/apt,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f,strict_14_with_grace_period,f,2.0,Bernal Heights,37.74511,-122.42102,Apartment,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76669,-122.4525,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76487,-122.45183,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f,strict_14_with_grace_period,f,2.0,Western Addition,37.77525,-122.43637,House,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7146 entries, 0 to 7145
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_is_superhost               7146 non-null   object 
 1   cancellation_policy             7146 non-null   object 
 2   instant_bookable                7146 non-null   object 
 3   host_total_listings_count       7146 non-null   float64
 4   neighbourhood_cleansed          7146 non-null   object 
 5   latitude                        7146 non-null   float64
 6   longitude                       7146 non-null   float64
 7   property_type                   7146 non-null   object 
 8   room_type                       7146 non-null   object 
 9   accommodates                    7146 non-null   float64
 10  bathrooms                       7146 non-null   float64
 11  bedrooms                        7146 non-null   float64
 12  beds                            71

The data has 34 columns and 7146 entries. With many floats and some object data types. Next steps for data manipultion:
1. Renaming columns;
2. Verifying the datatypes of each column and change to other adequate datatype when necessary;
3. Remove, modify or create data from the original, when necessary;

In [139]:
# Listing columns names
df.columns

Index(['host_is_superhost', 'cancellation_policy', 'instant_bookable',
       'host_total_listings_count', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bedrooms', 'beds', 'bed_type', 'minimum_nights', 'number_of_reviews',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'price', 'bedrooms_na', 'bathrooms_na',
       'beds_na', 'review_scores_rating_na', 'review_scores_accuracy_na',
       'review_scores_cleanliness_na', 'review_scores_checkin_na',
       'review_scores_communication_na', 'review_scores_location_na',
       'review_scores_value_na'],
      dtype='object')

In [140]:
# Constructing dictionary for modifying columns' names
newcols = {}
for column_name in df.columns:
    newcols[column_name] = column_name.capitalize().replace('_', ' ')
newcols
# Renaming columns
df.rename(columns=newcols, inplace = True)

In [141]:
df.head()

Unnamed: 0,Host is superhost,Cancellation policy,Instant bookable,Host total listings count,Neighbourhood cleansed,Latitude,Longitude,Property type,Room type,Accommodates,...,Bedrooms na,Bathrooms na,Beds na,Review scores rating na,Review scores accuracy na,Review scores cleanliness na,Review scores checkin na,Review scores communication na,Review scores location na,Review scores value na
0,t,moderate,t,1.0,Western Addition,37.76931,-122.43386,Apartment,Entire home/apt,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f,strict_14_with_grace_period,f,2.0,Bernal Heights,37.74511,-122.42102,Apartment,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76669,-122.4525,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f,strict_14_with_grace_period,f,10.0,Haight Ashbury,37.76487,-122.45183,Apartment,Private room,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f,strict_14_with_grace_period,f,2.0,Western Addition,37.77525,-122.43637,House,Entire home/apt,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Reorganizing the dataframe so that the binary categorical columns are positioned to the left of the dataframe.

In [142]:
# Getting binary and non-binary columns
binary_columns = []
non_binary_columns= []
for column in df.columns:
    if len(df[column].value_counts()) == 2:
        binary_columns.append(column)
    else:
        non_binary_columns.append(column) 

binary_columns

['Host is superhost',
 'Instant bookable',
 'Bedrooms na',
 'Bathrooms na',
 'Beds na',
 'Review scores rating na',
 'Review scores accuracy na',
 'Review scores cleanliness na',
 'Review scores checkin na',
 'Review scores communication na',
 'Review scores location na',
 'Review scores value na']

Most of the binary columns ends with ```na```. I will try to understand the meaning of it soon.

In [143]:
# Reordering columns
new_order = binary_columns + non_binary_columns
df = df[new_order]
df.head()

Unnamed: 0,Host is superhost,Instant bookable,Bedrooms na,Bathrooms na,Beds na,Review scores rating na,Review scores accuracy na,Review scores cleanliness na,Review scores checkin na,Review scores communication na,...,Minimum nights,Number of reviews,Review scores rating,Review scores accuracy,Review scores cleanliness,Review scores checkin,Review scores communication,Review scores location,Review scores value,Price
0,t,t,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,180.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,170.0
1,f,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,111.0,98.0,10.0,10.0,10.0,10.0,10.0,9.0,235.0
2,f,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,17.0,85.0,8.0,8.0,9.0,9.0,9.0,8.0,65.0
3,f,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,8.0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,65.0
4,f,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,27.0,97.0,10.0,10.0,10.0,10.0,10.0,9.0,785.0


In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7146 entries, 0 to 7145
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Host is superhost               7146 non-null   object 
 1   Instant bookable                7146 non-null   object 
 2   Bedrooms na                     7146 non-null   float64
 3   Bathrooms na                    7146 non-null   float64
 4   Beds na                         7146 non-null   float64
 5   Review scores rating na         7146 non-null   float64
 6   Review scores accuracy na       7146 non-null   float64
 7   Review scores cleanliness na    7146 non-null   float64
 8   Review scores checkin na        7146 non-null   float64
 9   Review scores communication na  7146 non-null   float64
 10  Review scores location na       7146 non-null   float64
 11  Review scores value na          7146 non-null   float64
 12  Cancellation policy             71

Now I focus in the datatypes. Some columns are unnecessarily of the type ```object``` or ```float```. I will try to change it to ```string``` or ```integer``` whenever possible. As I check columns one by one, I may also rename the column to a better name.

List of data modifications:

1. Host is superhost, Instant bookable: Change ```t``` $\rightarrow$ ```1``` (true), ```f``` $\rightarrow$ ```0``` (false).

In [154]:
df.rename(columns={'Host is superhost' : 'Superhost'}, inplace = True)
df['Superhost'] = df['Superhost'].apply(lambda x: 1 if x=='t' else (0 if x=='f' else None))
df['Instant bookable'] = df['Instant bookable'].apply(lambda x: 1 if x=='t' else (0 if x=='f' else None))

In [155]:
df.head()

Unnamed: 0,Superhost,Instant bookable,Bedrooms na,Bathrooms na,Beds na,Review scores rating na,Review scores accuracy na,Review scores cleanliness na,Review scores checkin na,Review scores communication na,...,Minimum nights,Number of reviews,Review scores rating,Review scores accuracy,Review scores cleanliness,Review scores checkin,Review scores communication,Review scores location,Review scores value,Price
0,,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,180.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,170.0
1,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,111.0,98.0,10.0,10.0,10.0,10.0,10.0,9.0,235.0
2,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,17.0,85.0,8.0,8.0,9.0,9.0,9.0,8.0,65.0
3,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,8.0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,65.0
4,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,27.0,97.0,10.0,10.0,10.0,10.0,10.0,9.0,785.0


The next columns, ending with ```na```, seems to point entries where there should have a ```NaN```, <i>i.e.<\i>, entries that should not be filled. Example: Bathrooms 

In [162]:
# Counting different entries in column Bathrooms where 'Bathroons na' == 0 (false)
df[df['Bathrooms na'] == 0]['Bathrooms'].value_counts()

1.0     5124
2.0     1005
1.5      480
2.5      173
3.0      132
3.5       54
0.0       39
4.0       32
5.0       26
0.5       17
8.0       15
10.0      12
4.5        8
6.0        6
5.5        1
14.0       1
Name: Bathrooms, dtype: int64

In [163]:
# Counting different entries in column Bathrooms where 'Bathroons na' == 1 (true)
df[df['Bathrooms na'] == 1]['Bathrooms'].value_counts()

1.0    21
Name: Bathrooms, dtype: int64

I will verify if this happens with two other columns and, if yes, I will assume this happens for all columns with ```na``` and treat adequately.

In [166]:
df[df['Bedrooms na']==0]['Bedrooms'].value_counts()

1.0     4197
2.0     1304
0.0      804
3.0      627
4.0      175
5.0       25
6.0        9
7.0        2
14.0       1
Name: Bedrooms, dtype: int64

In [167]:
df[df['Bedrooms na']==1]['Bedrooms'].value_counts()

1.0    2
Name: Bedrooms, dtype: int64

In [172]:
df[df['Review scores rating na']==0]['Review scores rating'].value_counts()

100.0    1603
98.0      677
99.0      617
97.0      596
96.0      418
95.0      348
93.0      249
94.0      234
90.0      167
92.0      141
80.0      125
91.0      108
89.0       71
87.0       66
88.0       58
60.0       44
86.0       41
85.0       27
70.0       21
84.0       20
83.0       16
82.0       14
76.0        8
20.0        8
40.0        6
75.0        6
77.0        6
78.0        5
73.0        5
81.0        5
67.0        4
74.0        2
79.0        2
50.0        2
30.0        1
72.0        1
63.0        1
56.0        1
64.0        1
Name: Review scores rating, dtype: int64

In [169]:
df[df['Review scores rating na']==1]['Review scores rating'].value_counts()

98.0    1421
Name: Review scores rating, dtype: int64

In [171]:
column_name = 'Review scores rating'
result = column_name + ' na'
result

'Review scores rating na'

The same happens, but the number present where should be a ```NaN``` depends on the column, as we can see in the above cell.
Let's replace these wrong values by ```NaNs```.

In [None]:
def includenan(column_name):
    na_counts = df[df[column_name + ' na'] == 1][column_name].value_counts()
    if len(na_counts) == 1 
        df[df[column_name] == na_counts.index[0]][column_name] = 

In [220]:
column_name = 'Bathrooms'
na_counts = df[df[column_name + ' na'] == 1][column_name].value_counts()

In [219]:
na_counts.index[0]

1.0

<h1> CONTINUAR DAQUI </h1>

In [221]:
df[column_name == na_counts.index[0]]

KeyError: False

In [210]:
df[column_name].value_counts()

1.0     5145
2.0     1005
1.5      480
2.5      173
3.0      132
3.5       54
0.0       39
4.0       32
5.0       26
0.5       17
8.0       15
10.0      12
4.5        8
6.0        6
5.5        1
14.0       1
Name: Bathrooms, dtype: int64