In [1]:
import pandas
import os


# Folder the raw Airbnb input file is read from.
data_directory = '../../Data/Raw/airbnb/'

# Folder the processed Airbnb output is written to.
output_directory = '../../Data/Processed/airbnb/'

# Load Data 

In [2]:
# Build the path to the raw CSV. data_directory already ends with a path
# separator, so os.path.join is used instead of string concatenation to
# avoid a doubled '//' in the resulting path.
airbnb_file = os.path.join(data_directory, 'airbnb_queens_2019.csv')

# Load the CSV into a DataFrame and drop the 'neighbourhood_group' column,
# which is not needed for the analysis. drop() is chained (rather than
# called with inplace=True) so loading and pruning form one expression.
df_airbnb = pandas.read_csv(airbnb_file).drop(columns=['neighbourhood_group'])

# Print the shape of the DataFrame.
print('Airbnb: ', df_airbnb.shape)

Airbnb:  (5666, 15)


In [3]:
# Count the null entries in each column and print the per-column totals.
null_counts = df_airbnb.isnull().sum()
print(null_counts)

id                                   0
name                                 0
host_id                              0
host_name                            2
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       1092
reviews_per_month                 1092
calculated_host_listings_count       0
availability_365                     0
dtype: int64


# AIRBNB

In [None]:
# Preview the first five rows of the loaded frame.
df_airbnb.head(n=5)

# Fill NaN Values 

In [15]:
# Replacement value for every column that contains NaN.
# The text columns (host_name, last_review) are filled with the string '0'
# rather than the integer 0: the categorical cell casts them to str anyway,
# so the final value is the same, but the columns never hold a mix of
# strings and integers in between.
# NOTE(review): a missing host_name / last_review ends up as the literal
# text '0' -- confirm that this sentinel is what downstream consumers expect.
nan_rule = {
    'host_name': '0',
    'last_review': '0',
    'reviews_per_month': 0,
}

# Fill all listed columns in one call; fillna accepts a {column: value}
# mapping, so no per-column loop is needed.
df_airbnb = df_airbnb.fillna(value=nan_rule)

print(df_airbnb.shape)

(5666, 15)


# Numeric Variables 

In [17]:
# Columns treated as numeric features.
num_variables = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Cast each numeric column to float, then replace any remaining NaN with 0.
# errors='ignore' suppresses a failed cast and hands back the column
# unchanged instead of raising.
for column in num_variables:
    as_float = df_airbnb[column].astype(float, errors='ignore')
    df_airbnb[column] = as_float.fillna(0)

# Categorical Variables 

In [18]:
# Columns treated as categorical (text) features.
cat_variables = [
    'name',
    'host_name',
    'neighbourhood',
    'room_type',
    'last_review',
]

# Cast every categorical column to str in a single step.
df_airbnb[cat_variables] = df_airbnb[cat_variables].astype(str)

# Prepared Data 

## Features 

In [19]:
# Preview the first five rows of the prepared frame.
df_airbnb.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,12937,"1 Stop fr. Manhattan! Private Suite,Landmark B...",50124,Orestes,Long Island City,40.74771,-73.9474,Private room,130.0,3.0,248.0,01-07-19,2.25,1.0,215.0
1,18198,Little King of Queens,70091,Justin,Woodside,40.75038,-73.90334,Private room,70.0,30.0,25.0,31-05-19,0.22,1.0,324.0
2,32363,Fully Furnished Basement Apartment,140025,Fredah,Flushing,40.74028,-73.83168,Private room,140.0,2.0,1.0,19-09-11,0.01,1.0,1.0
3,39593,"A room w/ a Manhattan view, longer stay",110506,Myung,Sunnyside,40.74559,-73.92313,Private room,79.0,30.0,28.0,12-04-19,0.26,1.0,126.0
4,45910,Beautiful Queens Brownstone! - 5BR,204539,Mark,Ridgewood,40.70382,-73.89797,Entire home/apt,350.0,8.0,10.0,12-05-19,0.11,5.0,365.0


# Export Data

In [20]:
# Exploratory check: print the built-in documentation for os.makedirs.
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [21]:
# Create the output directory together with any missing parent directories.
# exist_ok=True means no error is raised when the directory already exists,
# so the cell can be re-run safely.
os.makedirs(output_directory, exist_ok=True)

# output_directory already ends with a path separator, so os.path.join is
# used instead of string concatenation to avoid a doubled '//' in the path.
airbnb_file = os.path.join(output_directory, 'airbnb.parquet')

# Write the prepared features to a Parquet file.
df_airbnb.to_parquet(airbnb_file)