# <span style="background-color:white;color:red;"> ⌛⌛ EN PROGRESO⌛⌛</span>

## Bloque I
Descomprimir el csv en formato gz

In [6]:
"""  
Importación de ficheros de AIRBNB. Preprocesado y cargado a la base de datos 

Obtenemos los datos desde https://insideairbnb.com/get-the-data/

Podríamos hacer scraping, obtener la fecha de la última actualización de Madrid y generar el enlace para descargarlo, pero no es necesario por que se actualiza cada mucho tiempo y no es necesario tener la última versión. Podemos cambiar la info procesable en la carpeta data manualmente.

"""
import os
import pandas as pd
import numpy as np
import datetime
from config_bd import *
import requests
import sys
import gzip
import shutil

DATA_DIR = os.path.join('data')
# DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
TEMP_DIR = os.path.join(DATA_DIR, 'TEMP')


""" Comprobamos que existe el directorio de datos """

if not os.path.exists(DATA_DIR):
    print(f"El directorio {DATA_DIR} no existe. No es posible continuar...")
    sys.exit()

""" Descomprimimos el archivo de calendario """

if not os.path.exists(TEMP_DIR):
    os.makedirs(TEMP_DIR)


gz_file_path = os.path.join(DATA_DIR, 'listings.csv.gz')
output_file_path = os.path.join(TEMP_DIR, os.path.splitext(os.path.basename(gz_file_path))[0])

if os.path.exists(gz_file_path):
    try:
        with gzip.open(gz_file_path, 'rb') as f_in:
            with open(output_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print(f"Archivo {gz_file_path} descomprimido como {output_file_path}")
    except Exception as e:
        print(f"Error al descomprimir el archivo {gz_file_path}: {e}")
        sys.exit()
else:
    print(f"El archivo {gz_file_path} no existe. No es posible continuar...")
    sys.exit()


Archivo data\listings.csv.gz descomprimido como data\TEMP\listings.csv


## Bloque II
Carga del csv de detalle completo en pandas. Cambios de tipos e imputación

Estos son lo pasos seguidos:

<ul>
<li>Eliminamos campos innecesarios</li>
<li>Eliminamos las propiedades que no se encuentran dentro del perímetro de interés (habitaciones, compartidas, albergues...)</li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
</ul>

In [7]:
df = pd.read_csv(output_file_path, low_memory=False)

Eliminamos campos innecesarios

In [8]:
df = df.drop(columns=['scrape_id','last_scraped','source','neighborhood_overview','host_id','host_url','host_name','host_since','host_location','host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost','host_thumbnail_url','host_picture_url','host_listings_count','host_total_listings_count','host_verifications','host_has_profile_pic','host_identity_verified','neighbourhood','bathrooms_text','beds','amenities','minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights','minimum_nights_avg_ntm','maximum_nights_avg_ntm','has_availability','calendar_last_scraped','number_of_reviews','number_of_reviews_ltm','number_of_reviews_l30d','availability_eoy','number_of_reviews_ly','first_review','last_review','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','license','instant_bookable','calculated_host_listings_count','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms','reviews_per_month','host_about','host_neighbourhood','calendar_updated','availability_30','availability_60','availability_90','availability_365'])

Eliminamos las habitaciones, casas compartidas, hoteles, picaderos, etc, solo queremos viviendas completas

In [None]:
# df['property_type'].value_counts()
# df['room_type'].value_counts()
df = df[df['room_type']== 'Entire home/apt']
df['room_type'].value_counts()

room_type
Entire home/apt    16805
Name: count, dtype: int64

In [14]:
df

Unnamed: 0,id,listing_url,name,description,picture_url,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,price,minimum_nights,maximum_nights,estimated_occupancy_l365d,estimated_revenue_l365d
1,30320,https://www.airbnb.com/rooms/30320,Great Vacational Apartments,,https://a0.muscache.com/pictures/336868/f67409...,Sol,Centro,40.414760,-3.704180,Entire rental unit,Entire home/apt,2,,1.0,,5,180,0,
2,30959,https://www.airbnb.com/rooms/30959,Beautiful loft in Madrid Center,Beautiful Loft 60m2 size just in the historica...,https://a0.muscache.com/pictures/78173471/835e...,Embajadores,Centro,40.412590,-3.701050,Entire loft,Entire home/apt,2,,1.0,,3,730,0,
3,40916,https://www.airbnb.com/rooms/40916,Holiday Apartment Madrid Center,,https://a0.muscache.com/pictures/336736/c3b486...,Universidad,Centro,40.422470,-3.705770,Entire rental unit,Entire home/apt,3,,1.0,,5,180,0,
5,70059,https://www.airbnb.com/rooms/70059,Tu hogar en centro de Madrid.,"Very nice, cozy and bright, right in the cente...",https://a0.muscache.com/pictures/6479679/b14b9...,Universidad,Centro,40.423810,-3.710380,Entire rental unit,Entire home/apt,2,1.0,1.0,$66.00,15,365,0,0.0
6,70310,https://www.airbnb.com/rooms/70310,"Heart of Malasaña Cozy, Quiet & Sunny Apartment",Cozy Apartment in great location in the center...,https://a0.muscache.com/pictures/42404aff-e9c6...,Universidad,Centro,40.422520,-3.702500,Entire rental unit,Entire home/apt,2,1.0,1.0,$89.00,30,360,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25281,1368974630547284843,https://www.airbnb.com/rooms/1368974630547284843,Coqueto estudio en tetuan!,Disconnect from the routine in this charming a...,https://a0.muscache.com/pictures/miso/Hosting-...,Valdeacederas,Tetuán,40.464951,-3.704578,Entire rental unit,Entire home/apt,2,1.0,1.0,$67.00,5,365,0,0.0
25282,1368981094706864086,https://www.airbnb.com/rooms/1368981094706864086,Bonito estudio en Tetuán!,Get away from routine at this unique and relax...,https://a0.muscache.com/pictures/miso/Hosting-...,Valdeacederas,Tetuán,40.468355,-3.706939,Entire rental unit,Entire home/apt,2,1.0,1.0,$63.00,5,365,0,0.0
25283,1368985680426539467,https://www.airbnb.com/rooms/1368985680426539467,Acogedor y nuevo ático con gran terraza (B),"Cozy penthouse floor with large terrace, locat...",https://a0.muscache.com/pictures/miso/Hosting-...,Canillas,Hortaleza,40.464110,-3.647130,Entire rental unit,Entire home/apt,3,1.0,1.0,$103.00,2,365,0,0.0
25284,1368986418960986344,https://www.airbnb.com/rooms/1368986418960986344,Elegante apartamento en Madrid!,Have fun with the whole family at this stylish...,https://a0.muscache.com/pictures/miso/Hosting-...,San Isidro,Carabanchel,40.395983,-3.730953,Entire rental unit,Entire home/apt,5,1.0,2.0,$100.00,5,365,0,0.0


In [15]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,16805.0,6.619263e+17,5.286172e+17,30320.0,40066130.0,8.339923e+17,1.140569e+18,1.368992e+18
latitude,16805.0,40.42273,0.021631,40.33189,40.41048,40.4209,40.43116,40.57729
longitude,16805.0,-3.694871,0.02413785,-3.88399,-3.70704,-3.701052,-3.689055,-3.545904
accommodates,16805.0,3.850521,1.860342,1.0,2.0,4.0,4.0,16.0
bathrooms,13870.0,1.30566,0.6103988,0.0,1.0,1.0,1.5,8.0
bedrooms,16552.0,1.52465,0.9531958,0.0,1.0,1.0,2.0,9.0
minimum_nights,16805.0,8.555132,26.27219,1.0,1.0,2.0,4.0,1125.0
maximum_nights,16805.0,554.4614,420.1542,1.0,364.0,365.0,1125.0,11111.0
estimated_occupancy_l365d,16805.0,95.43445,99.5833,0.0,0.0,56.0,192.0,255.0
estimated_revenue_l365d,13871.0,14697.5,33879.59,0.0,1116.0,8960.0,21420.0,2636136.0


## Bloque III
Importamos a la base de datos. 
**Requiere haber creado la tabla** <code>01_CREATE TABLE bnb_property.sql</code>

In [10]:
from config_bd import get_connection

In [11]:
df.to_csv(os.path.join(TEMP_DIR,'transform_calendar.csv'), index=False)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9236806 entries, 0 to 9236805
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   property_id     int64  
 1   date            object 
 2   available       int64  
 3   price           float64
 4   minimum_nights  int64  
 5   maximum_nights  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 422.8+ MB


In [None]:
from tqdm import tqdm
# from time import sleep

with get_connection() as conn:
    cursor = conn.cursor()
    cursor.fast_executemany = True 

    CHUNKSIZE = 500
    for chunk in tqdm(
        pd.read_csv(os.path.join(TEMP_DIR, 'transform_calendar.csv'), chunksize=CHUNKSIZE),
        unit_scale=CHUNKSIZE, unit=" filas"
    ):

        insert_sql = """
        INSERT INTO dbo.bnb_calendar
            (property_id, [date], available, price,
            minimum_nights, maximum_nights)
        VALUES (?,?,?,?,?,?)
        """
        cursor.executemany(insert_sql, chunk.values.tolist())
        conn.commit()        




0 filas [00:00, ? filas/s]

9300000 filas [08:31, 18174.81 filas/s]


Elimino el fichero para no dejar duplicado el csv

In [42]:
os.remove(os.path.join(TEMP_DIR,'transform_calendar.csv'))