In [4]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import datetime as dt


In [8]:
# Load the per-city CSV exports and stack them into one DataFrame per table.

cities = ['barcelona', 'madrid', 'malaga', 'mallorca', 'menorca', 'sevilla', 'valencia']

# File name of each table inside a city's directory.
table_files = {
    'calendar': 'calendar.csv',
    'listings': 'listings 3.csv',
    'neighbourhoods': 'neighbourhoods.csv',
    'reviews': 'reviews 3.csv',
}

# Read every file exactly once and keep the pieces in lists; concatenating
# inside the loop would re-copy all previously loaded rows on each iteration.
city_frames = {table: [] for table in table_files}
for city in cities:
    for table, filename in table_files.items():
        city_frames[table].append(pd.read_csv(f'{city}/{filename}'))

# One concat per table. The pieces are stacked last-city-first ([::-1]), so the
# rows of the last entry in `cities` come first, and the index labels of each
# file are kept as read (no ignore_index). Cells that take .head() of these
# frames depend on that row order.
calendar = pd.concat(city_frames['calendar'][::-1])
listings = pd.concat(city_frames['listings'][::-1])
neighbourhoods = pd.concat(city_frames['neighbourhoods'][::-1])
reviews = pd.concat(city_frames['reviews'][::-1])


In [3]:
# Take the first 200 rows of each combined table as a lightweight sample.
calendar_small, listings_small, neighbourhoods_small, reviews_small = (
    frame.head(200) for frame in (calendar, listings, neighbourhoods, reviews)
)

# Write every sample to its own CSV under madrid/, leaving out the index column.
# NOTE(review): the samples are cut from the combined multi-city frames, yet all
# of them are written into the madrid/ directory — confirm this is intended.
for path, sample in (
    ('madrid/calendar_small.csv', calendar_small),
    ('madrid/listings_small.csv', listings_small),
    ('madrid/neighbourhoods_small.csv', neighbourhoods_small),
    ('madrid/reviews_small.csv', reviews_small),
):
    sample.to_csv(path, index=False)

In [4]:
# Empty DataFrames, one per SQL table, each created with that table's column names.
# The column lists live in a single mapping so the schemas can be read at a glance.
_sql_columns = {
    "neighborhood": ["id", "ubicacion", "descripcion", "nombre", "grupo"],
    "host": [
        "id", "nombre", "ubicacion", "numero_casas", "correo", "telefono",
        "host_desde", "tasa_aceptacion", "tiempo_respuesta", "descripcion",
    ],
    "casa": [
        "id", "host_id", "neighborhood_id", "nombre", "tipo", "descripcion",
        "num_habitaciones", "num_camas", "banos", "ciudad", "latitud",
        "longitud", "calificacion", "num_reviews",
    ],
    "review": ["id", "id_cliente", "id_casa", "fecha", "reviewer_name", "comentarios"],
    "cliente": ["id", "nombre_cliente"],
    "reserva": [
        "id", "casa_id", "cliente_id", "fecha_inicio", "fecha_final",
        "num_personas", "precio", "num_dias",
    ],
}

# Each frame starts with zero rows; only the column labels are set here.
neighborhood = pd.DataFrame(columns=_sql_columns["neighborhood"])
host = pd.DataFrame(columns=_sql_columns["host"])
casa = pd.DataFrame(columns=_sql_columns["casa"])
review = pd.DataFrame(columns=_sql_columns["review"])
cliente = pd.DataFrame(columns=_sql_columns["cliente"])
reserva = pd.DataFrame(columns=_sql_columns["reserva"])


In [None]:
# Empty "processed" DataFrames, one per target table, created with column labels only.


def _empty_frame(*columns):
    """Return a zero-row DataFrame whose columns are `columns`, in the given order."""
    return pd.DataFrame(columns=list(columns))


# Calendar: one availability/price record per listing and date.
proc_calendar = _empty_frame(
    'listing_id', 'date', 'available', 'price', 'adjusted_price',
    'minimum_nights', 'maximum_nights',
)

# Property types.
proc_property_type = _empty_frame('id', 'type')

# Listings: core attributes.
proc_listings = _empty_frame(
    'id', 'name', 'host_id', 'neighbourhood_group', 'neighbourhood', 'city',
    'latitude', 'longitude', 'room_type', 'property_type', 'price',
    'minimum_nights', 'maximum_nights', 'license',
)

# Listings: descriptive fields.
proc_listings_description = _empty_frame(
    'listing_id', 'description', 'picture_url', 'accommodates', 'bathrooms',
    'bathrooms_text', 'bedrooms', 'beds',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
)

# Listings: amenities, one (listing, amenity) pair per row.
proc_listings_amenities = _empty_frame('listing_id', 'amenity')

# Listings: availability fields.
proc_listings_availability = _empty_frame(
    'listing_id',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'calendar_updated', 'has_availability',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'instant_bookable',
)

# Listings: review counts and scores.
proc_listings_reviews = _empty_frame(
    'listing_id', 'number_of_reviews', 'number_of_reviews_ltm',
    'number_of_reviews_l30d', 'first_review', 'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
)

# Neighborhood groups.
proc_neighborhood_groups = _empty_frame('id', 'name', 'neighbourhood_group_cleansed')

# Neighborhoods.
proc_neighborhoods = _empty_frame(
    'id', 'name', 'neighborhood_cleansed', 'neighborhood_group', 'overview', 'city',
)

# Countries.
proc_countries = _empty_frame('iso_3166_1', 'name')

# Cities.
proc_cities = _empty_frame('id', 'name', 'country_code')

# Hosts.
proc_hosts = _empty_frame(
    'id', 'url', 'name', 'host_since', 'location', 'host_about',
    'response_time', 'response_rate', 'acceptance_rate', 'is_superhost',
    'thumbnail_url', 'picture_url', 'neighborhood',
    'listings_count', 'total_listings_count', 'verifications',
    'has_profile_pic', 'identity_verified',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
)

# Users.
proc_users = _empty_frame('id', 'name')

# Reviews.
proc_reviews = _empty_frame('id', 'listing_id', 'date', 'reviewer_id', 'comments')

In [15]:
# Populate the SQL-shaped frames (host, casa, reserva, review, neighborhood)
# column by column from the raw listings / reviews / neighbourhoods tables.


def _select(frame, column_map):
    """Return {target column: frame[source column]} for every pair in `column_map`."""
    return {target: frame[source] for target, source in column_map.items()}


def _assign_columns(targets, table_columns):
    """Assign every Series in `table_columns` to the matching frame in `targets`.

    targets:       dict of table name -> DataFrame; the frames are modified in place.
    table_columns: dict of table name -> {column name: Series}.
    Table names with no entry in `targets` are skipped.
    """
    for table_name, columns in table_columns.items():
        frame = targets.get(table_name)
        if frame is None:
            continue
        for column_name, values in columns.items():
            frame[column_name] = values


# --- listings -> host / casa / reserva -------------------------------------
listings_info = {
    # NOTE(review): 'correo' is filled from host_url and 'telefono' from
    # host_picture_url — confirm these stand-ins are intended.
    'host': _select(listings, {
        'id': 'host_id',
        'nombre': 'host_name',
        'ubicacion': 'host_location',
        'numero_casas': 'host_listings_count',
        'correo': 'host_url',
        'telefono': 'host_picture_url',
        'host_desde': 'host_since',
        'tasa_aceptacion': 'host_acceptance_rate',
        'tiempo_respuesta': 'host_response_time',
        'descripcion': 'host_about',
    }),
    # NOTE(review): 'ciudad' is taken from neighbourhood_group_cleansed and
    # 'neighborhood_id' from neighbourhood_cleansed — verify against the schema.
    'casa': _select(listings, {
        'id': 'id',
        'host_id': 'host_id',
        'neighborhood_id': 'neighbourhood_cleansed',
        'nombre': 'name',
        'tipo': 'property_type',
        'descripcion': 'description',
        'num_habitaciones': 'bedrooms',
        'num_camas': 'beds',
        'banos': 'bathrooms',
        'ciudad': 'neighbourhood_group_cleansed',
        'latitud': 'latitude',
        'longitud': 'longitude',
        'calificacion': 'review_scores_rating',
        'num_reviews': 'number_of_reviews',
    }),
    # Only the listing id is mapped for reserva; no other column is set here.
    'reserva': _select(listings, {
        'casa_id': 'id',
    }),
}
_assign_columns({'host': host, 'casa': casa, 'reserva': reserva}, listings_info)

# --- reviews -> review ------------------------------------------------------
# The 'cliente' mapping (id <- reviewer_id, nombre_cliente <- reviewer_name) is
# currently disabled, so the cliente frame is not filled by this cell.
reviews_info = {
    'review': _select(reviews, {
        'id': 'id',
        'id_cliente': 'reviewer_id',
        'id_casa': 'listing_id',
        'fecha': 'date',
        'reviewer_name': 'reviewer_name',
        'comentarios': 'comments',
    }),
}
_assign_columns({'review': review, 'cliente': cliente}, reviews_info)

# --- neighbourhoods -> neighborhood ----------------------------------------
neighbourhoods_info = {
    'neighborhood': _select(neighbourhoods, {
        'nombre': 'neighbourhood',
        'grupo': 'neighbourhood_group',
    }),
}
_assign_columns({'neighborhood': neighborhood}, neighbourhoods_info)