# 1.Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import os
import zipfile
import json
import copy
import warnings
warnings.filterwarnings('ignore')

In [None]:
## restart the kernel after installing
# %pip install sdv==0.18.0

# 2.Import Datasets

## 2.1. Cleaned Detailed Listings

In [None]:
# Load the cleaned detailed-listings table.
# personal directory
# det_list_df = pd.read_csv('./train_test_data_nashville.csv', dtype={'Unnamed: 0':str})

# GitHub directory
# The listing identifier lives in the CSV's unnamed first column ('Unnamed: 0'), which a later
# cell renames to 'id'. It must be read as text so that it matches the string 'id' column of the
# amenities table when the two frames are merged. 'id' stays in the mapping in case the file
# already carries a column with that name (assumption: pandas ignores dtype keys that are absent).
det_list_df = pd.read_csv(
  '../../data/raw/AirBnB/Samples/listings.csv',
  dtype={'Unnamed: 0': str, 'id': str})

In [None]:
# Dimensions of the listings table as (rows, columns)
det_list_df.shape

(6738, 12)

In [None]:
# Column labels of the listings table
det_list_df.columns

Index(['Unnamed: 0', 'accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'price'],
      dtype='object')

In [None]:
# Preview the first rows of the listings table
det_list_df.head()

Unnamed: 0.1,Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,price
0,72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,104.616438
1,431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,351.986301
2,329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,127.887671
3,1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,133.876712
4,632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,163.739726


In [None]:
# The unnamed first column is relabelled 'id'; it is the key used for the merge further down.
det_list_df = det_list_df.rename({'Unnamed: 0': 'id'}, axis='columns')

In [None]:
# Preview the first rows of the listings table again
det_list_df.head()

Unnamed: 0,id,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,price
0,72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,104.616438
1,431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,351.986301
2,329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,127.887671
3,1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,133.876712
4,632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,163.739726


In [None]:
# sanity check: show the listings row for one known id (compared as a string)
det_list_df.loc[det_list_df['id'].isin(['782004953350872454'])]

Unnamed: 0,id,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,price
6691,782004953350872454,12,2.0,4.0,5.0,1.3,1125.0,1,0,36.17984,-86.74995,420.29589


## 2.2. OHE Amenities

In [None]:
# Load the one-hot-encoded amenities table; 'id' is read as text.
# personal directory alternative:
# ohe_amenities_df = pd.read_csv('/content/OHE_amenities_nashville.csv', dtype={'id':str})

# GitHub directory
ohe_amenities_df = pd.read_csv(
  filepath_or_buffer='../../data/processed/OHE_amenities_nashville.csv',
  dtype=dict(id=str),
)

In [None]:
# Dimensions of the amenities table as (rows, columns)
ohe_amenities_df.shape

(8127, 82)

In [None]:
# Column labels of the amenities table
ohe_amenities_df.columns

Index(['id', 'Free Parking', 'Coffee Maker', 'Patio or Balcony', 'WiFi',
       'Smoke Alarm', 'Air Conditioning', 'TV', 'Heating', 'Essentials',
       'Kitchen', 'Hair Dryer', 'Long Term Stays Allowed', 'Iron', 'Hangers',
       'Shampoo', 'Washer', 'Refrigerator', 'Diningware', 'Hot Water',
       'Microwave', 'Fire Extinguisher', 'Dryer', 'Carbon Monoxide Alarm',
       'Bed Linens', 'Self Check-in', 'Oven', 'Cooking Basics', 'Stove',
       'Dishwasher', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Freezer', 'Body Soap', 'Backyard', 'Clothing Storage', 'Wine Glasses',
       'Conditioner', 'Toaster', 'Cleaning Products', 'Dining Table', 'Keypad',
       'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Crib', 'Pool', 'Gym', 'Blender', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'City Skyline Vi

In [None]:
# sanity check: show the amenities row for the same known id used for the listings table
ohe_amenities_df.loc[ohe_amenities_df['id'].isin(['782004953350872454'])]

Unnamed: 0,id,Free Parking,Coffee Maker,Patio or Balcony,WiFi,Smoke Alarm,Air Conditioning,TV,Heating,Essentials,...,Park View,Resort View,Mountain View,Valley View,Lake View,Golf Course View,Marina View,Canal View,Bay View,Vineyard View
8027,782004953350872454,1,0,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


# 3.Generate Synthetic Listings Dataset

* [Generate Synthetic Dataset with the Synthetic Data Vault (SDV)](https://medium.com/geekculture/generate-synthetic-dataset-with-the-synthetic-data-vault-sdv-26b564b8fe15)
* [SDV Getting Started](https://sdv.dev/SDV/getting_started/quickstart.html#quickstart)
* [SDV Models and Descriptions](https://sdv.dev/SDV/user_guides/single_table/models.html)
* [Evaluation Framework](https://sdv.dev/SDV/user_guides/evaluation/evaluation_framework.html)


## 3.1. Merge Datasets

In [None]:
# Left join: keep every listing row and attach its amenity columns via the shared 'id' key.
new_df = pd.merge(det_list_df, ohe_amenities_df, on='id', how='left')

In [None]:
# Dimensions of the merged table as (rows, columns)
new_df.shape

(6738, 93)

In [None]:
# Column labels of the merged table
new_df.columns

Index(['id', 'accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'price',
       'Free Parking', 'Coffee Maker', 'Patio or Balcony', 'WiFi',
       'Smoke Alarm', 'Air Conditioning', 'TV', 'Heating', 'Essentials',
       'Kitchen', 'Hair Dryer', 'Long Term Stays Allowed', 'Iron', 'Hangers',
       'Shampoo', 'Washer', 'Refrigerator', 'Diningware', 'Hot Water',
       'Microwave', 'Fire Extinguisher', 'Dryer', 'Carbon Monoxide Alarm',
       'Bed Linens', 'Self Check-in', 'Oven', 'Cooking Basics', 'Stove',
       'Dishwasher', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Freezer', 'Body Soap', 'Backyard', 'Clothing Storage', 'Wine Glasses',
       'Conditioner', 'Toaster', 'Cleaning Products', 'Dining Table', 'Keypad',
       'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart L

In [None]:
# sanity check: print every column of the merged row(s) for one known listing id.
# The row filter does not depend on the column being printed, so it is evaluated once
# up front instead of once per column; the printed output is unchanged.
sanity_row = new_df[new_df['id'].isin(['782004953350872454'])]
for col in sanity_row.columns:
  print('{}\n'.format(sanity_row[col]))

6691    782004953350872454
Name: id, dtype: object

6691    12
Name: accommodates, dtype: int64

6691    2.0
Name: num_bathrooms, dtype: float64

6691    4.0
Name: bedrooms, dtype: float64

6691    5.0
Name: beds, dtype: float64

6691    1.3
Name: minimum_nights_avg_ntm, dtype: float64

6691    1125.0
Name: maximum_nights_avg_ntm, dtype: float64

6691    1
Name: availability_ind, dtype: int64

6691    0
Name: host_is_superhost_ind, dtype: int64

6691    36.17984
Name: latitude, dtype: float64

6691   -86.74995
Name: longitude, dtype: float64

6691    420.29589
Name: price, dtype: float64

6691    1
Name: Free Parking, dtype: int64

6691    0
Name: Coffee Maker, dtype: int64

6691    0
Name: Patio or Balcony, dtype: int64

6691    1
Name: WiFi, dtype: int64

6691    1
Name: Smoke Alarm, dtype: int64

6691    1
Name: Air Conditioning, dtype: int64

6691    1
Name: TV, dtype: int64

6691    1
Name: Heating, dtype: int64

6691    0
Name: Essentials, dtype: int64

6691    1
Name: Kitchen, d

## 3.2 Generate Synthetic Datasets


### Credits

* [Generate Synthetic Dataset with the Synthetic Data Vault (SDV)](https://medium.com/geekculture/generate-synthetic-dataset-with-the-synthetic-data-vault-sdv-26b564b8fe15)


### 3.2.1 Create, Fit, and Save Models or Load Models

In [None]:
# import all 4 sdv models
from sdv.tabular import GaussianCopula
from sdv.tabular import CTGAN
from sdv.tabular import CopulaGAN
from sdv.tabular import TVAE

In [None]:
# Switch between the two ways of obtaining the SDV models:
#   True  -> the load cell unzips and loads previously fitted models from disk
#   False -> the create cells fit each model on new_df and save it
use_pickle_files = True

#### 3.2.1.1 Load Models

In [None]:
# # personal directory
# # uncomment to unzip and load fitted models if needed
# with zipfile.ZipFile('/content/SDV Fitted Models.zip', 'r') as zip_ref:
#   try:
#     zip_ref.extractall('/content')
#     print('Unzipped files')
#   except:
#     print('Failed to unzip files')

In [None]:
# GitHub directory
# When use_pickle_files is set: extract the archive of fitted models (best effort) and load
# the four models from the extracted folder.
if use_pickle_files:
  try:
    with zipfile.ZipFile('../../models/SDV Fitted Models.zip', 'r') as zip_ref:
      zip_ref.extractall('../../models')
    print('Unzipped files')
  except (OSError, zipfile.BadZipFile) as unzip_error:
    # Extraction stays best effort (the models may already be extracted), but only
    # archive/file-system errors are caught and the reason is reported instead of hidden.
    print('Failed to unzip files: {}'.format(unzip_error))

  # personal directory
  # model_Gaussian = GaussianCopula.load('/content/Fitted Models/model_Gaussian.pkl')
  # model_CTGAN = CTGAN.load('/content/Fitted Models/model_CTGAN.pkl')
  # model_CopulaGAN = CopulaGAN.load('/content/Fitted Models/model_CopulaGAN.pkl')
  # model_TVAE = TVAE.load('/content/Fitted Models/model_TVAE.pkl')

  # GitHub directory
  # NOTE(review): these .pkl files are presumably unpickled by `load`; only load archives
  # that come from a trusted source.
  model_Gaussian = GaussianCopula.load('../../models/SDV Fitted Models/model_Gaussian.pkl')
  model_CTGAN = CTGAN.load('../../models/SDV Fitted Models/model_CTGAN.pkl')
  model_CopulaGAN = CopulaGAN.load('../../models/SDV Fitted Models/model_CopulaGAN.pkl')
  model_TVAE = TVAE.load('../../models/SDV Fitted Models/model_TVAE.pkl')

#### 3.2.1.2 OR Create Models

In [None]:
%%time

if not use_pickle_files:
  # Train a fresh GaussianCopula model on the merged table, with 'id' declared as primary key.
  model_Gaussian = GaussianCopula(primary_key='id')
  model_Gaussian.fit(new_df)

  # Persist the fitted model (personal directory alternative: '/content/model_Gaussian.pkl').
  # NOTE(review): this writes to '../../models/', while the load cell reads from
  # '../../models/SDV Fitted Models/' - confirm this is intended.
  model_Gaussian.save('../../models/model_Gaussian.pkl')

CPU times: user 2.08 s, sys: 109 ms, total: 2.19 s
Wall time: 2.22 s


In [None]:
%%time

if not use_pickle_files:
  # Train a fresh CTGAN model on the merged table, with 'id' declared as primary key.
  model_CTGAN = CTGAN(primary_key='id')
  model_CTGAN.fit(new_df)

  # Persist the fitted model (personal directory alternative: '/content/model_CTGAN.pkl').
  # NOTE(review): this writes to '../../models/', while the load cell reads from
  # '../../models/SDV Fitted Models/' - confirm this is intended.
  model_CTGAN.save('../../models/model_CTGAN.pkl')

CPU times: user 21min 53s, sys: 9.34 s, total: 22min 2s
Wall time: 22min 27s


In [None]:
%%time

if not use_pickle_files:
  # Train a fresh CopulaGAN model on the merged table, with 'id' declared as primary key.
  model_CopulaGAN = CopulaGAN(primary_key='id')
  model_CopulaGAN.fit(new_df)

  # Persist the fitted model (personal directory alternative: '/content/model_CopulaGAN.pkl').
  # NOTE(review): this writes to '../../models/', while the load cell reads from
  # '../../models/SDV Fitted Models/' - confirm this is intended.
  model_CopulaGAN.save('../../models/model_CopulaGAN.pkl')

CPU times: user 22min 9s, sys: 9.03 s, total: 22min 19s
Wall time: 22min 38s


In [None]:
%%time

if not use_pickle_files:
  # Train a fresh TVAE model on the merged table, with 'id' declared as primary key.
  model_TVAE = TVAE(primary_key='id')
  model_TVAE.fit(new_df)

  # Persist the fitted model (personal directory alternative: '/content/model_TVAE.pkl').
  # NOTE(review): this writes to '../../models/', while the load cell reads from
  # '../../models/SDV Fitted Models/' - confirm this is intended.
  model_TVAE.save('../../models/model_TVAE.pkl')


CPU times: user 6min 29s, sys: 7.71 s, total: 6min 37s
Wall time: 6min 58s


### 3.2.2 Create Synthetic Datasets

In [None]:
# Number of synthetic rows requested from each model's sample() call below
n=7000

In [None]:
%%time

# Draw n synthetic rows from the GaussianCopula model
new_data_model_Gaussian = model_Gaussian.sample(num_rows=n)

CPU times: user 1.22 s, sys: 31.9 ms, total: 1.25 s
Wall time: 2.07 s


In [None]:
%%time

# Draw n synthetic rows from the CTGAN model
new_data_model_CTGAN = model_CTGAN.sample(num_rows=n)

CPU times: user 1.61 s, sys: 18.7 ms, total: 1.63 s
Wall time: 1.65 s


In [None]:
%%time

# Draw n synthetic rows from the CopulaGAN model
new_data_model_CopulaGAN = model_CopulaGAN.sample(num_rows=n)

CPU times: user 2.31 s, sys: 19 ms, total: 2.33 s
Wall time: 2.33 s


In [None]:
%%time

# Draw n synthetic rows from the TVAE model
new_data_model_TVAE = model_TVAE.sample(num_rows=n)

CPU times: user 1.14 s, sys: 10 ms, total: 1.15 s
Wall time: 1.15 s


# 4.Select Most Similar Dataset

In [None]:
# Columns used to compare real and synthetic data.
# 'Source' must stay last: the plotting loops skip it with cols[:-1].
cols = [
  # stay / capacity attributes
  'minimum_nights_avg_ntm', 'num_bathrooms', 'accommodates', 'bedrooms',
  # amenity indicators
  'Keypad', 'Pets Allowed', 'Extra Pillows and Blankets',
  'Hair Dryer', 'Bed Linens', 'Kitchen',
  # location and price
  'longitude', 'latitude', 'price',
  # origin label of each row
  'Source',
]

In [None]:
# Tag every real row with its origin so it can be told apart from synthetic rows
new_df = new_df.assign(Source='AirBnB')

## 4.1. Gaussian Comparison

In [None]:
# Tag every GaussianCopula-sampled row with its origin
new_data_model_Gaussian = new_data_model_Gaussian.assign(Source='Gaussian')

In [None]:
# Stack the real rows on top of the Gaussian rows, restricted to the comparison columns
comp_Gaussian = pd.concat((new_df.loc[:, cols], new_data_model_Gaussian.loc[:, cols]), axis=0)

In [None]:
# One grouped histogram (with a box-plot margin) per compared variable: AirBnB vs. Gaussian.
# The last entry of cols ('Source') is the grouping label, so it is not plotted itself.
for var in cols[:-1]:
  plot_frame = comp_Gaussian[[var, 'Source']]
  fig = px.histogram(
    plot_frame,
    x=var,
    color='Source',
    barmode='group',
    marginal='box',  # plotly also accepts 'rug', 'violin' or 'histogram'
    color_discrete_map={'AirBnB': 'orange', 'Gaussian': 'blue'},
    title='Gaussian',
    width=900,
    height=300,
  )
  fig.show()

## 4.2. CTGAN Comparison

In [None]:
# Tag every CTGAN-sampled row with its origin
new_data_model_CTGAN = new_data_model_CTGAN.assign(Source='CTGAN')

In [None]:
# Stack the real rows on top of the CTGAN rows, restricted to the comparison columns
comp_CTGAN = pd.concat((new_df.loc[:, cols], new_data_model_CTGAN.loc[:, cols]), axis=0)

In [None]:
# One grouped histogram (with a box-plot margin) per compared variable: AirBnB vs. CTGAN.
# The last entry of cols ('Source') is the grouping label, so it is not plotted itself.
for var in cols[:-1]:
  plot_frame = comp_CTGAN[[var, 'Source']]
  fig = px.histogram(
    plot_frame,
    x=var,
    color='Source',
    barmode='group',
    marginal='box',  # plotly also accepts 'rug', 'violin' or 'histogram'
    color_discrete_map={'AirBnB': 'orange', 'CTGAN': '#00CC96'},
    title='CTGAN',
    width=900,
    height=300,
  )
  fig.show()

## 4.3. CopulaGAN Comparison

In [None]:
# Tag every CopulaGAN-sampled row with its origin
new_data_model_CopulaGAN = new_data_model_CopulaGAN.assign(Source='CopulaGAN')

In [None]:
# Stack the real rows on top of the CopulaGAN rows, restricted to the comparison columns
comp_CopulaGAN = pd.concat((new_df.loc[:, cols], new_data_model_CopulaGAN.loc[:, cols]), axis=0)

In [None]:
# One grouped histogram (with a box-plot margin) per compared variable: AirBnB vs. CopulaGAN.
# The last entry of cols ('Source') is the grouping label, so it is not plotted itself.
for var in cols[:-1]:
  plot_frame = comp_CopulaGAN[[var, 'Source']]
  fig = px.histogram(
    plot_frame,
    x=var,
    color='Source',
    barmode='group',
    marginal='box',  # plotly also accepts 'rug', 'violin' or 'histogram'
    color_discrete_map={'AirBnB': 'orange', 'CopulaGAN': '#FF6692'},
    title='CopulaGAN',
    width=900,
    height=300,
  )
  fig.show()

## 4.4. TVAE Comparison

In [None]:
# Tag every TVAE-sampled row with its origin
new_data_model_TVAE = new_data_model_TVAE.assign(Source='TVAE')

In [None]:
# Stack the real rows on top of the TVAE rows, restricted to the comparison columns
comp_TVAE = pd.concat((new_df.loc[:, cols], new_data_model_TVAE.loc[:, cols]), axis=0)

In [None]:
# One grouped histogram (with a box-plot margin) per compared variable: AirBnB vs. TVAE.
# The last entry of cols ('Source') is the grouping label, so it is not plotted itself.
for var in cols[:-1]:
  plot_frame = comp_TVAE[[var, 'Source']]
  fig = px.histogram(
    plot_frame,
    x=var,
    color='Source',
    barmode='group',
    marginal='box',  # plotly also accepts 'rug', 'violin' or 'histogram'
    color_discrete_map={'AirBnB': 'orange', 'TVAE': 'red'},
    title='TVAE',
    width=900,
    height=300,
  )
  fig.show()

# 5.Export Winning Dataset

In [None]:
# Export the CTGAN synthetic dataset as CSV, without the row index.
# personal directory
# new_data_model_CTGAN.drop(columns=['Source'], errors='ignore').to_csv(path_or_buf='/content/CTGAN_SDV.csv', index=False)

# GitHub directory
# The comparison-only 'Source' label is dropped by name rather than by position, so a real
# column is never lost if the label is missing or is not the last column.
new_data_model_CTGAN.drop(columns=['Source'], errors='ignore').to_csv(
  path_or_buf='../../data/processed/CTGAN_SDV.csv', index=False)