### Fetch the artifact we just created (sample.csv) from W&B and read it with pandas

In [1]:
import wandb
import pandas as pd
import uuid

# Tag every EDA run with a short random suffix (first 8 hex characters of a
# UUID4) so that repeated executions are distinguishable in the W&B UI.
run_name = f"EDA_{uuid.uuid4().hex[:8]}"
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    name=run_name,
    save_code=True,  # ask W&B to store the notebook code alongside the run
)

# Declare the "latest" version of sample.csv as an input of this run, obtain a
# local path to its file, and load that file into a DataFrame.
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)

  import pkg_resources
  import pkg_resources
[34m[1mwandb[0m: Currently logged in as: [33mlucphumy[0m ([33mlucphumy-na[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Using ydata-profiling (the successor to pandas-profiling), create a profile

In [2]:
!pip install ydata-profiling

Collecting ydata-profiling
  Using cached ydata_profiling-4.16.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting pydantic>=2 (from ydata-profiling)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Using cached visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Using cached htmlmin-0.1.12-py3-none-any.whl
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Using cached phik-0.12.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting tqdm<5,>=4.48.2 (from ydata-profiling)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting seaborn<0.14,>=0.10.1 (from ydata-profiling)
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Using cached multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting statsmodels<1,>=0.13.2 (from ydata

In [3]:
# pandas-profiling is deprecated. Use its actively-maintained successor ydata-profiling. 
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings

# Workaround recorded by the notebook author: with the numeric chi-squared
# threshold set to 0.0 the chi-square calculation is turned off, which avoids
# the following error during profiling:
#   TypeCheckError: argument "config" (dict) did not match any element in the union:
#     ydata_profiling.config.Settings: is not an instance of ydata_profiling.config.Settings
#     NoneType: is not an instance of NoneType
numeric_overrides = {"num": {"chi_squared_threshold": 0.0}}
cfg = Settings(vars=numeric_overrides)

# Build the profile of the raw sample using the customised settings.
profile = ProfileReport(df, config=cfg)

# Write the report to an HTML file in the current working directory.
profile.to_file("my_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/16 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 57.54it/s][A
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'putmask: first argument must be an array')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Summarise the raw frame (per-column non-null counts and dtypes); the column
# of interest here is last_review and the dtype pandas inferred for it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20000 non-null  int64  
 1   name                            19993 non-null  object 
 2   host_id                         20000 non-null  int64  
 3   host_name                       19992 non-null  object 
 4   neighbourhood_group             20000 non-null  object 
 5   neighbourhood                   20000 non-null  object 
 6   latitude                        20000 non-null  float64
 7   longitude                       20000 non-null  float64
 8   room_type                       20000 non-null  object 
 9   price                           20000 non-null  int64  
 10  minimum_nights                  20000 non-null  int64  
 11  number_of_reviews               20000 non-null  int64  
 12  last_review                     

In [5]:
# Smallest and largest value of the price column in the raw sample.
df["price"].aggregate(["min", "max"])

min        0
max    10000
Name: price, dtype: int64

In [6]:
# Price bounds used to discard outliers; Series.between includes both ends
# by default, so listings priced exactly 10 or 350 are kept.
min_price, max_price = 10, 350

# Boolean mask of the rows whose price lies inside the accepted range.
idx = df["price"].between(min_price, max_price)

# Keep only the in-range rows and, on that new frame, parse last_review
# into datetime values.
df = df.loc[idx].assign(
    last_review=lambda frame: pd.to_datetime(frame["last_review"])
)

In [7]:
# Re-inspect the cleaned frame: last_review is expected to be reported with a
# datetime dtype now, and the row count reflects the price filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  number_

In [8]:
# Sanity check after filtering: min and max price should fall within [10, 350].
df["price"].aggregate(["min", "max"])

min     10
max    350
Name: price, dtype: int64

In [9]:
# Close the W&B run opened by wandb.init at the top of the notebook so it is
# marked as finished.
run.finish()

VBox(children=(Label(value='10.577 MB of 10.577 MB uploaded (0.025 MB deduped)\r'), FloatProgress(value=1.0, m…