# Wrangling parquet file

Trying to figure out why `duckdb` is not loading the dataframe.

In [1]:
import pandas as pd
from pathlib import Path
import duckdb

## Configuration

In [2]:
# Data lives in a `data` directory that is a sibling of the current working directory.
data_folder = Path.cwd().parent / 'data'

# The table name doubles as the file stem for both the parquet and the duckdb file.
table = 'nycitydata'
parquet_file = data_folder.joinpath(f'{table}.parquet')
duckdb_file = data_folder.joinpath(f'{table}.duckdb')

# Sanity-check the paths before anything tries to read from them.
for label, location in (('File', parquet_file), ('Data folder', data_folder)):
    print(f'{label}: {location} exists: {location.exists()}')

File: /home/luiscberrocal/PycharmProjects/hactoberfest-2023/src/data/nycitydata.parquet exists: True
Data folder: /home/luiscberrocal/PycharmProjects/hactoberfest-2023/src/data exists: True


## Loading data

In [3]:
# Read the parquet file configured above into a pandas DataFrame.
df = pd.read_parquet(parquet_file)

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(6000, 46)

Fixed the issue by executing `SET GLOBAL pandas_analyze_sample=100000` on the connection returned by `duckdb.connect`.

In [11]:
# Rebuild the DuckDB file from scratch so re-running this cell never hits an
# already-existing table.
if duckdb_file.exists():
    duckdb_file.unlink()

# The context manager closes the connection even when a statement raises, so a
# failed load does not leave the database file held open by the kernel.
with duckdb.connect(str(duckdb_file)) as conn:
    # Raise the number of rows DuckDB samples when inferring the types of
    # pandas object columns; this is the setting that made the frame load.
    conn.execute("SET GLOBAL pandas_analyze_sample=100000")
    # Expose the DataFrame to SQL under the name `df`, then materialize it.
    conn.register('df', df)
    conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")



## Finding nulls

In [10]:
# Per-column dtype and non-null count; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6000 entries, 0 to 1999
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   unique_key                      6000 non-null   object
 1   created_date                    6000 non-null   object
 2   agency                          6000 non-null   object
 3   agency_name                     6000 non-null   object
 4   complaint_type                  6000 non-null   object
 5   descriptor                      5949 non-null   object
 6   location_type                   5478 non-null   object
 7   incident_zip                    5912 non-null   object
 8   incident_address                5792 non-null   object
 9   street_name                     5792 non-null   object
 10  cross_street_1                  4687 non-null   object
 11  cross_street_2                  4689 non-null   object
 12  address_type                    5974 non-null   objec

In [9]:
# Number of missing values in each column.
df.isna().sum()

unique_key                           0
created_date                         0
agency                               0
agency_name                          0
complaint_type                       0
descriptor                          51
location_type                      522
incident_zip                        88
incident_address                   208
street_name                        208
cross_street_1                    1313
cross_street_2                    1311
address_type                        26
city                               336
status                               0
resolution_description            1353
resolution_action_updated_date    1249
community_board                      0
bbl                                623
borough                              0
x_coordinate_state_plane            76
y_coordinate_state_plane            76
open_data_channel_type               0
park_facility_name                   0
park_borough                         0
latitude                 