# Exploring the Metadata of Data Frames


## Exploring data frames


In [1]:
import pandas as pd


### Using the ABQ business data set

In [2]:
# Remote location of the tab-separated ABQ business export.
url = "https://ddc-datascience.s3.amazonaws.com/a-z.business/2023-08-21/combined.txt"
# Local file name that pandas reads the data from.
csv = "abq_bus.csv"

In [3]:
!curl -O {url}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 36.3M  100 36.3M    0     0  28.6M      0  0:00:01  0:00:01 --:--:-- 28.6M


In [4]:
!ln -s combined.txt abq_bus.csv

In [5]:
# Load the tab-separated file; dtype="str" keeps every column as text,
# so no numeric type inference is applied.
abq_bus = pd.read_csv(csv, sep="\t", dtype="str")
# (number of rows, number of columns)
abq_bus.shape

(38111, 228)

In [6]:
# Name the two dimensions of the frame and display them as a labelled mapping.
rows = abq_bus.shape[0]
columns = abq_bus.shape[1]
dict(Rows=rows, Columns=columns)

{'Rows': 38111, 'Columns': 228}

## Using a module on GitHub

In [7]:
# from https://github.com/rwcitek/example-c11/blob/main/python.modules/metadata.py

!curl -s -O https://raw.githubusercontent.com/rwcitek/example-c11/main/python.modules/metadata.py

# normally importing into the global namespace is very much frowned upon
from metadata import *

# this is preferred
# import metadata as md


In [8]:
ls -la

total 37272
drwxr-xr-x 1 root root     4096 Jun 20 21:37 [0m[01;34m.[0m/
drwxr-xr-x 1 root root     4096 Jun 20 21:34 [01;34m..[0m/
lrwxrwxrwx 1 root root       12 Jun 20 21:36 [01;36mabq_bus.csv[0m -> combined.txt
-rw-r--r-- 1 root root 38138507 Jun 20 21:36 combined.txt
drwxr-xr-x 4 root root     4096 Jun 18 13:23 [01;34m.config[0m/
-rw-r--r-- 1 root root     1163 Jun 20 21:37 metadata.py
drwxr-xr-x 2 root root     4096 Jun 20 21:37 [01;34m__pycache__[0m/
drwxr-xr-x 1 root root     4096 Jun 18 13:23 [01;34msample_data[0m/


### Looking at ABQ business data

In [9]:
# Let pandas print up to 1,000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1_000

In [10]:
# Summarise abq_bus with metadata() (presumably the helper pulled in by
# `from metadata import *` -- confirm against metadata.py).
# The result is stored so it can be displayed again without recomputing.
md_abq_bus = metadata( abq_bus )
md_abq_bus

Unnamed: 0,count,unique,top,freq,Nulls,Nulls_pct,Data_types,Memory
Source,38111,1,AtoZDatabases,38111.0,0,0.0,object,2667770
Date,38111,2,08/21/2023,21111.0,0,0.0,object,2553437
Obsolescence Date,38111,2,02/21/2024,21111.0,0,0.0,object,2553437
Business Name,38111,34419,Capital One Bank - ATM,83.0,0,0.0,object,2940613
Legal Name,8034,3491,University of New Mexico - NM Poison & Drug In...,225.0,30077,78.9,object,1620755
Physical Address,37560,23160,2211 Lomas Blvd NE,283.0,551,1.4,object,2954109
Physical Address Number,37554,5255,201,443.0,557,1.5,object,2303671
Physical Pre Direction,91,5,N,42.0,38020,99.8,object,1221919
Physical Address Name,37560,3543,Central,1712.0,551,1.4,object,2431589
Physical Address Suffix,36267,30,Blvd,11616.0,1844,4.8,object,2230062


In [11]:
# Display the stored summary again (no recomputation).
md_abq_bus

Unnamed: 0,count,unique,top,freq,Nulls,Nulls_pct,Data_types,Memory
Source,38111,1,AtoZDatabases,38111.0,0,0.0,object,2667770
Date,38111,2,08/21/2023,21111.0,0,0.0,object,2553437
Obsolescence Date,38111,2,02/21/2024,21111.0,0,0.0,object,2553437
Business Name,38111,34419,Capital One Bank - ATM,83.0,0,0.0,object,2940613
Legal Name,8034,3491,University of New Mexico - NM Poison & Drug In...,225.0,30077,78.9,object,1620755
Physical Address,37560,23160,2211 Lomas Blvd NE,283.0,551,1.4,object,2954109
Physical Address Number,37554,5255,201,443.0,557,1.5,object,2303671
Physical Pre Direction,91,5,N,42.0,38020,99.8,object,1221919
Physical Address Name,37560,3543,Central,1712.0,551,1.4,object,2431589
Physical Address Suffix,36267,30,Blvd,11616.0,1844,4.8,object,2230062


In [12]:
%%capture output
%%bash
# Install command-line tools for working with CSV/JSON from the shell.
# csvjson and jq are used by a later cell. The apt-get log is not shown:
# %%capture stores it in the Python variable `output` instead.
apt-get update
# To discover CSV-related packages: apt-cache search csv
apt-get install -y csvkit csvtool jq tree


In [13]:
%%bash
# List every executable whose name starts with "csv" in each PATH directory.
# Quoting and `IFS= read -r` keep directory names containing spaces or
# backslashes intact; the glob stays outside the quotes so it still expands.
echo "$PATH" |
tr : '\n' |
while IFS= read -r folder ; do
  # Directories without a match make ls fail; ignore that and keep going.
  ls -la "${folder}"/csv* 2> /dev/null || true
done

-rwxr-xr-x 1 root root     957 Sep 14  2021 /usr/bin/csvclean
-rwxr-xr-x 1 root root     953 Sep 14  2021 /usr/bin/csvcut
-rwxr-xr-x 1 root root     959 Sep 14  2021 /usr/bin/csvformat
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvgrep
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvjoin
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvjson
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvlook
-rwxr-xr-x 1 root root     951 Sep 14  2021 /usr/bin/csvpy
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvsort
-rwxr-xr-x 1 root root     953 Sep 14  2021 /usr/bin/csvsql
-rwxr-xr-x 1 root root     957 Sep 14  2021 /usr/bin/csvstack
-rwxr-xr-x 1 root root     955 Sep 14  2021 /usr/bin/csvstat
-rwxr-xr-x 1 root root 1560880 Jan 26  2022 /usr/bin/csvtool
-rwxr-xr-x 1 root root     957 Sep 14  2021 /bin/csvclean
-rwxr-xr-x 1 root root     953 Sep 14  2021 /bin/csvcut
-rwxr-xr-x 1 root root     959 Sep 14  2021 /bin/csvformat
-rwxr-xr-x 1 root root     955 Sep

In [17]:
# Header line plus every line mentioning "tricore" (case-insensitive), cut down
# to the header and the first match, converted from tab-separated text to JSON
# (csvjson -t) and pretty-printed by jq.
# NOTE: the braces are a shell command group, not IPython {var} interpolation.
!{ head -1 abq_bus.csv ; grep -i tricore abq_bus.csv ; } | head -2 | csvjson -t | jq .


[1;39m[
  [1;39m{
    [0m[34;1m"Source"[0m[1;39m: [0m[0;32m"AtoZDatabases"[0m[1;39m,
    [0m[34;1m"Date"[0m[1;39m: [0m[0;32m"2023-08-10"[0m[1;39m,
    [0m[34;1m"Obsolescence Date"[0m[1;39m: [0m[0;32m"2024-02-10"[0m[1;39m,
    [0m[34;1m"Business Name"[0m[1;39m: [0m[0;32m"Dr Lorene S Valdez Boyle Md"[0m[1;39m,
    [0m[34;1m"Legal Name"[0m[1;39m: [0m[0;32m"Tricore Reference Laboratories"[0m[1;39m,
    [0m[34;1m"Physical Address"[0m[1;39m: [0m[0;32m"801 Encino Pl NE Ste C1"[0m[1;39m,
    [0m[34;1m"Physical Address Number"[0m[1;39m: [0m[0;39m801[0m[1;39m,
    [0m[34;1m"Physical Pre Direction"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"Physical Address Name"[0m[1;39m: [0m[0;32m"Encino"[0m[1;39m,
    [0m[34;1m"Physical Address Suffix"[0m[1;39m: [0m[0;32m"Pl"[0m[1;39m,
    [0m[34;1m"Physical Post Direction"[0m[1;39m: [0m[0;32m"NE"[0m[1;39m,
    [0m[34;1m"Physical City"[0m[1;39m: [0m[0;32m"Albuq

In [18]:
# Header line plus every line containing 8731005, aligned into a table on the
# tab separators (column -s$'\t' -t) and numbered by cat -n.
!{ head -1 abq_bus.csv ; grep -i 8731005 abq_bus.csv ; } | column -s$'\t' -t | cat -n


     1	Source         Date        Obsolescence Date  Business Name                      Legal Name              Physical Address            Physical Address Number  Physical Pre Direction  Physical Address Name  Physical Address Suffix  Physical Post Direction  Physical City  Physical State  Physical ZIP  Physical ZIP 4  Key Executive Name  First Name  Middle Initial  Last Name  Title                    Gender   Location Employee Size  Corporate Employee Size  Revenue / Yr  Mailing Address             Mailing Address Number  Mailing Pre Direction  Mailing Address Name  Mailing Address Suffix  Mailing Post Direction  Mailing City  Mailing State  Mailing ZIP  Mailing ZIP 4  Phone           Fax             Toll-Free       County Name  County Population  Metro Area       Latitude    Longitude     EIN  Main Line of Business                                                        Location Type  Importer or Exporter  Manufacturer  Primary SIC  Primary SIC Description                           

In [19]:
# Header line plus every line containing 541714 anywhere in the raw text,
# numbered by cat -n. (541714 is presumably a NAICS code: the next cell
# searches the NAICS columns for the same value.)
!{ head -1 abq_bus.csv ; grep -i 541714 abq_bus.csv ; } | cat -n


     1	Source	Date	Obsolescence Date	Business Name	Legal Name	Physical Address	Physical Address Number	Physical Pre Direction	Physical Address Name	Physical Address Suffix	Physical Post Direction	Physical City	Physical State	Physical ZIP	Physical ZIP 4	Key Executive Name	First Name	Middle Initial	Last Name	Title	Gender	Location Employee Size	Corporate Employee Size	Revenue / Yr	Mailing Address	Mailing Address Number	Mailing Pre Direction	Mailing Address Name	Mailing Address Suffix	Mailing Post Direction	Mailing City	Mailing State	Mailing ZIP	Mailing ZIP 4	Phone	Fax	Toll-Free	County Name	County Population	Metro Area	Latitude	Longitude	EIN	Main Line of Business	Location Type	Importer or Exporter	Manufacturer	Primary SIC	Primary SIC Description	SIC02	SIC02.Description	SIC03	SIC03.Description	SIC04	SIC04.Description	SIC05	SIC05.Description	SIC06	SIC06.Description	SIC07	SIC07.Description	SIC08	SIC08.Description	SIC09	SIC09.Description	SIC10	SIC10.Description	NAICS 1	NAICS 1 Description	NAIC

In [20]:
# NAICS code columns only: the name contains "NAICS" but not "Desc".
cols = [x for x in abq_bus.columns if "NAICS" in x and "Desc" not in x]

# True for rows where any of those columns contains "541714".
# The test runs once per column with vectorised string matching instead of
# joining and scanning every row in Python; the resulting mask is the same,
# because a match can never span two cells.
# NOTE: `filter` shadows the Python builtin; the name is kept because a later
# cell sums this mask.
filter = (
    abq_bus[cols]
    .apply(lambda col: col.astype(str).str.contains("541714", regex=False, na=False))
    .any(axis=1)
)
abq_bus[filter]


Unnamed: 0,Source,Date,Obsolescence Date,Business Name,Legal Name,Physical Address,Physical Address Number,Physical Pre Direction,Physical Address Name,Physical Address Suffix,...,Est. Rent Annual Expense,Est. Technology Annual Expense,Est. Telecom Annual Expense,Est. Utilities Annual Expense,AtoZ ID,Home Based Business,Franchise Type,Holding Parent Name,Source.1,Unnamed: 227
2534,AtoZDatabases,08/10/2023,02/10/2024,Alpha Omega Power Technologies,,8504 Calle Alameda NE,8504,,Calle Alam,,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11132551116480,No,,,AtoZDatabases,
3539,AtoZDatabases,08/10/2023,02/10/2024,Armonica Technologies,,5901 Indian School Rd NE,5901,,Indian Sch,Rd,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11140024245105,No,,,AtoZDatabases,
5168,AtoZDatabases,08/10/2023,02/10/2024,BennuBio,,6610 Gulton Ct NE,6610,,Gulton,Ct,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11140024165184,No,,,AtoZDatabases,
5499,AtoZDatabases,08/10/2023,02/10/2024,Biophagy,,5901 Indian School Rd NE,5901,,Indian Sch,Rd,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11140024245091,No,,,AtoZDatabases,
5500,AtoZDatabases,08/10/2023,02/10/2024,Bio-Save Resources of Albuquerque,Bio Save Resources Inc,701 2nd St SW,701,,2nd,St,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11132549821335,No,,,AtoZDatabases,
8218,AtoZDatabases,08/10/2023,02/10/2024,Circular Genomics,,5901 Indian School Rd NE,5901,,Indian Sch,Rd,...,"$2,500 to $4,999","$2,500 to $4,999","Up to $1,999","$2,500 to $7,499",11140035419965,No,,,AtoZDatabases,
9521,AtoZDatabases,08/10/2023,02/10/2024,curia,,4401 Alexander Blvd NE,4401,,Alexander,Blvd,...,"$25,000 to $99,999","$25,000 to $99,999","$5,000 to $9,999","$20,000 to $49,999",11132575497359,No,Chain Location,,AtoZDatabases,
17160,AtoZDatabases,08/21/2023,02/21/2024,Indica Labs,,8700 Education Pl NW Ste B,8700,,Education,Pl,...,"$25,000 to $99,999","$25,000 to $99,999","$25,000 to $99,999","$50,000 to $99,999",11140001673839,No,,,AtoZDatabases,
23147,AtoZDatabases,08/21/2023,02/21/2024,MedPharmics,MedPharmics LLC,883 Lead Ave SE Ste B,883,,Lead,Ave,...,"$25,000 to $99,999","$100,000 or more","$10,000 to $24,999","$50,000 to $99,999",11140024508033,No,,,AtoZDatabases,
25538,AtoZDatabases,08/21/2023,02/21/2024,Nob Hill Therapeutics,,5901 Indian School Rd NE,5901,,Indian Sch,Rd,...,"$25,000 to $99,999","$25,000 to $99,999","$10,000 to $24,999","$20,000 to $49,999",11140024556530,No,,,AtoZDatabases,


In [21]:
# Number of matching rows. Series.sum() counts the True values in one
# vectorised step; int() keeps the displayed result a plain Python integer.
int(filter.sum())

12

In [22]:
# First record as a Series: one line per column, which is easier to read
# than a single very wide row.
abq_bus.iloc[0]


Source                                                  AtoZDatabases
Date                                                       08/10/2023
Obsolescence Date                                          02/10/2024
Business Name                                             0 Locskmith
Legal Name                                                        NaN
Physical Address                                   10500 Coors Byp NW
Physical Address Number                                         10500
Physical Pre Direction                                            NaN
Physical Address Name                                           Coors
Physical Address Suffix                                           Byp
Physical Post Direction                                            NW
Physical City                                             Albuquerque
Physical State                                                     NM
Physical ZIP                                                    87114
Physical ZIP 4      

In [23]:
# How many businesses fall under each (NAICS 1 code, description) pair,
# most frequent first.
abq_bus.value_counts(subset=["NAICS 1", "NAICS 1 Description"])


NAICS 1  NAICS 1 Description                                                                                
621111   Offices of Physicians (except Mental Health Specialists)                                               3901
541110   Offices of Lawyers                                                                                     1845
722511   Full-Service Restaurants                                                                               1459
621399   Offices of All Other Miscellaneous Health Practitioners                                                 926
531210   Offices of Real Estate Agents and Brokers                                                               854
236116   New Multifamily Housing Construction (except For-Sale Builders)                                         851
561990   All Other Support Services                                                                              824
624190   Other Individual and Family Services                           

In [24]:
# Frequency of each NAICS 1 description as a two-column table
# (description, count), most frequent first.
(
    abq_bus[["NAICS 1 Description"]]
    .value_counts()
    .reset_index()
)


Unnamed: 0,NAICS 1 Description,count
0,Offices of Physicians (except Mental Health Sp...,3901
1,Offices of Lawyers,1845
2,Full-Service Restaurants,1459
3,Offices of All Other Miscellaneous Health Prac...,926
4,Offices of Real Estate Agents and Brokers,854
5,New Multifamily Housing Construction (except F...,851
6,All Other Support Services,824
7,Other Individual and Family Services,808
8,Unclassified Establishments,748
9,Other Activities Related to Credit Intermediation,703
