In [1]:
import os
import json
import pyperclip
import pandas as pd

In [2]:
DATA_PATH = os.path.join('../Apple Music Activity')
FILE_NAME = 'Apple Music Library Artists.json'
DOCS_PATH = os.path.join('../docs/Library')
DOC_NAME = 'LIBRARY_ARTISTS.md'

In [3]:
with open(os.path.join(DATA_PATH, FILE_NAME), 'r') as f:
    library = json.load(f)

In [4]:
len(library), type(library)

(1178, list)

In [5]:
library[0].keys()

dict_keys(['Artist Identifier', 'Artist Name', 'Date Created', 'Date Created In Library', 'Visible', 'Catalog Identifiers - Artist', 'Favorite Artist - Status', 'Favorite Artist - Date'])

In [6]:
keys_set = set()
for item in library:
    keys_set.add(len(item.keys()))
keys_set

{5, 6, 8}

In [7]:
types_set = set()
for item in library:
    for key in item.keys():
        types_set.add(type(item[key]))
        
types_set

{bool, str}

In [8]:
df = pd.DataFrame(library)

In [9]:
df.shape

(1178, 8)

In [10]:
df.columns

Index(['Artist Identifier', 'Artist Name', 'Date Created',
       'Date Created In Library', 'Visible', 'Catalog Identifiers - Artist',
       'Favorite Artist - Status', 'Favorite Artist - Date'],
      dtype='object')

In [11]:
df['Artist Identifier'].nunique()

1178

In [12]:
df['Artist Identifier'].value_counts()

Artist Identifier
r.002WMz7    1
r.dm2LHpj    1
r.eAcBmrr    1
r.e8Tiy4t    1
r.e6txtMx    1
            ..
r.IvnRXC5    1
r.ItjdXE7    1
r.IsUy0D3    1
r.Is4OJ7j    1
r.zwOlBFp    1
Name: count, Length: 1178, dtype: int64

In [13]:
df['Artist Identifier'].isna().sum()

np.int64(0)

In [14]:
df['Artist Name'].nunique()

1178

In [15]:
df['Artist Name'].value_counts()

Artist Name
Ed Sheeran                    1
Sumit Goswami                 1
CrazyDaniel                   1
Dalip Shekhawat               1
Elton John & Dua Lipa         1
                             ..
SLANDER                       1
TroyBoi                       1
T & Sugah                     1
YoungBoy Never Broke Again    1
Alesso & DubVision            1
Name: count, Length: 1178, dtype: int64

In [16]:
df['Artist Name'].isna().sum()

np.int64(0)

In [17]:
df['Date Created'].nunique()

1000

In [18]:
df['Date Created'].value_counts()

Date Created
2024-07-22T02:18:17Z    30
2023-02-15T08:07:07Z    30
2024-07-22T02:18:20Z    26
2021-06-05T17:48:05Z    22
2024-11-25T19:06:56Z    15
                        ..
2022-10-26T10:09:31Z     1
2022-12-07T07:39:33Z     1
2024-01-19T17:21:58Z     1
2023-06-04T22:00:38Z     1
2022-11-01T21:23:33Z     1
Name: count, Length: 1000, dtype: int64

In [19]:
df['Date Created'].isna().sum()

np.int64(0)

In [20]:
df['Date Created In Library'].nunique()

986

In [21]:
df['Date Created In Library'].value_counts()

Date Created In Library
2023-02-15T08:07:07Z    30
2024-07-22T02:18:17Z    30
2021-06-05T17:48:05Z    27
2024-07-22T02:18:20Z    26
2024-11-25T19:06:56Z    15
                        ..
2024-01-04T07:00:16Z     1
2023-08-07T01:55:36Z     1
2022-10-26T10:09:31Z     1
2022-12-07T07:39:33Z     1
2022-11-01T21:23:33Z     1
Name: count, Length: 986, dtype: int64

In [22]:
df['Date Created In Library'].isna().sum()

np.int64(0)

In [23]:
df['Favorite Artist - Date'].nunique()

186

In [24]:
df['Favorite Artist - Date'].value_counts()

Favorite Artist - Date
2022-10-26T10:00:03Z    1
2022-10-26T10:03:17Z    1
2022-10-26T10:00:19Z    1
2023-04-13T19:39:35Z    1
2022-10-26T09:55:01Z    1
                       ..
2022-10-26T10:02:00Z    1
2024-10-25T18:22:22Z    1
2022-12-07T07:39:33Z    1
2022-10-26T09:52:31Z    1
2024-06-05T03:51:12Z    1
Name: count, Length: 186, dtype: int64

In [25]:
df['Favorite Artist - Date'].isna().sum()

np.int64(992)

In [26]:
df[~df['Favorite Artist - Date'].isna()]['Artist Name']

0          Ed Sheeran
4            Jamie xx
9       Ramin Djawadi
23           Dua Lipa
26       Lost Stories
            ...      
1149            ISOxo
1155            Drake
1157    Flosstradamus
1164        Dom Dolla
1168              PFV
Name: Artist Name, Length: 186, dtype: object

In [27]:
df['Favorite Artist - Status'].nunique()

1

In [28]:
df['Favorite Artist - Status'].value_counts()

Favorite Artist - Status
True    186
Name: count, dtype: int64

In [29]:
df[~df['Favorite Artist - Status'].isna()]['Artist Name']

0          Ed Sheeran
4            Jamie xx
9       Ramin Djawadi
23           Dua Lipa
26       Lost Stories
            ...      
1149            ISOxo
1155            Drake
1157    Flosstradamus
1164        Dom Dolla
1168              PFV
Name: Artist Name, Length: 186, dtype: object

In [30]:
df['Catalog Identifiers - Artist'].nunique()

801

In [31]:
df['Catalog Identifiers - Artist'].value_counts()

Catalog Identifiers - Artist
1             25
827795989      9
430932944      9
4091218        9
1455262408     8
              ..
420126097      1
670534462      1
913944         1
957521979      1
1356377051     1
Name: count, Length: 801, dtype: int64

In [32]:
df[df['Catalog Identifiers - Artist'] == 1]['Artist Name']

Series([], Name: Artist Name, dtype: object)

In [33]:
df_non_nans = df.isna().sum(axis=1)
min_len = df_non_nans.min()
min_len_idx = df_non_nans.idxmin()

In [40]:
mdown_text = f"""
# Library Artists Data Definition

This data dictionary describes the fields in the {FILE_NAME} data. There are {df.shape[0]} rows and {df.shape[1]} columns. 

## Library Artists Data

"""

In [41]:
def generate_markdown_table(df, min_index):
    # Helper function to map data types to more readable formats
    def map_data_type(dtype, sample_value):
        # Check for datetime
        if pd.api.types.is_datetime64_any_dtype(dtype) or (
            isinstance(sample_value, str) and pd.to_datetime(sample_value, errors='coerce') is not pd.NaT
        ):
            return "Datetime"
        # Check for Boolean (Python Boolean or string representation)
        elif isinstance(sample_value, bool) or (isinstance(sample_value, str) and sample_value.lower() in ["true", "false"]):
            return "Boolean"
        elif pd.api.types.is_string_dtype(dtype):
            return "String"
        elif pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_integer_dtype(dtype):
                return "Integer"
            elif pd.api.types.is_float_dtype(dtype):
                return "Float"
        elif isinstance(sample_value, list):
            return "List"
        else:
            return str(dtype)  # Fallback to original dtype if no match

    # Generate the table header
    markdown = "| Column Name | Data Type | Description | Example Value |\n"
    markdown += "|-------------|-----------|-------------|---------------|\n"

    # Iterate over DataFrame columns to generate each row
    for column in df.columns:
        sample_value = df[column].iloc[min_len_idx] if not df[column].empty else "N/A"
        data_type = map_data_type(df[column].dtype, sample_value)
        # Convert example_value to string to avoid formatting issues
        example_value = str(sample_value).replace("\n", " ").replace("|", "\\|")
        # Create a new row for each column
        markdown += f"| `{column}` | {data_type} |  | {example_value} |\n"

    return markdown

mdown_text += generate_markdown_table(df, min_len_idx)



In [42]:
def write_markdown_file(mdown_text, file_path):
    with open(file_path, 'w') as f:
        f.write(mdown_text)
    print(f"Markdown file saved to {file_path}")

In [43]:
write_markdown_file(mdown_text, os.path.join(DOCS_PATH, DOC_NAME))

Markdown file saved to ../docs/Library/LIBRARY_ARTISTS.md
