In [3]:
import os
import json
import pyperclip
import pandas as pd

In [53]:
DATA_PATH = os.path.join('../Apple Music Activity')
FILE_NAME = 'Apple Music Library Albums.json'
DOCS_PATH = os.path.join('../docs/Library')
DOC_NAME = 'LIBRARY_ALBUMS.md'

In [19]:
with open(os.path.join(DATA_PATH, FILE_NAME), 'r') as f:
    library = json.load(f)

In [20]:
len(library), type(library)

(1702, list)

In [22]:
library[0].keys()

dict_keys(['Album ID', 'Title', 'Date Created', 'Date Created In Library', 'Visible', 'Catalog Identifiers - Album'])

In [23]:
keys_set = set()
for item in library:
    keys_set.add(len(item.keys()))
keys_set

{5, 6, 8}

In [24]:
types_set = set()
for item in library:
    for key in item.keys():
        types_set.add(type(item[key]))
        
types_set

{bool, str}

In [25]:
df = pd.DataFrame(library)

In [26]:
df.shape

(1702, 8)

In [27]:
df.columns

Index(['Album ID', 'Title', 'Date Created', 'Date Created In Library',
       'Visible', 'Catalog Identifiers - Album', 'Favorite Album - Status',
       'Favorite Album - Date'],
      dtype='object')

In [28]:
df['Album ID'].nunique()

1702

In [29]:
df['Album ID'].value_counts()

Album ID
l.00DcEq9    1
l.i0nhnht    1
l.hxxlqdx    1
l.hxO7jXw    1
l.hwsHup8    1
            ..
l.M7Agr7e    1
l.M48MNh8    1
l.Lz6OpSi    1
l.LqBtKBn    1
l.ztDqLco    1
Name: count, Length: 1702, dtype: int64

In [30]:
df['Album ID'].isna().sum()

0

In [31]:
df['Title'].nunique()

1597

In [32]:
df['Title'].value_counts()

Title
                                                        82
Shang-Chi and The Legend of The Ten Rings: The Album     3
Hate Me - Single                                         2
Kalyug (Original Motion Picture Soundtrack)              2
Lahore - Single                                          2
                                                        ..
Punya Paap                                               1
PTSD                                                     1
Upper Echelon (feat. T.I. & 2 Chainz) - Single           1
Baarishein - Single                                      1
Until Now                                                1
Name: count, Length: 1597, dtype: int64

In [33]:
df['Title'].isna().sum()

0

In [34]:
df[df['Title'] == '']['Title']

72       
106      
151      
164      
211      
       ..
1674     
1675     
1676     
1677     
1678     
Name: Title, Length: 82, dtype: object

In [35]:
df['Date Created'].nunique()

1397

In [36]:
df['Date Created'].value_counts()

Date Created
2023-02-15T08:07:07Z    57
2024-07-22T02:18:17Z    43
2024-07-22T02:18:20Z    31
2022-04-28T00:59:21Z    25
2021-06-05T17:48:05Z    24
                        ..
2023-07-12T21:04:40Z     1
2022-07-06T19:02:22Z     1
2022-12-17T22:07:57Z     1
2023-12-30T09:15:28Z     1
2023-11-01T06:36:09Z     1
Name: count, Length: 1397, dtype: int64

In [37]:
df['Date Created'].isna().sum()

0

In [38]:
df['Date Created In Library'].nunique()

1363

In [39]:
df['Date Created In Library'].value_counts()

Date Created In Library
2023-02-15T08:07:07Z    57
2024-07-22T02:18:17Z    43
2024-07-22T02:18:20Z    31
2021-06-05T17:48:05Z    30
2022-04-28T01:04:28Z    25
                        ..
2021-06-23T01:47:58Z     1
2023-05-14T17:18:11Z     1
2021-03-08T07:31:02Z     1
2022-04-14T21:59:56Z     1
2023-11-01T06:36:09Z     1
Name: count, Length: 1363, dtype: int64

In [40]:
df['Date Created In Library'].isna().sum()

0

In [41]:
df['Favorite Album - Date'].nunique()

2

In [42]:
df['Favorite Album - Date'].value_counts()

Favorite Album - Date
2021-03-26T01:48:22Z    1
2023-02-17T21:24:35Z    1
Name: count, dtype: int64

In [43]:
df['Favorite Album - Date'].isna().sum()

1700

In [44]:
df[~df['Favorite Album - Date'].isna()]['Title']

864             Vaaqif
1129    Quest For Fire
Name: Title, dtype: object

In [45]:
df['Favorite Album - Status'].nunique()

1

In [46]:
df['Favorite Album - Status'].value_counts()

Favorite Album - Status
True    2
Name: count, dtype: int64

In [47]:
df[~df['Favorite Album - Status'].isna()]['Title']

864             Vaaqif
1129    Quest For Fire
Name: Title, dtype: object

In [48]:
df['Catalog Identifiers - Album'].nunique()

1417

In [49]:
df['Catalog Identifiers - Album'].value_counts()

Catalog Identifiers - Album
1             69
1440829274     2
409001929      1
1253102461     1
1655166004     1
              ..
1706792328     1
1660225374     1
1542201783     1
1563859312     1
1021728914     1
Name: count, Length: 1417, dtype: int64

In [50]:
df[df['Catalog Identifiers - Album'] == 1]['Title']

Series([], Name: Title, dtype: object)

In [54]:
df_non_nans = df.isna().sum(axis=1)
min_len = df_non_nans.min()
min_len_idx = df_non_nans.idxmin()

In [51]:
mdown_text = f"""
# Library Albums Data Definition

This data dictionary describes the fields in the {FILE_NAME} data. There are {df.shape[0]} rows and {df.shape[1]} columns. 

## Library Track Data

"""

'\n# Library Albums Data Definition\n\nThis data dictionary describes the fields in the Apple Music Library Albums.json data. There are 1702 rows and 8 columns. \n\n## Library Track Data\n\n'

In [56]:
def generate_markdown_table(df, min_index):
    # Helper function to map data types to more readable formats
    def map_data_type(dtype, sample_value):
        # Check for datetime
        if pd.api.types.is_datetime64_any_dtype(dtype) or (
            isinstance(sample_value, str) and pd.to_datetime(sample_value, errors='coerce') is not pd.NaT
        ):
            return "Datetime"
        # Check for Boolean (Python Boolean or string representation)
        elif isinstance(sample_value, bool) or (isinstance(sample_value, str) and sample_value.lower() in ["true", "false"]):
            return "Boolean"
        elif pd.api.types.is_string_dtype(dtype):
            return "String"
        elif pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_integer_dtype(dtype):
                return "Integer"
            elif pd.api.types.is_float_dtype(dtype):
                return "Float"
        elif isinstance(sample_value, list):
            return "List"
        else:
            return str(dtype)  # Fallback to original dtype if no match

    # Generate the table header
    markdown = "| Column Name | Data Type | Description | Example Value |\n"
    markdown += "|-------------|-----------|-------------|---------------|\n"

    # Iterate over DataFrame columns to generate each row
    for column in df.columns:
        sample_value = df[column].iloc[min_len_idx] if not df[column].empty else "N/A"
        data_type = map_data_type(df[column].dtype, sample_value)
        # Convert example_value to string to avoid formatting issues
        example_value = str(sample_value).replace("\n", " ").replace("|", "\\|")
        # Create a new row for each column
        markdown += f"| `{column}` | {data_type} |  | {example_value} |\n"

    return markdown

mdown_text += generate_markdown_table(df, min_len_idx)



In [57]:
def write_markdown_file(mdown_text, file_path):
    with open(file_path, 'w') as f:
        f.write(mdown_text)
    print(f"Markdown file saved to {file_path}")

In [58]:
write_markdown_file(mdown_text, os.path.join(DOCS_PATH, DOC_NAME))

Markdown file saved to ../docs/Library/LIBRARY_ALBUMS.md
