In [2]:
import os
import json
import pyperclip
import pandas as pd

In [3]:
DATA_PATH = os.path.join('../Apple Music Activity')
FILE_NAME = 'Apple Music Library Artists.json'
DOCS_PATH = os.path.join('../docs/Library')
DOC_NAME = 'LIBRARY_ARTISTS.md'

In [4]:
with open(os.path.join(DATA_PATH, FILE_NAME), 'r') as f:
    library = json.load(f)

In [5]:
len(library), type(library)

(1015, list)

In [6]:
library[0].keys()

dict_keys(['Artist Identifier', 'Artist Name', 'Date Created', 'Date Created In Library', 'Visible', 'Catalog Identifiers - Artist', 'Favorite Artist - Status', 'Favorite Artist - Date'])

In [7]:
keys_set = set()
for item in library:
    keys_set.add(len(item.keys()))
keys_set

{5, 6, 8}

In [8]:
types_set = set()
for item in library:
    for key in item.keys():
        types_set.add(type(item[key]))
        
types_set

{bool, str}

In [9]:
df = pd.DataFrame(library)

In [10]:
df.shape

(1015, 8)

In [11]:
df.columns

Index(['Artist Identifier', 'Artist Name', 'Date Created',
       'Date Created In Library', 'Visible', 'Catalog Identifiers - Artist',
       'Favorite Artist - Status', 'Favorite Artist - Date'],
      dtype='object')

In [12]:
df['Artist Identifier'].nunique()

1015

In [13]:
df['Artist Identifier'].value_counts()

Artist Identifier
r.002WMz7    1
r.eIcke09    1
r.doMr4y1    1
r.dt7GPUw    1
r.duQAwL3    1
            ..
r.JXnAsp6    1
r.JZDeyoj    1
r.JZOsyUb    1
r.JlKItiw    1
r.zwOlBFp    1
Name: count, Length: 1015, dtype: int64

In [14]:
df['Artist Identifier'].isna().sum()

0

In [15]:
df['Artist Name'].nunique()

1015

In [16]:
df['Artist Name'].value_counts()

Artist Name
Ed Sheeran                      1
Bruno Mars                      1
Flowdan, Lil Baby & Skrillex    1
DNMO, Wolfy Lights & Blooom     1
KR$NA & French The Kid          1
                               ..
Joyner Lucas & Lil Baby         1
Major Lazer                     1
Imagine Dragons                 1
Skrillex                        1
Alesso & DubVision              1
Name: count, Length: 1015, dtype: int64

In [17]:
df['Artist Name'].isna().sum()

0

In [18]:
df['Date Created'].nunique()

866

In [19]:
df['Date Created'].value_counts()

Date Created
2023-02-15T08:07:07Z    31
2024-07-22T02:18:17Z    29
2024-07-22T02:18:20Z    26
2021-06-05T17:48:05Z    22
2022-04-28T00:59:21Z    12
                        ..
2021-03-05T20:18:35Z     1
2022-10-26T10:09:12Z     1
2023-01-14T06:21:05Z     1
2021-03-27T21:11:32Z     1
2022-11-01T21:23:33Z     1
Name: count, Length: 866, dtype: int64

In [20]:
df['Date Created'].isna().sum()

0

In [21]:
df['Date Created In Library'].nunique()

852

In [22]:
df['Date Created In Library'].value_counts()

Date Created In Library
2023-02-15T08:07:07Z    31
2024-07-22T02:18:17Z    29
2021-06-05T17:48:05Z    27
2024-07-22T02:18:20Z    26
2022-04-28T01:04:28Z    12
                        ..
2023-10-04T23:30:45Z     1
2022-05-04T22:19:17Z     1
2021-03-05T20:18:35Z     1
2022-10-26T10:09:12Z     1
2022-11-01T21:23:33Z     1
Name: count, Length: 852, dtype: int64

In [23]:
df['Date Created In Library'].isna().sum()

0

In [25]:
df['Favorite Artist - Date'].nunique()

168

In [27]:
df['Favorite Artist - Date'].value_counts()

Favorite Artist - Date
2022-10-26T10:00:03Z    1
2022-10-26T10:03:17Z    1
2022-10-26T10:00:19Z    1
2023-04-13T19:39:35Z    1
2022-10-26T09:55:01Z    1
                       ..
2022-10-26T10:04:00Z    1
2024-06-22T23:46:32Z    1
2022-10-26T10:02:00Z    1
2022-12-07T07:39:33Z    1
2024-06-05T03:51:12Z    1
Name: count, Length: 168, dtype: int64

In [28]:
df['Favorite Artist - Date'].isna().sum()

847

In [30]:
df[~df['Favorite Artist - Date'].isna()]['Artist Name']

0          Ed Sheeran
4            Jamie xx
8       Ramin Djawadi
22           Dua Lipa
25       Lost Stories
            ...      
982            Offset
988             ISOxo
994             Drake
996     Flosstradamus
1005              PFV
Name: Artist Name, Length: 168, dtype: object

In [31]:
df['Favorite Artist - Status'].nunique()

1

In [32]:
df['Favorite Artist - Status'].value_counts()

Favorite Artist - Status
True    168
Name: count, dtype: int64

In [34]:
df[~df['Favorite Artist - Status'].isna()]['Artist Name']

0          Ed Sheeran
4            Jamie xx
8       Ramin Djawadi
22           Dua Lipa
25       Lost Stories
            ...      
982            Offset
988             ISOxo
994             Drake
996     Flosstradamus
1005              PFV
Name: Artist Name, Length: 168, dtype: object

In [35]:
df['Catalog Identifiers - Artist'].nunique()

690

In [36]:
df['Catalog Identifiers - Artist'].value_counts()

Catalog Identifiers - Artist
1              25
827795989       9
1455262408      8
1484701109      7
430932944       7
               ..
358714030       1
1,356545647     1
444520760       1
944171951       1
1356377051      1
Name: count, Length: 690, dtype: int64

In [38]:
df[df['Catalog Identifiers - Artist'] == 1]['Artist Name']

Series([], Name: Artist Name, dtype: object)

In [39]:
df_non_nans = df.isna().sum(axis=1)
min_len = df_non_nans.min()
min_len_idx = df_non_nans.idxmin()

In [40]:
mdown_text = f"""
# Library Artists Data Definition

This data dictionary describes the fields in the {FILE_NAME} data. There are {df.shape[0]} rows and {df.shape[1]} columns. 

## Library Artists Data

"""

In [41]:
def generate_markdown_table(df, min_index):
    # Helper function to map data types to more readable formats
    def map_data_type(dtype, sample_value):
        # Check for datetime
        if pd.api.types.is_datetime64_any_dtype(dtype) or (
            isinstance(sample_value, str) and pd.to_datetime(sample_value, errors='coerce') is not pd.NaT
        ):
            return "Datetime"
        # Check for Boolean (Python Boolean or string representation)
        elif isinstance(sample_value, bool) or (isinstance(sample_value, str) and sample_value.lower() in ["true", "false"]):
            return "Boolean"
        elif pd.api.types.is_string_dtype(dtype):
            return "String"
        elif pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_integer_dtype(dtype):
                return "Integer"
            elif pd.api.types.is_float_dtype(dtype):
                return "Float"
        elif isinstance(sample_value, list):
            return "List"
        else:
            return str(dtype)  # Fallback to original dtype if no match

    # Generate the table header
    markdown = "| Column Name | Data Type | Description | Example Value |\n"
    markdown += "|-------------|-----------|-------------|---------------|\n"

    # Iterate over DataFrame columns to generate each row
    for column in df.columns:
        sample_value = df[column].iloc[min_len_idx] if not df[column].empty else "N/A"
        data_type = map_data_type(df[column].dtype, sample_value)
        # Convert example_value to string to avoid formatting issues
        example_value = str(sample_value).replace("\n", " ").replace("|", "\\|")
        # Create a new row for each column
        markdown += f"| `{column}` | {data_type} |  | {example_value} |\n"

    return markdown

mdown_text += generate_markdown_table(df, min_len_idx)



In [42]:
def write_markdown_file(mdown_text, file_path):
    with open(file_path, 'w') as f:
        f.write(mdown_text)
    print(f"Markdown file saved to {file_path}")

In [43]:
write_markdown_file(mdown_text, os.path.join(DOCS_PATH, DOC_NAME))

Markdown file saved to ../docs/Library/LIBRARY_ARTISTS.md
