In [1]:
import os
import json
import pyperclip
import pandas as pd



In [3]:
DATA_PATH = os.path.join('../Apple Music Activity')
FILE_NAME = 'Apple Music Library Activity.json'
DOCS_PATH = os.path.join('../docs/Library')
DOC_NAME = 'LIBRARY_ACTIVITY.md'

In [4]:
with open(os.path.join(DATA_PATH, FILE_NAME), 'r') as f:
    library = json.load(f)

In [5]:
len(library), type(library)

(6072, list)

In [6]:
library[0].keys()

dict_keys(['Transaction Type', 'Transaction Identifier', 'Transaction Date', 'UserAgent', 'Subscription Start Date', 'Subscription Type'])

In [7]:
keys_set = set()
for item in library:
    keys_set.add(len(item.keys()))
keys_set

{5, 6, 7, 8, 9}

In [8]:
types_set = set()
list_type_keyset = set()
for item in library:
    for key in item.keys():
        types_set.add(type(item[key]))
        if type(item[key]) == list:
            list_type_keyset.add(key)
        
types_set, list_type_keyset

({dict, int, list, str},
 {'Artist Catalog Identifier',
  'Artists',
  'Modified Container Identifiers',
  'Playlist Identifiers Added From Profile',
  'Playlist Identifiers Removed From Profile',
  'Playlist Identifiers Set From Profile',
  'Playlist Item Identifiers Appended',
  'Playlists',
  'Track Identifiers',
  'Tracks'})

In [9]:
df = pd.DataFrame(library)

In [10]:
df.to_csv('LibraryActivity.csv', index=False)

In [11]:
df['Transaction Type'].value_counts()

Transaction Type
updateItems                       2358
addItems                          1835
deleteItems                        678
appendContainerItems               486
updateContainer                    368
updateUser                         108
subscribeToPlaylist                 51
updateArtistAdamIdsLikedStatus      50
addContainer                        49
updateArtists                       34
deleteContainer                     24
updateFavoriteArtistAdamIds         13
addContainers                        5
setProfileContainerIds               4
optInUser                            3
backfillAlbumArtists                 2
optOutUser                           2
startPlaylistCollaboration           2
Name: count, dtype: int64

In [12]:
update_items_df = df[df['Transaction Type'] == 'updateItems']

In [13]:
update_items_df.shape

(2358, 29)

In [14]:
update_items_df.head()

Unnamed: 0,Transaction Type,Transaction Identifier,Transaction Date,UserAgent,Subscription Start Date,Subscription Type,User,Country,Language,Tracks,...,Playlist Identifiers Removed From Profile,Playlist Identifiers Set From Profile,Playlist Identifiers Added From Profile,Artist Catalog Identifier,Favorite Type,Liked Status,Artists,Collaboration Identifier,Source Playlist Identifier,Collaboration Playlist Identifier
7,updateItems,10000007,2021-03-01T07:34:49Z,"itunescloudd/1.0 iOS/14.4 model/iPhone9,3 hwp/...",,,,USA,en-us,"[{'Track Identifier': 182857262, 'Last Played ...",...,,,,,,,,,,
23,updateItems,10000023,2021-03-07T00:44:13Z,"itunescloudd/1.0 iOS/14.4 model/iPhone9,3 hwp/...",,,,USA,en-us,"[{'Track Identifier': 182857518, 'Date of Last...",...,,,,,,,,,,
24,updateItems,10000024,2021-03-08T07:26:18Z,"itunescloudd/1.0 iOS/14.4 model/iPhone9,3 hwp/...",,,,USA,en-us,"[{'Track Identifier': 182857730, 'Last Played ...",...,,,,,,,,,,
39,updateItems,10000039,2021-03-09T12:09:14Z,"itunescloudd/1.0 iOS/14.4 model/iPhone9,3 hwp/...",,,,USA,en-us,"[{'Track Identifier': 182857774, 'Last Played ...",...,,,,,,,,,,
40,updateItems,10000040,2021-03-10T02:40:45Z,"itunescloudd/1.0 iOS/14.4 model/iPhone9,3 hwp/...",,,,USA,en-us,"[{'Track Identifier': 182857774, 'Last Played ...",...,,,,,,,,,,


In [32]:
for key in list_type_keyset:
    print (key)
    for i, row in df.iterrows():
        if (type(row[key]) == list) or (type(row[key]) == pd.Series):
            print(key, row[key])
            print ("")
            break

Playlist Identifiers Added From Profile
Playlist Identifiers Added From Profile [256001537, 256000521]

Playlist Item Identifiers Appended
Playlist Item Identifiers Appended [182860082]

Playlist Identifiers Removed From Profile
Playlist Identifiers Removed From Profile [256001293, 256001041, 256000521]

Artist Catalog Identifier
Artist Catalog Identifier ['430932944']

Modified Container Identifiers
Modified Container Identifiers [256001041]

Artists
Artists [{'Artist Identifier': 'r.ibIa6Y5', 'Is Favorite': True}]

Playlist Identifiers Set From Profile
Playlist Identifiers Set From Profile [256002053, 256002049, 256001809, 256001801, 256001797, 256001793, 256001553, 256001537, 256000521]

Tracks
Tracks [{'Content Type': 'Song', 'Track Identifier': 182857262, 'Date Added To Library': '2021-02-28T19:34:08Z', 'Date Added To iCloud Music Library': '2021-02-28T19:34:08Z', 'Last Modified Date': '2021-02-28T19:34:08Z', 'Is Checked': False, 'Apple Music Track Identifier': 1137641426}]

Track

In [34]:
df_non_nans = df.isna().sum(axis=1)
min_len = df_non_nans.min()
min_len_idx = df_non_nans.idxmin()

In [36]:
mdown_text = f"""
# Library Activity Data Definition

This data dictionary describes the fields in the {FILE_NAME} data. There are {df.shape[0]} rows and {df.shape[1]} columns. 

## Library Activity Data

"""

In [37]:
def generate_markdown_table(df, min_index):
    # Helper function to map data types to more readable formats
    def map_data_type(dtype, sample_value):
        # Check for datetime
        if pd.api.types.is_datetime64_any_dtype(dtype) or (
            isinstance(sample_value, str) and pd.to_datetime(sample_value, errors='coerce') is not pd.NaT
        ):
            return "Datetime"
        # Check for Boolean (Python Boolean or string representation)
        elif isinstance(sample_value, bool) or (isinstance(sample_value, str) and sample_value.lower() in ["true", "false"]):
            return "Boolean"
        elif pd.api.types.is_string_dtype(dtype):
            return "String"
        elif pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_integer_dtype(dtype):
                return "Integer"
            elif pd.api.types.is_float_dtype(dtype):
                return "Float"
        elif isinstance(sample_value, list):
            return "List"
        else:
            return str(dtype)  # Fallback to original dtype if no match

    # Generate the table header
    markdown = "| Column Name | Data Type | Description | Example Value |\n"
    markdown += "|-------------|-----------|-------------|---------------|\n"

    # Iterate over DataFrame columns to generate each row
    for column in df.columns:
        sample_value = df[column].iloc[min_len_idx] if not df[column].empty else "N/A"
        data_type = map_data_type(df[column].dtype, sample_value)
        # Convert example_value to string to avoid formatting issues
        example_value = str(sample_value).replace("\n", " ").replace("|", "\\|")
        # Create a new row for each column
        markdown += f"| `{column}` | {data_type} |  | {example_value} |\n"

    return markdown

mdown_text += generate_markdown_table(df, min_len_idx)



In [38]:
def write_markdown_file(mdown_text, file_path):
    with open(file_path, 'w') as f:
        f.write(mdown_text)
    print(f"Markdown file saved to {file_path}")

In [39]:
write_markdown_file(mdown_text, os.path.join(DOCS_PATH, DOC_NAME))

Markdown file saved to ../docs/Library/LIBRARY_ACTIVITY.md


In [42]:
transaction_type_count = df['Transaction Type'].value_counts()

In [43]:
def convert_to_markdown_table(df):
    markdown = "|"
    for col in df.columns:
        markdown += f" {col} |"
    markdown += "\n|"
    for col in df.columns:
        markdown += " --- |"
    markdown += "\n"
    for i, row in df.iterrows():
        markdown += "|"
        for col in df.columns:
            markdown += f" {row[col]} |"
        markdown += "\n"
    return markdown

In [44]:
convert_to_markdown_table(transaction_type_count)

AttributeError: 'Series' object has no attribute 'columns'

In [46]:
def generate_markdown_table(data):
    # Check if the input is a Series or a DataFrame
    if isinstance(data, pd.Series):
        markdown = "| Value | Count |\n"
        markdown += "|-------|-------|\n"
        for index, value in data.items():
            # Convert index and value to strings, replacing problematic characters for Markdown
            index_str = str(index).replace("|", "\\|")
            value_str = str(value).replace("|", "\\|")
            markdown += f"| {index_str} | {value_str} |\n"
        return markdown

    elif isinstance(data, pd.DataFrame):
        # Helper function to map data types to more readable formats
        def map_data_type(dtype, sample_value):
            # Check for datetime
            if pd.api.types.is_datetime64_any_dtype(dtype) or (
                isinstance(sample_value, str) and pd.to_datetime(sample_value, errors='coerce') is not pd.NaT
            ):
                return "Datetime"
            # Check for Boolean (Python Boolean or string representation)
            elif isinstance(sample_value, bool) or (isinstance(sample_value, str) and sample_value.lower() in ["true", "false"]):
                return "Boolean"
            elif pd.api.types.is_string_dtype(dtype):
                return "String"
            elif pd.api.types.is_numeric_dtype(dtype):
                if pd.api.types.is_integer_dtype(dtype):
                    return "Integer"
                elif pd.api.types.is_float_dtype(dtype):
                    return "Float"
            elif (isinstance(sample_value, list)) or type(sample_value) == pd.Series:
                return "List"
            else:
                return str(dtype)  # Fallback to original dtype if no match

        # Generate the table header for DataFrame
        markdown = "| Column Name | Data Type | Description | Example Value |\n"
        markdown += "|-------------|-----------|-------------|---------------|\n"

        # Iterate over DataFrame columns to generate each row
        for column in data.columns:
            sample_value = data[column].iloc[0] if not data[column].empty else "N/A"
            data_type = map_data_type(data[column].dtype, sample_value)
            # Convert example_value to string to avoid formatting issues
            example_value = str(sample_value).replace("\n", " ").replace("|", "\\|")
            # Create a new row for each column
            markdown += f"| `{column}` | {data_type} |  | {example_value} |\n"

        return markdown

    else:
        raise TypeError("Input must be a pandas DataFrame or Series")

In [48]:
transaction_type_count_mkdown = generate_markdown_table(transaction_type_count)
pyperclip.copy(transaction_type_count_mkdown)
