In [1]:
import pandas as pd
import json

# Flatten the JSON commit history stored per model in HF_topmodels.csv into
# one row per commit and save the result as commit_data.csv.
df = pd.read_csv('HF_topmodels.csv')

# Fixing the column list keeps the frame's schema stable even when no commit
# could be parsed (an empty record list would otherwise give a frame with no
# columns at all).
commit_columns = ['model_id', 'commit_id', 'authors', 'created_at', 'title', 'message']

commit_data = []

# itertuples yields plain (index, modelId, commits) tuples and avoids building
# a Series for every row, which iterrows does.
for index, model_id, commits_column in df[['modelId', 'commits']].itertuples(name=None):
    # A missing cell is read back as NaN (a float). json.loads raises
    # TypeError on it, not JSONDecodeError, so it must be skipped explicitly.
    if not isinstance(commits_column, str):
        print(f"No commit JSON in row {index}")
        continue

    try:
        # Parse the JSON string into Python objects (list of dictionaries)
        commits = json.loads(commits_column)
    except json.JSONDecodeError:
        print(f"Error decoding JSON in row {index}")
        continue

    # Valid JSON that is not a list (e.g. null or a single object) cannot be
    # iterated as a sequence of commit dictionaries.
    if not isinstance(commits, list):
        print(f"Unexpected commit payload in row {index}")
        continue

    for commit in commits:
        if not isinstance(commit, dict):
            continue

        # "authors" may be absent or explicitly null; treat both as no authors.
        authors = commit.get("authors") or []

        commit_data.append({
            'model_id': model_id,
            'commit_id': commit.get("commit_id", ""),
            # Combine multiple authors into one ", "-separated string.
            'authors': ", ".join(authors),
            'created_at': commit.get("created_at", ""),
            'title': commit.get("title", ""),
            'message': commit.get("message", ""),
        })

commit_df = pd.DataFrame(commit_data, columns=commit_columns)

commit_df.to_csv('commit_data.csv', index=False)

Error decoding JSON in row 104
Error decoding JSON in row 245
Error decoding JSON in row 249
Error decoding JSON in row 302


In [2]:
# Show the per-commit table (one row per commit, keyed by model_id).
commit_df

Unnamed: 0,model_id,commit_id,authors,created_at,title,message
0,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,b26fd1c40e88678859048898218ea4edcc24c84a,1-800-BAD-CODE,2023-03-19T21:35:48.0000Z,Update README.md,
1,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,d720825407920f43cf81a8b0de81069bed40dee9,1-800-BAD-CODE,2023-03-19T21:33:22.0000Z,Create requirements.txt,
2,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,e7e3b60c7db202ee71e645d842b7a37084c6d569,1-800-BAD-CODE,2023-03-19T21:33:09.0000Z,Create pipeline.py,
3,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,044ffe8f3dc2b8d1b45e627a86adef88b0027196,1-800-BAD-CODE,2023-03-19T21:32:36.0000Z,Update README.md,
4,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,ef324ed1c83e072ff3a86f84b417135f60b94fb0,1-800-BAD-CODE,2023-03-19T18:04:17.0000Z,Delete handler.py,
...,...,...,...,...,...,...
8037,zer0int/CLIP-GmP-ViT-L-14,1458e46823fc8db6bdf5bf64df7d95c7cb5595a9,zer0int,2024-06-16T20:59:37.0000Z,Added .safetensors and full model object pickle,"\n\nThe full model object has ""OpenAI/CLIP ori..."
8038,zer0int/CLIP-GmP-ViT-L-14,0ce05f9a0a96125396495a940c5acaac573c9223,zer0int,2024-06-16T13:42:28.0000Z,Update README.md,
8039,zer0int/CLIP-GmP-ViT-L-14,0a9e2b51ec3743cb977680225e55f1a708ed8c92,zer0int,2024-06-15T12:08:57.0000Z,Upload ViT-L-14-GmP-ft-state_dict.pt,
8040,zer0int/CLIP-GmP-ViT-L-14,31d01a8d025790b723db2ea3cee2ff007d4c3583,zer0int,2024-06-15T11:49:18.0000Z,Update README.md,


In [3]:
# Count how many commits each author made to each model and save the table
# as author_commit_count_by_model.csv.

def _as_author_list(value):
    """Return the authors of one commit as a list.

    A ", "-separated string is split into its parts. A value that is already
    a list is returned unchanged, so re-running this cell does not wipe the
    column. Anything else yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return value.split(', ')
    return []


# Keep only commits that have an author. The explicit copy makes the column
# assignment below write to an independent frame instead of a slice of the
# previous one (which triggers SettingWithCopyWarning).
has_author = commit_df['authors'].notnull() & (commit_df['authors'] != '')
commit_df = commit_df[has_author].copy()
commit_df['authors'] = commit_df['authors'].apply(_as_author_list)

# Explode the DataFrame so that each author has its own row (if there are multiple authors for a commit)
commit_df_exploded = commit_df.explode('authors')

author_commit_count_df = (
    commit_df_exploded
    .groupby(['model_id', 'authors'])
    .size()
    .reset_index(name='commit_count')
)
author_commit_count_df.to_csv('author_commit_count_by_model.csv', index=False)

author_commit_count_df

Unnamed: 0,model_id,authors,commit_count
0,1-800-BAD-CODE/punctuation_fullstop_truecase_e...,1-800-BAD-CODE,34
1,AdamCodd/vit-base-nsfw-detector,Adam,1
2,AdamCodd/vit-base-nsfw-detector,AdamCodd,23
3,Alibaba-NLP/gte-large-en-v1.5,Xenova,2
4,Alibaba-NLP/gte-large-en-v1.5,izhx,11
...,...,...,...
1144,yiyanghkust/finbert-tone,yiyanghkust,18
1145,yosuke/bert-base-japanese-char,Julien Chaumond,1
1146,yosuke/bert-base-japanese-char,patrickvonplaten,2
1147,yosuke/bert-base-japanese-char,system,13
