In [13]:
import pandas as pd
import json

# Flatten the JSON-encoded 'commits' column of the model table into one row per
# commit and save the result as a CSV file.

# Fixed column order, so the resulting frame has these columns even when no
# commit could be extracted (pd.DataFrame([]) would otherwise have none).
COMMIT_COLUMNS = ['model_id', 'commit_id', 'authors', 'created_at', 'title', 'message']

df = pd.read_csv('HF_image-classification_models.csv')

commit_data = []

# Walk only the two columns that are needed instead of materialising a Series
# per row with iterrows().
for index, model_id, commits_column in zip(df.index, df['modelId'], df['commits']):
    # Guard the decode step only, so unrelated errors further down are not
    # misreported as JSON problems.
    try:
        # Parse the JSON string into Python objects (list of dictionaries)
        commits = json.loads(commits_column)
    except (TypeError, json.JSONDecodeError):
        # json.loads raises TypeError (not JSONDecodeError) when the cell is
        # not text at all, e.g. NaN produced by an empty CSV field.
        print(f"Error decoding JSON in row {index}")
        continue

    if not isinstance(commits, list):
        # Valid JSON, but not the expected list of commit objects.
        print(f"Expected a JSON list of commits in row {index}")
        continue

    for commit in commits:
        # 'authors' may be missing or null; a bare string is treated as a
        # single author rather than being joined character by character.
        authors = commit.get("authors") or []
        if isinstance(authors, str):
            authors = [authors]

        commit_data.append({
            'model_id': model_id,
            'commit_id': commit.get("commit_id", ""),
            # Combine multiple authors into one ", "-separated string
            'authors': ", ".join(str(author) for author in authors),
            'created_at': commit.get("created_at", ""),
            'title': commit.get("title", ""),
            'message': commit.get("message", ""),
        })

commit_df = pd.DataFrame(commit_data, columns=COMMIT_COLUMNS)

commit_df.to_csv('commit_data.csv', index=False)

Error decoding JSON in row 0
Error decoding JSON in row 9
Error decoding JSON in row 18


In [14]:
# Display the per-commit DataFrame as the cell's rich output.
commit_df

Unnamed: 0,model_id,commit_id,authors,created_at,title,message
0,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,"tomaarsen, Xenova",2024-05-29T14:43:28.0000Z,Upload ONNX weights exported via optimum with ...,\n\n\n- Upload ONNX weights exported via optim...
1,sentence-transformers/all-MiniLM-L6-v2,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,"tomaarsen, Arsive",2024-03-27T10:43:07.0000Z,spelling corrections (#13),\n\n\n- spelling corrections (2a7cca6ad7605f31...
2,sentence-transformers/all-MiniLM-L6-v2,46605decb5369335a3847c9f41bb0b896c07dd1a,"tomaarsen, SFconvertbot",2024-03-27T10:41:11.0000Z,Adding `safetensors` variant of this model (#53),\n\n\n- Adding `safetensors` variant of this m...
3,sentence-transformers/all-MiniLM-L6-v2,44eb4044493a3c34bc6d7faae1a71ec76665ebc6,"tomaarsen, joelwigton",2024-02-15T09:59:44.0000Z,Several small spelling errors in README (#48),\n\n\n- Several small spelling errors in READM...
4,sentence-transformers/all-MiniLM-L6-v2,1a310852cf8e58d22c5ebff537711d504ad4ad66,tomaarsen,2024-02-12T11:16:40.0000Z,Update Sentence Transformers metadata (#46),\n\n\n- Update Sentence Transformers metadata ...
...,...,...,...,...,...,...
673,nesaorg/benchmark_v0,cd92f1cc1545dbc462217b595da096d83c48c98b,jamesdslab2,2024-08-14T02:45:00.0000Z,Upload model.safetensors,
674,nesaorg/benchmark_v0,ab0374aefaba11bd56b2af63567dbd84d4ae8bb2,jamesdslab2,2024-08-14T02:28:03.0000Z,Upload model.safetensors,
675,nesaorg/benchmark_v0,a045e5b7fd67756a396ca9beccdb5e8408d1ece1,jamesdslab2,2024-08-14T02:14:26.0000Z,Upload model.safetensors,
676,nesaorg/benchmark_v0,48ad3e2aa086a7a95fef4eb88f9b1537cc11eb13,jamesdslab2,2024-08-13T21:51:14.0000Z,Upload 3 files,


In [15]:
# Count commits per (model, author) and save the counts as a CSV file.

def _author_list(value):
    """Return the authors of one commit as a list of names.

    Accepts the ", "-joined string stored in the 'authors' column as well as
    an already-split list, so this cell gives the same result when it is
    re-run on a `commit_df` it has already processed. Anything else
    (e.g. NaN) yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        # Drop empty pieces so a blank author never becomes its own group.
        return [name for name in value.split(', ') if name]
    return []


author_lists = commit_df['authors'].map(_author_list)
has_authors = author_lists.map(len) > 0

# Build the filtered frame in one expression instead of assigning a column on a
# boolean-filtered slice, which triggers pandas' SettingWithCopyWarning.
commit_df = commit_df.assign(authors=author_lists).loc[has_authors]

# Explode the DataFrame so that each author has its own row (if there are multiple authors for a commit)
commit_df_exploded = commit_df.explode('authors')

author_commit_count_df = (
    commit_df_exploded
    .groupby(['model_id', 'authors'])
    .size()
    .reset_index(name='commit_count')
)
author_commit_count_df.to_csv('author_commit_count_by_model.csv', index=False)

author_commit_count_df


Unnamed: 0,model_id,authors,commit_count
0,Alibaba-NLP/gte-large-en-v1.5,Xenova,2
1,Alibaba-NLP/gte-large-en-v1.5,izhx,11
2,Alibaba-NLP/gte-large-en-v1.5,thenlper,3
3,Alibaba-NLP/gte-large-en-v1.5,tomaarsen,1
4,BAAI/bge-base-en-v1.5,Shitao,6
...,...,...,...
207,timm/resnet50.a1_in1k,rwightman,3
208,trl-internal-testing/dummy-GPT2-correct-vocab,Younes Belkada,1
209,trl-internal-testing/dummy-GPT2-correct-vocab,qgallouedec,22
210,trl-internal-testing/dummy-GPT2-correct-vocab,ybelkada,2
