# In [6]:
import json
import bz2
from collections import defaultdict, Counter
import re
import csv

def extract_ngrams(message, n):
    """Return the word n-grams of *message* as space-joined strings.

    The message is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remainder is split on
    whitespace. Each run of ``n`` consecutive words becomes one entry in
    the returned list, in order of appearance. A message with fewer than
    ``n`` words yields an empty list.
    """
    lowered = message.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = without_punctuation.split()

    # One window starts at every position that still has n tokens after it.
    window_count = len(tokens) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

def process_file(filename, n=3, top_k=5):
    """Collect the most frequent commit-message n-grams for each author.

    Reads a JSON Lines file (bz2-compressed when the name ends in
    ``.bz2``), inspects every record whose ``type`` is ``PushEvent`` and
    counts the n-grams of each commit message under the commit author's
    name.

    Malformed input never aborts the run: blank lines are skipped, lines
    that are not valid JSON are reported and skipped, and records or
    commits that lack the expected fields are ignored.

    Args:
        filename: Path to the input file, one JSON document per line.
        n: Size of the n-grams to extract from each message.
        top_k: Number of most frequent n-grams to keep per author.

    Returns:
        A dict mapping each author name to a list of at most ``top_k``
        n-grams, most frequent first. Commits whose author has no name
        are grouped under the key ``None``.
    """
    # Count incrementally instead of storing every n-gram occurrence.
    ngram_counts = defaultdict(Counter)

    # Pick the opener from the file extension; both are read as UTF-8 text.
    opener = bz2.open if str(filename).endswith('.bz2') else open

    with opener(filename, mode='rt', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue

            # A line may hold valid JSON that is not an event object.
            if not isinstance(data, dict) or data.get('type') != 'PushEvent':
                continue

            payload = data.get('payload')
            commits = payload.get('commits') if isinstance(payload, dict) else None
            if not isinstance(commits, list):
                continue

            for commit in commits:
                if not isinstance(commit, dict):
                    continue
                author = commit.get('author')
                author_name = author.get('name') if isinstance(author, dict) else None
                message = commit.get('message')
                # A missing or non-text message contributes no n-grams, but
                # the author is still registered, as with an empty message.
                ngrams = extract_ngrams(message, n) if isinstance(message, str) else []
                ngram_counts[author_name].update(ngrams)

    return {
        author: [ngram for ngram, _ in counts.most_common(top_k)]
        for author, counts in ngram_counts.items()
    }


def write_results_to_csv(results, output_file):
    """Write each author's top 3-grams to a CSV file.

    The file starts with a header row followed by one row per author.
    Every row has exactly six columns: the author and five n-gram slots.
    Authors with fewer than five n-grams are padded with empty strings;
    any n-grams beyond the fifth are dropped so rows never exceed the
    header width.

    Args:
        results: Mapping of author name to a sequence of n-gram strings,
            ordered from most to least frequent.
        output_file: Path of the CSV file to create or overwrite.
    """
    headers = ['author', 'first 3-gram', 'second 3-gram', 'third 3-gram', 'fourth 3-gram', 'fifth 3-gram']
    slots = len(headers) - 1

    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        for author, ngrams in results.items():
            # list() accepts any sequence (e.g. a tuple); the slice enforces
            # the column limit that padding alone does not.
            kept = list(ngrams)[:slots]
            padding = [''] * (slots - len(kept))
            writer.writerow([author] + kept + padding)

# Build the per-author top 3-gram table from the GitHub events file, then
# write that table out as a CSV report.
results = process_file(filename='10K.github.jsonl')
write_results_to_csv(results=results, output_file='top_ngrams_per_author.csv')
