In [None]:
import re

def parse_git_log(file_path):
    """Split a formatted git log into commit records and per-file change records.

    Two kinds of lines are recognised (after stripping surrounding whitespace):

    * commit headers: ``[<hex id>] <author> <YYYY-MM-DD HH:MM:SS +ZZZZ> <message>``
    * file statistics: ``<added> <deleted> <path>`` where both counts are
      decimal integers.

    Every other line is ignored, as are file-statistic lines that appear
    before the first commit header.

    Args:
        file_path: Path of the log file; it is decoded as latin-1.

    Returns:
        A ``(commits, files)`` tuple. ``commits`` is a list of dicts with the
        keys ``commit_id``, ``author``, ``date`` and ``message`` (all strings).
        ``files`` is a list of dicts with the keys ``commit_id`` (id of the
        most recent commit header), ``added`` and ``deleted`` (ints) and
        ``filepath`` (string).
    """
    header_re = re.compile(r'^\[([a-f0-9]+)\] (.+) (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} [+-]\d{4}) (.+)$')
    numstat_re = re.compile(r'^(\d+)\s+(\d+)\s+(.+)$')
    commit_keys = ('commit_id', 'author', 'date', 'message')

    commits, files = [], []
    active_commit_id = None  # id of the commit the following file lines belong to

    with open(file_path, 'r', encoding='latin-1') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            header = header_re.match(text)
            if header is not None:
                record = dict(zip(commit_keys, header.groups()))
                active_commit_id = record['commit_id']
                commits.append(record)
                continue

            # File statistics are only meaningful once a commit has been seen.
            if active_commit_id is None:
                continue

            stat = numstat_re.match(text)
            if stat is None:
                continue

            added, deleted, filepath = stat.groups()
            files.append({
                'commit_id': active_commit_id,
                'added': int(added),
                'deleted': int(deleted),
                'filepath': filepath
            })

    return commits, files

In [None]:
import os
import pandas as pd
import fasttext

# fastText model shared by process_log_files() to label commit messages.
# The path is relative, so it is resolved against the current working directory.
model = fasttext.load_model("classification/fasttext.model")

def process_log_files(task_id: str):
    """Parse, classify and export the git log that belongs to one task.

    Reads ``logs/{task_id}_logfile.log`` through ``parse_git_log``, labels each
    commit message with the module-level fastText ``model``, converts the
    commit dates to UTC timestamps, sorts the commits by date and writes:

    * ``commits/{task_id}-commits.csv`` with the columns ``commit_id``,
      ``author``, ``date``, ``message`` and ``change_type``;
    * ``commits/{task_id}-files.csv`` with the columns ``commit_id``,
      ``added``, ``deleted``, ``filepath`` and ``filename``.

    If the log file does not exist, nothing is read or written. A log without
    any recognised commit or file line produces header-only CSV files instead
    of raising ``KeyError``.

    Args:
        task_id: Identifier used to build the input and output file names.

    Returns:
        None.
    """
    log_path = f"logs/{task_id}_logfile.log"
    if not os.path.exists(log_path):
        return

    commits, files = parse_git_log(log_path)

    # Explicit column names keep both frames well-formed when the parser
    # returns empty lists; a column-less frame would fail on the lookups below.
    commits_df = pd.DataFrame(commits, columns=['commit_id', 'author', 'date', 'message'])
    files_df = pd.DataFrame(files, columns=['commit_id', 'added', 'deleted', 'filepath'])

    def classify_change(message):
        # predict() is called with a one-element list; keep the first label
        # reported for that single input.
        return model.predict([message])[0][0][0]

    commits_df['change_type'] = commits_df['message'].apply(classify_change)
    commits_df['date'] = pd.to_datetime(commits_df['date'], format='%Y-%m-%d %H:%M:%S %z', utc=True)

    commits_df = commits_df.sort_values(by='date')

    # NOTE(review): 'filename' is an exact copy of 'filepath' (no basename
    # extraction) - confirm that downstream consumers expect the full path.
    files_df['filename'] = files_df['filepath']

    # Make sure the output directory exists before writing into it.
    os.makedirs("commits", exist_ok=True)
    commits_df.to_csv(f"commits/{task_id}-commits.csv")
    files_df.to_csv(f"commits/{task_id}-files.csv")


In [None]:
# Discover task ids from the "<task_id>_all_files.csv" entries of the log
# directory and process the git log of every task found.
# NOTE(review): discovery looks for "*_all_files.csv" while process_log_files()
# reads "logs/<task_id>_logfile.log" from a hard-coded directory - confirm that
# both files are expected to live side by side in `log_dir`.
log_dir = "logs/"
task_suffix = "_all_files.csv"

# sorted() gives a reproducible processing order; os.listdir() order is arbitrary.
for filename in sorted(os.listdir(log_dir)):
    if filename.endswith(task_suffix):
        # Remove only the trailing suffix; str.replace() would also delete the
        # marker if it happened to occur earlier in the file name.
        task_id = filename[:-len(task_suffix)]
        process_log_files(task_id)