In [38]:
import pandas as pd
import pickle
import numpy as np
import os
import glob
from datetime import datetime, timedelta, timezone

def load_data(data_dir):
    """Load every ``*.pkl`` file in *data_dir* into one combined DataFrame.

    Each unpickled object is passed to ``pd.DataFrame`` and tagged with a
    ``repo`` column holding the file name without its ``.pkl`` extension.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.pkl`` files.

    Returns:
        The concatenation of all per-file frames with a fresh integer index.

    Raises:
        ValueError: If *data_dir* contains no ``*.pkl`` files.
    """
    # Sorted so the row order does not depend on filesystem listing order.
    all_files = sorted(glob.glob(os.path.join(data_dir, "*.pkl")))
    if not all_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .pkl files found in {data_dir!r}")

    df_list = []
    for filename in all_files:
        # SECURITY: pickle.load can execute arbitrary code; only point this
        # at files you produced yourself or otherwise trust.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        df = pd.DataFrame(data)
        # splitext drops only the extension, so names containing dots
        # (e.g. "vercel_next.js_stargazers.pkl") are kept intact.
        df['repo'] = os.path.splitext(os.path.basename(filename))[0]
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

def calculate_cumulative_stars(df):
    """Return a time-sorted copy of *df* with a per-repo running star count.

    Args:
        df: Frame with a ``repo`` column and a ``starred_at`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame sorted by ``starred_at`` (converted to UTC
        timestamps) with an added ``cumulative_stars`` column: the 1-based
        position of each row within its repo in that order. The input frame
        is left unmodified.
    """
    # assign() builds a new frame, so the caller's 'starred_at' column is
    # not overwritten as a side effect.
    df = df.assign(starred_at=pd.to_datetime(df['starred_at'], utc=True))
    # A stable sort keeps the input order for identical timestamps, which
    # makes the resulting counts reproducible.
    df = df.sort_values('starred_at', kind='mergesort')
    df['cumulative_stars'] = df.groupby('repo').cumcount() + 1
    return df

def find_milestone_date(df, milestone=1000):
    """Return, per repo, the ``starred_at`` of its first row at the milestone.

    Rows whose ``cumulative_stars`` is at least *milestone* are kept, and the
    first remaining ``starred_at`` of each repo (in the frame's row order) is
    returned as a Series indexed by ``repo``. Repos with no such row are
    absent from the result.
    """
    reached = df['cumulative_stars'] >= milestone
    at_or_past_milestone = df.loc[reached]
    per_repo_dates = at_or_past_milestone.groupby('repo')['starred_at']
    return per_repo_dates.first()

def calculate_growth_rate(df, milestone_date, end_date):
    """Compare the star count at *milestone_date* with the count at *end_date*.

    Each count is the largest ``cumulative_stars`` among rows whose
    ``starred_at`` is not later than the given date.

    Returns:
        A dict with ``start_stars``, ``end_stars`` and ``growth_rate``
        (``end_stars / start_stars``, or ``0`` when ``start_stars`` is not
        greater than zero).
    """
    star_counts = df['cumulative_stars']
    timestamps = df['starred_at']

    start_stars = star_counts[timestamps <= milestone_date].max()
    end_stars = star_counts[timestamps <= end_date].max()

    if start_stars > 0:
        growth_rate = end_stars / start_stars
    else:
        growth_rate = 0

    return dict(
        start_stars=start_stars,
        end_stars=end_stars,
        growth_rate=growth_rate,
    )

def get_quarter_dates(quarter, year):
    """Return the UTC start and end timestamps of a calendar quarter.

    Args:
        quarter: Quarter number as ``1``-``4``; ints and strings such as
            ``"1"``, ``"Q1"`` or ``" q1 "`` are accepted.
        year: Calendar year (anything that formats as a four-digit year).

    Returns:
        ``(start, end)`` as timezone-aware UTC ``pd.Timestamp`` objects.
        ``start`` is midnight on the first day of the quarter and ``end`` is
        the last microsecond of its final day, so ``ts <= end`` holds for
        every timestamp in the quarter.

    Raises:
        KeyError: If *quarter* does not identify one of the four quarters.
    """
    quarters = {
        '1': ("01-01", "03-31"),
        '2': ("04-01", "06-30"),
        '3': ("07-01", "09-30"),
        '4': ("10-01", "12-31"),
    }

    # Normalise so 1, "1", "Q1" and " q1 " all select the same quarter.
    key = str(quarter).strip().lower()
    if key.startswith('q'):
        key = key[1:]
    if key not in quarters:
        raise KeyError(f"Unknown quarter {quarter!r}; expected 1, 2, 3 or 4")

    first_day, last_day = quarters[key]
    start = pd.to_datetime(f"{year}-{first_day}").replace(tzinfo=timezone.utc)
    # End at the last instant of the final day; a bare date would parse as
    # 00:00 and silently drop everything that happened on that day.
    end = pd.to_datetime(f"{year}-{last_day} 23:59:59.999999").replace(
        tzinfo=timezone.utc
    )
    return (start, end)

def analyze_quarter(df, quarter, year):
    """Compute post-milestone star growth for repos up to a quarter's end.

    A repo is eligible when the date returned for it by
    ``find_milestone_date`` is not later than the end of the requested
    quarter. Growth is measured from that milestone date to whichever comes
    first: 90 days after the milestone, or the end of the quarter.

    Args:
        df: Frame as produced by ``calculate_cumulative_stars``.
        quarter: Quarter identifier forwarded to ``get_quarter_dates``.
        year: Year forwarded to ``get_quarter_dates``.

    Returns:
        A DataFrame with one row per eligible repo and the columns ``repo``,
        ``milestone_date``, ``start_date``, ``end_date``, ``start_stars``,
        ``end_stars`` and ``growth_rate``. The columns are present even when
        no repo is eligible.
    """
    result_columns = [
        'repo', 'milestone_date', 'start_date', 'end_date',
        'start_stars', 'end_stars', 'growth_rate',
    ]

    # NOTE(review): only the quarter's end bounds the selection; repos that
    # hit the milestone before the quarter started are included as well.
    # Confirm whether a lower bound at the quarter start was intended.
    _, quarter_end = get_quarter_dates(quarter, year)

    milestone_dates = find_milestone_date(df)
    eligible_repos = milestone_dates[milestone_dates <= quarter_end]

    # Group once instead of re-scanning the whole frame for every repo.
    rows_by_repo = df.groupby('repo')

    results = []
    for repo, milestone_date in eligible_repos.items():
        end_date = min(milestone_date + timedelta(days=90), quarter_end)

        repo_df = rows_by_repo.get_group(repo)
        growth_data = calculate_growth_rate(repo_df, milestone_date, end_date)

        results.append({
            'repo': repo,
            'milestone_date': milestone_date.tz_convert(timezone.utc),
            'start_date': milestone_date.tz_convert(timezone.utc),
            'end_date': end_date.tz_convert(timezone.utc),
            'start_stars': growth_data['start_stars'],
            'end_stars': growth_data['end_stars'],
            'growth_rate': growth_data['growth_rate']
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=result_columns)

def _format_cell(col, value):
    """Render one table cell: growth to 1 decimal, other floats to 2."""
    if col == 'growth_rate':
        return f"{value:.1f}"
    if isinstance(value, float):
        return f"{value:.2f}"
    return str(value)


def print_results_table(results):
    """Print *results* as a text table ranked by descending growth rate.

    Args:
        results: DataFrame with a ``growth_rate`` column; of the other known
            columns (``repo``, ``start_stars``, ``end_stars``,
            ``milestone_date``, ``end_date``) only those present are shown.
            The date columns must be datetime-like. The caller's frame is
            not modified.

    Prints a header, a dashed separator and one line per row to stdout, or a
    single notice when *results* has no rows.
    """
    # Define column headers and corresponding DataFrame column names
    columns = [
        ("Rank", "rank"),
        ("Repo", "repo"),
        ("Stars at start", "start_stars"),
        ("Stars at end", "end_stars"),
        ("Reached 1k stars", "milestone_date"),
        ("End date", "end_date"),
        ("Growth", "growth_rate")
    ]

    # An empty frame may have no columns at all (or object-typed date
    # columns), which would make the sort or the .dt accessor below raise.
    if results.empty:
        print("No results to display.")
        return

    # sort_values returns a new frame, so the additions below stay local.
    results = results.sort_values('growth_rate', ascending=False).reset_index(drop=True)
    results['rank'] = range(1, len(results) + 1)

    # Filter columns that exist in the DataFrame
    existing_columns = [(header, col) for header, col in columns if col in results.columns]

    # Format date columns
    for date_col in ['milestone_date', 'end_date']:
        if date_col in results.columns:
            results[date_col] = results[date_col].dt.strftime('%d %b %Y')

    # Format every cell first so widths are measured on the text that is
    # actually printed, not on the unformatted values.
    cells = {
        col: [_format_cell(col, value) for value in results[col].tolist()]
        for _, col in existing_columns
    }
    col_widths = [max(len(header), max(map(len, cells[col])))
                  for header, col in existing_columns]

    # Print table header
    header_row = " | ".join(
        f"{header:<{width}}"
        for (header, _), width in zip(existing_columns, col_widths)
    )
    print(header_row)
    print("-" * len(header_row))

    # Print table rows: transpose the per-column cell lists into rows.
    for row in zip(*(cells[col] for _, col in existing_columns)):
        print(" | ".join(f"{text:<{width}}" for text, width in zip(row, col_widths)))

def main():
    """Load the star data, prompt for a quarter and year, and print the table.

    The data is read from the ``data`` directory relative to the current
    working directory before the user is prompted.
    """
    source_dir = 'data'  # Change this to wherever the .pkl files live
    stars = calculate_cumulative_stars(load_data(source_dir))

    chosen_quarter = input("Enter the quarter (1, 2, 3, 4): ")
    chosen_year = int(input("Enter the year: "))

    report = analyze_quarter(stars, chosen_quarter, chosen_year)

    print("\nAnalysis Results:")
    print_results_table(report)

# Run the interactive analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Analysis Results:
Rank | Repo                                   | Stars at start | Stars at end | Reached 1k stars | End date    | Growth
-----------------------------------------------------------------------------------------------------------------------
1    | zed-industries_zed_stargazers          | 1000           | 31667        | 24 Jan 2024      | 23 Apr 2024 | 31.7  
2    | opendevin_opendevin_stargazers         | 1000           | 27046        | 15 Mar 2024      | 13 Jun 2024 | 27.0  
3    | maybe-finance_maybe_stargazers         | 1000           | 25898        | 13 Jan 2024      | 12 Apr 2024 | 25.9  
4    | heyputer_puter_stargazers              | 1000           | 19002        | 05 Mar 2024      | 03 Jun 2024 | 19.0  
5    | subquery_subql_stargazers              | 1000           | 18542        | 10 Jan 2024      | 09 Apr 2024 | 18.5  
6    | stitionai_devika_stargazers            | 1000           | 17633        | 22 Mar 2024      | 20 Jun 2024 | 17.6  
7    | hpcaitech_open