In [2]:
import pandas as pd

In [3]:
def total_das_cps(df):
    """
    Print the number of diagnostic-analyzer rows and code-fix-provider rows.

    A row is counted for a kind when its 'Type' value matches the kind's
    name from the start of the string (``Series.str.match``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Type'.

    Returns
    -------
    None
        The counts are written to stdout.
    """
    print("Total rows")
    type_column = df['Type']
    is_analyzer = type_column.str.match('DIAGNOSTIC_ANALYZER')
    is_codefix = type_column.str.match('CODEFIX_PROVIDER')

    # Both counts are computed before anything else is printed.
    row_counts = {
        "da: ": df[is_analyzer].shape[0],
        "cp: ": df[is_codefix].shape[0],
    }
    for label, count in row_counts.items():
        print(label, count)

In [4]:
def unique_diagnostic_ids(df):
    """
    Print the number of distinct DiagnosticID values per component kind.

    Rows are split by whether their 'Type' value matches
    'DIAGNOSTIC_ANALYZER' or 'CODEFIX_PROVIDER' (``Series.str.match``), and
    the distinct 'DiagnosticID' values in each subset are counted with
    ``nunique``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Type' and 'DiagnosticID' columns.

    Returns
    -------
    None
        The counts are written to stdout.
    """
    print("Unique diagnostic ids")

    distinct_counts = []
    for type_name in ('DIAGNOSTIC_ANALYZER', 'CODEFIX_PROVIDER'):
        rows_of_type = df[df['Type'].str.match(type_name)]
        distinct_counts.append(rows_of_type['DiagnosticID'].nunique())

    analyzer_ids, codefix_ids = distinct_counts
    print("da: ", analyzer_ids)
    print("cp: ", codefix_ids)

In [5]:
def _rows_with_repeated_id(rows):
    """Return the rows of ``rows`` whose DiagnosticID occurs more than once, grouped by id."""
    repeated_groups = [
        group for _, group in rows.groupby("DiagnosticID") if len(group) > 1
    ]
    if not repeated_groups:
        # pd.concat raises ValueError when given nothing to concatenate;
        # an empty slice keeps the columns and prints as an empty frame.
        return rows.iloc[0:0]
    return pd.concat(repeated_groups)


def duplicate_diagnostic_ids(df):
    """
    Print all rows whose DiagnosticID is shared by more than one row,
    separately for diagnostic analyzers and for code-fix providers.

    Observed cause (author's notes): duplicates seem to happen mostly because
    the same packages are downloaded multiple times, but with different
    versions. Specifically:
    StyleCop:
        1. StyleCop.Analyzers.1.1.118
        2. StyleCop.Analyzers.Unstable.1.2.0.333
        3. StyleCop.Analyzers.1.0.0
        4. StyleCop.Analyzers.1.0.2
    XUnit:
        1. xunit.analyzers.0.10.0
        2. xunit.analyzers.0.7.0
    SonarAnalyzer.CSharp:
        1. SonarAnalyzer.CSharp.8.20.0.28934
        2. SonarAnalyzer.CSharp.1.21.0
        3. SonarAnalyzer.CSharp.1.23.0.1857
        4. SonarAnalyzer.CSharp.8.6.0.16497
        5. SonarAnalyzer.CSharp.8.7.0.17535

    This is because a number of analyzer packages use other analyzer packages
    as dependencies. Sometimes they simply bundle different analyzer packages
    without creating any DiagnosticAnalyzers / CodeFixProviders themselves.
    The downside of this is that they often reference outdated versions. This
    is why we can see so many different versions of the same packages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Type' and 'DiagnosticID' columns.

    Returns
    -------
    None
        Two frames are printed (analyzers first, then code-fix providers).
        A kind without any repeated id prints as an empty frame instead of
        raising.
    """
    print("Duplicate diagnostic ids")
    diagnostic_analyzers = df[df['Type'].str.match('DIAGNOSTIC_ANALYZER')]
    codefix_providers = df[df['Type'].str.match('CODEFIX_PROVIDER')]
    da_duplicates = _rows_with_repeated_id(diagnostic_analyzers)
    cp_duplicates = _rows_with_repeated_id(codefix_providers)

    # Show at most 50 rows per frame; longer frames are truncated by pandas.
    with pd.option_context(
        'display.min_rows', 50,
        'display.max_rows', 50
    ):
        print(da_duplicates)
        print(cp_duplicates)

In [6]:
def unique_source_packages(df, original_packages='nuget_packages.txt'):
    """
    Print all packages that have their own diagnostic ids, disregarding versioning.

    The version is stripped from each 'HostingPackageName' value by removing
    every "dot followed by digits" run, and the de-duplicated names are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'HostingPackageName'. It is not
        modified.
    original_packages : str
        Unused by this function; kept so the signature stays unchanged.

    Returns
    -------
    None
        The unique package names are written to stdout.
    """
    # Not optimal - any dots followed by numbers are removed.
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # (non-regex) replacement, which would leave the versions in place.
    package_names = (
        df['HostingPackageName']
        .str.replace(r'\.\d+', '', regex=True)
        .drop_duplicates()
    )

    with pd.option_context(
        'display.min_rows', 70,
        'display.max_rows', 70,
        'display.max_colwidth', 300
    ):
        print(package_names)

In [7]:
def missed_packages(df, original_packages='nuget_packages.txt'):
    """
    Print all packages that were not in the original list of NuGet analyzer
    packages, but have DiagnosticAnalyzers/CodeFixProviders and are used as
    dependencies by packages of the original list.

    This means we are using their diagnostics for the dataset, but they are
    potentially outdated versions.

    Author's notes on how the original list was built:
    Queried nuget.org for "analyzers"
    Problem:
    --> Did not query for "analyzer" e.g.
          Microsoft.AnalyzerPowerPack
          SmartAnalyzers.ExceptionAnalyzer
          TODO: Redo search with "analyzer"
    --> Also missed "Microsoft.CodeAnalysis.CSharp"
    --> System packages may not be on NuGet.org e.g.
          System.Runtime.Analyzers
          System.Runtime.InteropServices.Analyzers

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'HostingPackageName'. It is not
        modified.
    original_packages : str
        Path to a text file with one package name per line; surrounding
        whitespace on each line is stripped.

    Returns
    -------
    None
        The version-less package names that are absent from the file are
        written to stdout.
    """
    print("Missed packages")

    # Not optimal - any dots followed by numbers are removed.
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # (non-regex) replacement, which would leave the versions in place and
    # make every versioned name look "missed".
    package_names = (
        df['HostingPackageName']
        .str.replace(r'\.\d+', '', regex=True)
        .drop_duplicates()
    )

    # Context manager so the file handle is closed deterministically.
    with open(original_packages) as package_file:
        original_packages_list = [line.strip() for line in package_file]
    df_missed_packages = package_names[~package_names.isin(original_packages_list)]

    with pd.option_context(
        'display.min_rows', 100,
        'display.max_rows', 100,
        'display.max_colwidth', 300
    ):
        print(df_missed_packages)

In [8]:
def calculate_analyzer_statistics(csv_file="analyzer_package_details.csv"):
    """
    Read the analyzer package details CSV and print the enabled report.

    Only the unique-source-packages report is currently active; the other
    reports are left commented out below.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file passed to ``pandas.read_csv``.
    """
    analyzer_details = pd.read_csv(csv_file)

    # TODO: Find out why duplicates exist
    # analyzer_details.drop_duplicates(inplace=True)

    # Disabled reports:
    # total_das_cps(analyzer_details)
    # unique_diagnostic_ids(analyzer_details)
    # duplicate_diagnostic_ids(analyzer_details)

    unique_source_packages(analyzer_details)

    # Disabled report:
    # missed_packages(analyzer_details)

    # TODO: Find
    # 1. Percentage of diagnostic_analyzers that have a codefix_provider


In [9]:
# Run the statistics with the default CSV file name
# ("analyzer_package_details.csv"); the output below is what this call printed.
calculate_analyzer_statistics()

0                                      FluentAssertions.Analyzers
186                     Microsoft.AspNetCore.Components.Analyzers
193                            Microsoft.AspNetCore.Mvc.Analyzers
198      Microsoft.Azure.WebJobs.Extensions.DurableTask.Analyzers
233                              Microsoft.CodeAnalysis.Analyzers
335                     Microsoft.CodeAnalysis.BannedApiAnalyzers
341                     Microsoft.CodeAnalysis.PublicApiAnalyzers
355                   Microsoft.CodeAnalysis.VersionCheckAnalyzer
356                               Microsoft.CodeQuality.Analyzers
699                      Microsoft.DependencyValidation.Analyzers
705                       Microsoft.EntityFrameworkCore.Analyzers
706                                   Microsoft.NetCore.Analyzers
1082                             Microsoft.NetFramework.Analyzers
1118                         Microsoft.VisualStudio.SDK.Analyzers
1127                   Microsoft.VisualStudio.Threading.Analyzers
1235      