In [16]:
# code taken as-is or with minor modifications from https://doi.org/10.5281/zenodo.15871467
import pandas as pd

In [17]:
def get_statistics_by_column(cs_projects, java_projects, js_projects, py_projects, ts_projects, column):
    # Create a dictionary of the projects for easier iteration
    projects = {
        'C# Projects': cs_projects,
        'Java Projects': java_projects,
        'JavaScript Projects': js_projects,
        'Python Projects': py_projects,
        'TypeScript Projects': ts_projects
    }
    
    # Iterate through each project type
    for project_name, project_df in projects.items():
        print(f"Statistics for {project_name}:")
        
        # Check if the column exists in the DataFrame
        if column in project_df.columns:
            column_data = project_df[column]
            
            # Calculate statistics
            mean_value = column_data.mean()
            median_value = column_data.median()
            q1 = column_data.quantile(0.25)
            q3 = column_data.quantile(0.75)
            iqr = q3 - q1
            std_dev = column_data.std()
            
            # Display the statistics
            print(f"  Mean: {mean_value}")
            print(f"  Median: {median_value}")
            print(f"  1st Quartile (Q1): {q1}")
            print(f"  3rd Quartile (Q3): {q3}")
            print(f"  Interquartile Range (IQR): {iqr}")
            print(f"  Standard Deviation: {std_dev}\n")
        else:
            print(f"  Column '{column}' not found in {project_name}.\n")
            
def obtain_q1_by_column(projects, column):
    return projects[column].quantile(0.25)

def obtain_97_by_column(projects, column):
    mean = projects[column].mean()
    std_dev = projects[column].std()
    
    # Calculate thresholds for outliers
    lower_threshold = mean - 1.5 * std_dev
    if lower_threshold < 0: lower_threshold = 0
    upper_threshold = mean + 1.5 * std_dev
    if upper_threshold > 100: upper_threshold = 100
    return (upper_threshold, lower_threshold)

In [18]:
df1 = pd.read_csv('../data/engineered_projects_from_csn.csv')
df2 = pd.read_csv('../data/repo_code_comment_ratio_csn_engineered_projects.csv')

In [19]:
df1.columns

Index(['repository', 'isFork', 'commits', 'branches', 'releases', 'forks',
       'mainLanguage', 'defaultBranch', 'license', 'homepage', 'watchers',
       'stargazers', 'contributors', 'size', 'createdAt', 'pushedAt',
       'updatedAt', 'totalIssues', 'openIssues', 'totalPullRequests',
       'openPullRequests', 'lastCommit', 'lastCommitSHA', 'hasWiki',
       'isArchived', 'isDisabled', 'languages', 'labels', 'topics'],
      dtype='object')

In [20]:
df2.columns

Index(['repository', 'blankLines', 'codeLines', 'commentLines'], dtype='object')

In [21]:
py_projects = pd.merge(df1, df2, on='repository', how='inner')

In [22]:
py_projects.shape

(4290, 32)

In [23]:
py_projects.columns

Index(['repository', 'isFork', 'commits', 'branches', 'releases', 'forks',
       'mainLanguage', 'defaultBranch', 'license', 'homepage', 'watchers',
       'stargazers', 'contributors', 'size', 'createdAt', 'pushedAt',
       'updatedAt', 'totalIssues', 'openIssues', 'totalPullRequests',
       'openPullRequests', 'lastCommit', 'lastCommitSHA', 'hasWiki',
       'isArchived', 'isDisabled', 'languages', 'labels', 'topics',
       'blankLines', 'codeLines', 'commentLines'],
      dtype='object')

In [24]:
py_projects['codeRatio'] = py_projects.apply(
        lambda row: 
        0 if pd.isna(row['codeLines']) or pd.isna(row['commentLines']) or pd.isna(row['blankLines']) or (row['codeLines'] + row['commentLines'] + row['blankLines']) == 0 
        else row['codeLines'] / (row['codeLines'] + row['commentLines'] + row['blankLines']) * 100, 
        axis=1)

In [25]:
py_projects.shape

(4290, 33)

In [26]:
metrics = ['codeLines', 'totalPullRequests', 'totalIssues', 'codeRatio']

In [27]:
# Initialize a dictionary to hold the threshold values
thresholds = {
    'py': {}
}

# Calculate thresholds for each project type and metric
for project_type, projects in zip(['py'],
                                   [py_projects]):
    for metric in metrics:
        if metric == 'codeRatio':
            threshold = obtain_97_by_column(projects, metric)
        else:
            threshold = obtain_q1_by_column(projects, metric)
            
        thresholds[project_type][metric] = threshold

In [28]:
thresholds

{'py': {'codeLines': np.float64(1380.25),
  'totalPullRequests': np.float64(14.0),
  'totalIssues': np.float64(0.0),
  'codeRatio': (np.float64(87.14634132468765), np.float64(45.38691855805246))}}

In [29]:
filtered_projects = {}

for project_type, projects in [('py', py_projects)]:
    thresholds_project = thresholds[project_type]
    filtered_projects[project_type] = projects.loc[
        (projects['codeLines'] >= thresholds_project['codeLines']) &
        (projects['totalPullRequests'] >= thresholds_project['totalPullRequests']) &
        (projects['totalIssues'] >= thresholds_project['totalIssues']) & 
        ((projects['codeRatio'] > thresholds_project['codeRatio'][1]) & projects['codeRatio'] < thresholds_project['codeRatio'][0])
    ]
    filtered_projects[project_type] = filtered_projects[project_type].sort_values(by='stargazers', ascending=False)
    filtered_projects[project_type]['url'] = filtered_projects[project_type]['repository'].apply(lambda x: 'https://github.com/'+x)
    new_columns = ['url']
    existing_columns = [col for col in filtered_projects[project_type].columns if col not in new_columns] 
    new_column_order = new_columns + existing_columns
    filtered_projects[project_type] = filtered_projects[project_type][new_column_order]

In [30]:
py_projects.shape

(4290, 33)

In [31]:
filtered_projects['py'].shape

(2728, 34)

In [32]:
filtered_projects['py'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2728 entries, 268 to 508
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   url                2728 non-null   object 
 1   repository         2728 non-null   object 
 2   isFork             2728 non-null   bool   
 3   commits            2728 non-null   int64  
 4   branches           2728 non-null   int64  
 5   releases           2728 non-null   int64  
 6   forks              2728 non-null   int64  
 7   mainLanguage       2728 non-null   object 
 8   defaultBranch      2728 non-null   object 
 9   license            2728 non-null   object 
 10  homepage           1339 non-null   object 
 11  watchers           2728 non-null   int64  
 12  stargazers         2728 non-null   int64  
 13  contributors       2728 non-null   int64  
 14  size               2728 non-null   int64  
 15  createdAt          2728 non-null   object 
 16  pushedAt           2728 non-

In [33]:
filtered_projects['py'].head(3)

Unnamed: 0,url,repository,isFork,commits,branches,releases,forks,mainLanguage,defaultBranch,license,...,hasWiki,isArchived,isDisabled,languages,labels,topics,blankLines,codeLines,commentLines,codeRatio
268,https://github.com/huggingface/pytorch-pretrai...,huggingface/pytorch-pretrained-BERT,False,20465,941,228,30430,Python,main,Apache License 2.0,...,True,False,False,"{""Python"": 71882666, ""Cuda"": 204026, ""Dockerfi...","[""Accelerate"", ""Agents"", ""Audio"", ""Benchmarks""...","[""nlp"", ""natural-language-processing"", ""pytorc...",301316,1162524,308824,65.580618
3902,https://github.com/nvbn/thefuck,nvbn/thefuck,False,1651,2,17,3765,Python,master,MIT License,...,True,False,False,"{""Python"": 543910, ""Dockerfile"": 754, ""PowerSh...","[""bug"", ""cannot reproduce"", ""contributor-frien...","[""python"", ""shell""]",3783,11276,2218,65.265961
2914,https://github.com/ageitgey/face_recognition,ageitgey/face_recognition,False,238,8,2,13681,Python,master,MIT License,...,True,False,False,"{""Python"": 35036, ""Dockerfile"": 6930, ""Makefil...","[""bug"", ""dependencies"", ""duplicate"", ""enhancem...","[""machine-learning"", ""face-detection"", ""face-r...",1698,3653,1095,56.670804


In [34]:
filtered_projects['py'].to_csv('../data/complete_engineered_csn_projects.csv', index=False)