In [2]:
import pandas as pd

In [4]:
def get_statistics_by_column(cs_projects, java_projects, js_projects, py_projects, ts_projects, column):
    # Create a dictionary of the projects for easier iteration
    projects = {
        'C# Projects': cs_projects,
        'Java Projects': java_projects,
        'JavaScript Projects': js_projects,
        'Python Projects': py_projects,
        'TypeScript Projects': ts_projects
    }
    
    # Iterate through each project type
    for project_name, project_df in projects.items():
        print(f"Statistics for {project_name}:")
        
        # Check if the column exists in the DataFrame
        if column in project_df.columns:
            column_data = project_df[column]
            
            # Calculate statistics
            mean_value = column_data.mean()
            median_value = column_data.median()
            q1 = column_data.quantile(0.25)
            q3 = column_data.quantile(0.75)
            iqr = q3 - q1
            std_dev = column_data.std()
            
            # Display the statistics
            print(f"  Mean: {mean_value}")
            print(f"  Median: {median_value}")
            print(f"  1st Quartile (Q1): {q1}")
            print(f"  3rd Quartile (Q3): {q3}")
            print(f"  Interquartile Range (IQR): {iqr}")
            print(f"  Standard Deviation: {std_dev}\n")
        else:
            print(f"  Column '{column}' not found in {project_name}.\n")
            
def obtain_q1_by_column(projects, column):
    return projects[column].quantile(0.25)

def obtain_97_by_column(projects, column):
    mean = projects[column].mean()
    std_dev = projects[column].std()
    
    # Calculate thresholds for outliers
    lower_threshold = mean - 1.5 * std_dev
    if lower_threshold < 0: lower_threshold = 0
    upper_threshold = mean + 1.5 * std_dev
    if upper_threshold > 100: upper_threshold = 100
    return (upper_threshold, lower_threshold)

In [5]:
py_projects = pd.read_csv('../data/post_cut-off_repos.csv', encoding='latin-1')

In [7]:
py_projects = py_projects[(~py_projects['license'].isna()) & (py_projects['license'] != 'Other') & (py_projects['contributors'] >= 2) & (py_projects['releases'] >= 1) & (py_projects['isArchived'] == False)]

In [8]:
py_projects.shape

(732, 40)

In [9]:
all_licenses = pd.concat([
    py_projects['license']
])

# Get distinct licenses
distinct_licenses = all_licenses.unique()

# Convert to list
distinct_licenses_list = distinct_licenses.tolist()
distinct_licenses_list

['MIT License',
 'Apache License 2.0',
 'GNU Affero General Public License v3.0',
 'GNU General Public License v3.0',
 'BSD 3-Clause New or Revised License',
 'Mozilla Public License 2.0',
 'GNU General Public License v2.0',
 'BSD 2-Clause Simplified License',
 'Universal Permissive License v1.0']

In [10]:
py_projects = py_projects[~py_projects['license'].str.contains(r'Creative Commons|Open Font',regex=True)]

In [11]:
py_projects.shape

(732, 40)

In [13]:
for projects in [py_projects]:
    projects['codeRatio'] = projects.apply(
        lambda row: 
        0 if pd.isna(row['codeLines']) or pd.isna(row['commentLines']) or pd.isna(row['blankLines']) or (row['codeLines'] + row['commentLines'] + row['blankLines']) == 0 
        else row['codeLines'] / (row['codeLines'] + row['commentLines'] + row['blankLines']) * 100, 
        axis=1)

In [14]:
metrics = ['codeLines', 'totalPullRequests', 'totalIssues', 'codeRatio']

# Initialize a dictionary to hold the threshold values
thresholds = {
    'py': {}
}

# Calculate thresholds for each project type and metric
for project_type, projects in zip(['py'],
                                   [py_projects]):
    for metric in metrics:
        if metric == 'codeRatio':
            threshold = obtain_97_by_column(projects, metric)
        else:
            threshold = obtain_q1_by_column(projects, metric)
            
        thresholds[project_type][metric] = threshold


In [15]:
thresholds

{'py': {'codeLines': np.float64(4591.5),
  'totalPullRequests': np.float64(12.0),
  'totalIssues': np.float64(4.0),
  'codeRatio': (np.float64(90.50833620867922), np.float64(54.94200913592581))}}

In [16]:
filtered_projects = {}

for project_type, projects in [('py', py_projects)]:
    thresholds_project = thresholds[project_type]
    filtered_projects[project_type] = projects.loc[
        (projects['codeLines'] >= thresholds_project['codeLines']) &
        (projects['totalPullRequests'] >= thresholds_project['totalPullRequests']) &
        (projects['totalIssues'] >= thresholds_project['totalIssues']) & 
        ((projects['codeRatio'] > thresholds_project['codeRatio'][1]) & projects['codeRatio'] < thresholds_project['codeRatio'][0])
    ]
    filtered_projects[project_type] = filtered_projects[project_type].sort_values(by='stargazers', ascending=False)
    filtered_projects[project_type]['url'] = filtered_projects[project_type]['name'].apply(lambda x: 'https://github.com/'+x)
    filtered_projects[project_type]['include'] = ""
    filtered_projects[project_type]['language'] = ""
    filtered_projects[project_type]['comment'] = ""
    new_columns = ['url', 'include', 'language', 'comment']
    existing_columns = [col for col in filtered_projects[project_type].columns if col not in new_columns] 
    new_column_order = new_columns + existing_columns
    filtered_projects[project_type] = filtered_projects[project_type][new_column_order]
    

In [17]:
filtered_projects['py'].shape

(365, 40)

In [19]:
filtered_projects['py'].to_csv('../data/complete_engineered_post_cut-off_projects.csv', index=False)