Checks how many repositories became abandoned in the time between the two files that are uploaded.

In [None]:
# Import necessary libraries
import pandas as pd
from google.colab import files
from datetime import datetime, timedelta
import os
import re

# Function to extract date and time from the filename with the format: features_YYYY-MM-DD_Hhh-Mmm-Sss (with optional (n) before .xlsx)
def extract_datetime_from_filename(filename):
    basename = os.path.basename(filename)
    try:
        # Remove any '(n)' suffix and extra spaces
        basename = re.sub(r'\(\d+\)', '', basename).strip()
        basename = basename.split('.')[0].strip()

        # Extract the date and time parts
        date_str = basename.split('_')[1]  # YYYY-MM-DD part
        time_str = basename.split('_')[2]  # Hhh-Mmm-Sss part

        # Remove 'H', 'M', and 'S' to form a valid time string
        time_str = time_str.replace('H', '').replace('-M', '').replace('-S', '')

        # Combine the date and time parts and convert to datetime object (tz-naive)
        date_time = datetime.strptime(f"{date_str} {time_str}", "%Y-%m-%d %H%M%S")
        return date_time
    except Exception as e:
        print(f"Error extracting date and time from filename {basename}: {e}")
        return None

# Function to determine if repositories are abandoned (3 months without update)
def check_abandonment(df, file_timestamp):
    # Ensure the "Last Update" column is in datetime format and tz-naive
    df['Last Update'] = pd.to_datetime(df['Last Update'], errors='coerce').dt.tz_localize(None)

    # Calculate the abandonment threshold (3 months)
    abandonment_threshold = timedelta(days=90)

    # Determine if repositories have been abandoned based on the file timestamp
    abandoned_repos = df[df['Last Update'] < (file_timestamp - abandonment_threshold)]

    return abandoned_repos

# Step 1: Upload two .xlsx files
print("Please upload the first .xlsx file")
uploaded_1 = files.upload()
file_name_1 = list(uploaded_1.keys())[0]  # Get the uploaded file name for the first file
df1 = pd.read_excel(file_name_1)

print("Please upload the second .xlsx file")
uploaded_2 = files.upload()
file_name_2 = list(uploaded_2.keys())[0]  # Get the uploaded file name for the second file
df2 = pd.read_excel(file_name_2)

# Step 2: Extract the timestamp from each file's name
timestamp_1 = extract_datetime_from_filename(file_name_1)
timestamp_2 = extract_datetime_from_filename(file_name_2)

if timestamp_1 and timestamp_2:
    # Step 3: Check abandoned repositories in both files
    abandoned_in_file1 = check_abandonment(df1, timestamp_1)
    abandoned_in_file2 = check_abandonment(df2, timestamp_2)

    # Step 4: Find repositories that became abandoned in file2 but not in file1
    new_abandonments = abandoned_in_file2[~abandoned_in_file2['Project URL'].isin(abandoned_in_file1['Project URL'])]

    print("Repositories newly abandoned in file2:")
    if not new_abandonments.empty:
        print(new_abandonments)
    else:
        print("No repositories became newly abandoned between the two data collections.")
else:
    print("Error extracting date from one or both file names.")


Please upload the first .xlsx file


Saving features_2024-09-24_H01-M36-S00.xlsx to features_2024-09-24_H01-M36-S00 (4).xlsx
Please upload the second .xlsx file


Saving features_2024-10-06_H20-M37-S49.xlsx to features_2024-10-06_H20-M37-S49 (4).xlsx
Repositories newly abandoned in file2:
                                            Project URL  \
27                        https://github.com/1rgs/MeGPT   
39           https://github.com/40Cakes/pokebot-bizhawk   
159        https://github.com/AsehesL/SceneSeparateDemo   
173   https://github.com/AutomationPanda/tau-pytest-bdd   
296        https://github.com/ClassicFaithful/32x-Jappa   
...                                                 ...   
6461             https://github.com/valpackett/evscript   
6539                 https://github.com/vtr0n/npyscreen   
6608                  https://github.com/weixin/WeIndex   
6718           https://github.com/xiangrongzeng/copy_re   
6881  https://github.com/zbtang/React-Native-TextInp...   

                                          Clone SSH URL  Organization  \
27                        git@github.com:1rgs/MeGPT.git          User   
39           git@g

Uses a logistic regression to get the feature weights, with the target variable being Abandonment. NOTE: This is using a logistic regression that just predicts whether a repository is abandoned or not, it does NOT predict future abandonment.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from datetime import datetime
import re
from google.colab import files
import numpy as np

# Function to extract the timestamp from a filename
def extract_datetime_from_filename(filename):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if date_match:
        return datetime.strptime(date_match.group(), '%Y-%m-%d')
    else:
        raise ValueError(f"Invalid filename format: {filename}")

# Function to convert strings with 'k', 'M', 'B' to numeric values
def convert_to_numeric(value, column_name):
    """
    Converts strings with 'k', 'M', 'B', and commas to their numerical equivalent.
    If the value is invalid, prints the column name and the value.
    """
    value = str(value).replace(',', '')  # Remove commas

    # Check for boolean-like values and convert them
    if value.lower() in ['true', 'yes']:
        return 1.0
    elif value.lower() in ['false', 'no']:
        return 0.0

    # Check for numeric suffixes and convert
    try:
        if 'k' in value:
            return float(value.replace('k', '')) * 1e3
        elif 'M' in value:
            return float(value.replace('M', '')) * 1e6
        elif 'B' in value:
            return float(value.replace('B', '')) * 1e9
        elif re.match(r'^[-+]?[0-9]*\.?[0-9]+$', value):  # Matches numeric values
            return float(value)
        else:
            return np.nan  # Return NaN for invalid values
    except Exception as e:
        print(f"Error in column '{column_name}' with value '{value}': {e}")
        return np.nan

# Step 1: Upload the .xlsx file
print("Please upload your .xlsx file.")
uploaded = files.upload()

# Step 2: Load the uploaded file into a DataFrame and extract the timestamp
for file_name, content in uploaded.items():
    df = pd.read_excel(file_name)
    current_date = extract_datetime_from_filename(file_name)

# Step 3: Process the DataFrame
df['last_update'] = pd.to_datetime(df['Last Update'], errors='coerce').dt.tz_localize(None)
df['days_since_last_update'] = (current_date - df['last_update']).dt.days
df['abandoned'] = df['days_since_last_update'].apply(lambda x: 1 if x > 30 else 0)

# Define the relevant columns to convert and use for model training
relevant_columns = [
    'Size', 'Number of Stars', 'Number of Watches_x', 'Number of Open Issues',
    'Number of forks', 'Topics', 'Open Pull Requests', 'Closed Pull Requests',
    'Followers of Owner', 'Number of Commits', 'Members of Owner',
    'Repos of Owner', 'Active Pull Requests', 'Active Issues',
    'Open Issues', 'Closed Issues', 'Number of Labels', 'Current Sponsors',
    'Sponsored', 'Number of Milestones', 'Number of Dependents',
    'Number of Files', 'Depth of Files', 'Number of Contributors',
    'Number of Merges', 'Number of Branches', 'Number of Tags',
    'Number of Links', 'Verified Owner', 'Has a Wiki', 'Has Discussions',
    'Has Projects', 'Has Pages', 'Archived', 'Has README', 'Has SECURITY',
    'Has Conduct', 'Has Contributing', 'Has ISSUE_TEMPLATE', 'Has PULL_TEMPLATE'
]

# Apply numeric conversion to relevant columns
df[relevant_columns] = df[relevant_columns].apply(lambda col: col.apply(lambda x: convert_to_numeric(x, col.name)))

# Add 'abandoned' to the list of relevant columns for logistic regression
model_columns = relevant_columns + ['abandoned']

# Step 4: Prepare the data for logistic regression
temp = df[model_columns]

# Drop rows with NaN values
temp = temp.dropna()

# Separate features (X) and target (y) for logistic regression
X = temp[relevant_columns]
y = temp['abandoned']

# Step 5: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train the logistic regression model
model = LogisticRegression(max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Step 8: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_prob):.2f}")

# Step 9: Extract and display feature weights
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': model.coef_[0]
}).sort_values(by='Weight', ascending=False)


feature_weights['Odds Ratio'] = np.exp(feature_weights['Weight'])
print(feature_weights[['Feature', 'Weight', 'Odds Ratio']])

# Optional: Display some key results
print(df[['Project URL', 'days_since_last_update', 'abandoned']])


Please upload your .xlsx file.


Saving features_2024-09-24_H01-M36-S00.xlsx to features_2024-09-24_H01-M36-S00 (1).xlsx
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.99      0.81      1427
           1       0.43      0.02      0.04       647

    accuracy                           0.69      2074
   macro avg       0.56      0.50      0.43      2074
weighted avg       0.61      0.69      0.57      2074

AUC-ROC Score: 0.74
                   Feature        Weight  Odds Ratio
2      Number of Watches_x  1.059091e-02    1.010647
22          Depth of Files  5.425466e-03    1.005440
6       Open Pull Requests  2.663078e-03    1.002667
33                Archived  1.244179e-03    1.001245
31            Has Projects  1.107599e-03    1.001108
29              Has a Wiki  1.038806e-03    1.001039
23  Number of Contributors  6.750187e-04    1.000675
34              Has README  6.442610e-04    1.000644
32               Has Pages  4.301318e-04    1.000430
19    Number

Uses a logistic regression to get the feature weights, with the target variable being Not Abandoned. NOTE: This is using a logistic regression that just predicts whether a repository is abandoned or not, it does NOT predict future abandonment.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from datetime import datetime
import re
from google.colab import files
import numpy as np

# Function to extract the timestamp from a filename
def extract_datetime_from_filename(filename):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if date_match:
        return datetime.strptime(date_match.group(), '%Y-%m-%d')
    else:
        raise ValueError(f"Invalid filename format: {filename}")

# Function to convert strings with 'k', 'M', 'B' to numeric values
def convert_to_numeric(value, column_name):
    """
    Converts strings with 'k', 'M', 'B', and commas to their numerical equivalent.
    If the value is invalid, prints the column name and the value.
    """
    value = str(value).replace(',', '')  # Remove commas

    # Check for boolean-like values and convert them
    if value.lower() in ['true', 'yes']:
        return 1.0
    elif value.lower() in ['false', 'no']:
        return 0.0

    # Check for numeric suffixes and convert
    try:
        if 'k' in value:
            return float(value.replace('k', '')) * 1e3
        elif 'M' in value:
            return float(value.replace('M', '')) * 1e6
        elif 'B' in value:
            return float(value.replace('B', '')) * 1e9
        elif re.match(r'^[-+]?[0-9]*\.?[0-9]+$', value):  # Matches numeric values
            return float(value)
        else:
            return np.nan  # Return NaN for invalid values
    except Exception as e:
        print(f"Error in column '{column_name}' with value '{value}': {e}")
        return np.nan

# Step 1: Upload the .xlsx file
print("Please upload your .xlsx file.")
uploaded = files.upload()

# Step 2: Load the uploaded file into a DataFrame and extract the timestamp
for file_name, content in uploaded.items():
    df = pd.read_excel(file_name)
    current_date = extract_datetime_from_filename(file_name)

# Step 3: Process the DataFrame
df['last_update'] = pd.to_datetime(df['Last Update'], errors='coerce').dt.tz_localize(None)
df['days_since_last_update'] = (current_date - df['last_update']).dt.days
df['not_abandoned'] = df['days_since_last_update'].apply(lambda x: 1 if x <= 30 else 0)

# Define the relevant columns to convert and use for model training
relevant_columns = [
    'Size', 'Number of Stars', 'Number of Watches_x', 'Number of Open Issues',
    'Number of forks', 'Topics', 'Open Pull Requests', 'Closed Pull Requests',
    'Followers of Owner', 'Number of Commits', 'Members of Owner',
    'Repos of Owner', 'Active Pull Requests', 'Active Issues',
    'Open Issues', 'Closed Issues', 'Number of Labels', 'Current Sponsors',
    'Sponsored', 'Number of Milestones', 'Number of Dependents',
    'Number of Files', 'Depth of Files', 'Number of Contributors',
    'Number of Merges', 'Number of Branches', 'Number of Tags',
    'Number of Links', 'Verified Owner', 'Has a Wiki', 'Has Discussions',
    'Has Projects', 'Has Pages', 'Archived', 'Has README', 'Has SECURITY',
    'Has Conduct', 'Has Contributing', 'Has ISSUE_TEMPLATE', 'Has PULL_TEMPLATE'
]

# Apply numeric conversion to relevant columns
df[relevant_columns] = df[relevant_columns].apply(lambda col: col.apply(lambda x: convert_to_numeric(x, col.name)))

# Add 'abandoned' to the list of relevant columns for logistic regression
model_columns = relevant_columns + ['not_abandoned']

# Step 4: Prepare the data for logistic regression
temp = df[model_columns]

# Drop rows with NaN values
temp = temp.dropna()

# Separate features (X) and target (y) for logistic regression
X = temp[relevant_columns]
y = temp['not_abandoned']

# Step 5: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Step 8: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_prob):.2f}")

# Step 9: Extract and display feature weights
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': model.coef_[0]
}).sort_values(by='Weight', ascending=False)


feature_weights['Odds Ratio'] = np.exp(feature_weights['Weight'])
print(feature_weights[['Feature', 'Weight', 'Odds Ratio']])

# Optional: Display some key results
print(df[['Project URL', 'days_since_last_update', 'not_abandoned']])

Please upload your .xlsx file.


Saving features_2024-09-24_H01-M36-S00.xlsx to features_2024-09-24_H01-M36-S00 (10).xlsx
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.01      0.01       647
           1       0.69      0.99      0.81      1427

    accuracy                           0.68      2074
   macro avg       0.47      0.50      0.41      2074
weighted avg       0.55      0.68      0.56      2074

AUC-ROC Score: 0.72
                   Feature        Weight  Odds Ratio
15           Closed Issues  1.871879e-03    1.001874
1          Number of Stars  1.719079e-03    1.001721
27         Number of Links  1.597620e-03    1.001599
7     Closed Pull Requests  1.465931e-03    1.001467
26          Number of Tags  1.260011e-03    1.001261
4          Number of forks  7.238823e-04    1.000724
14             Open Issues  3.706982e-04    1.000371
5                   Topics  3.411632e-04    1.000341
17        Current Sponsors  2.804560e-04    1.000280
16        N

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


This model uses a Random Forest Classifier instead of a Logistic Regression to predict whether a repository is abandoned or not. NOTE: This just predicts whether a repository is abandoned, NOT future abandonment.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from datetime import datetime
import re
from google.colab import files
import numpy as np

# Function to extract the timestamp from a filename
def extract_datetime_from_filename(filename):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if date_match:
        return datetime.strptime(date_match.group(), '%Y-%m-%d')
    else:
        raise ValueError(f"Invalid filename format: {filename}")

# Function to convert strings with 'k', 'M', 'B' to numeric values
def convert_to_numeric(value, col):
    """
    Converts strings with 'k', 'M', 'B', and commas to their numerical equivalent.
    For example:
        '1.2k' -> 1200
        '2.5M' -> 2500000
        '3.4B' -> 3400000000
    """
    value = str(value).replace(',', '')  # Remove commas
    # Check for suffix and multiply accordingly

    if value.lower() in ['true', 'yes']:
        return 1.0
    elif value.lower() in ['false', 'no']:
        return 0.0

    if 'k' in value:
        return float(value.replace('k', '')) * 1e3
    elif 'M' in value:
        return float(value.replace('M', '')) * 1e6
    elif 'B' in value:
        return float(value.replace('B', '')) * 1e9
    elif re.match(r'^[-+]?[0-9]*\.?[0-9]+$', value):  # Matches numeric values
        return float(value)  # No suffix, just return the float
    else:
        return np.nan  # Return NaN for invalid values

# Step 1: Upload the .xlsx file
print("Please upload your .xlsx file.")
uploaded = files.upload()

# Step 2: Load the uploaded file into a DataFrame and extract the timestamp
for file_name, content in uploaded.items():
    df = pd.read_excel(file_name)
    current_date = extract_datetime_from_filename(file_name)  # Use the timestamp from the filename

# Step 3: Process the DataFrame
df['last_update'] = pd.to_datetime(df['Last Update'], errors='coerce').dt.tz_localize(None)  # Convert 'Last Update' to datetime

# Calculate days since the last update
df['days_since_last_update'] = (current_date - df['last_update']).dt.days

# Define abandonment as 30 days of inactivity (adjustable threshold)
df['abandoned'] = df['days_since_last_update'].apply(lambda x: 1 if x > 30 else 0)

# Define the relevant columns to convert and use for model training
relevant_columns = [
    'Size', 'Number of Stars', 'Number of Watches_x', 'Number of Open Issues',
    'Number of forks', 'Topics', 'Open Pull Requests', 'Closed Pull Requests',
    'Followers of Owner', 'Number of Commits', 'Members of Owner',
    'Repos of Owner', 'Active Pull Requests', 'Active Issues',
    'Open Issues', 'Closed Issues', 'Number of Labels', 'Current Sponsors',
    'Sponsored', 'Number of Milestones', 'Number of Dependents',
    'Number of Files', 'Depth of Files', 'Number of Contributors',
    'Number of Merges', 'Number of Branches', 'Number of Tags',
    'Number of Links', 'Verified Owner', 'Has a Wiki', 'Has Discussions',
    'Has Projects', 'Has Pages', 'Archived', 'Has README', 'Has SECURITY',
    'Has Conduct', 'Has Contributing', 'Has ISSUE_TEMPLATE', 'Has PULL_TEMPLATE'
]

# Apply numeric conversion to relevant columns
df[relevant_columns] = df[relevant_columns].apply(lambda col: col.apply(lambda x: convert_to_numeric(x, col.name)))

# Add 'abandoned' to the list of relevant columns for logistic regression
model_columns = relevant_columns + ['abandoned']

# Step 4: Prepare the data for logistic regression
temp = df[model_columns]

# Drop rows with NaN values
temp = temp.dropna()

# Separate features (X) and target (y) for logistic regression
X = temp[relevant_columns]
y = temp['abandoned']

# Step 5: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # Probability of being abandoned

# Step 8: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_prob):.2f}")

# Step 9: Get feature importances and display them
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Adding a rank column based on importance
feature_importance_df['Rank'] = feature_importance_df['Importance'].rank(ascending=False)

# Re-sort DataFrame by Rank
feature_importance_df = feature_importance_df.sort_values(by='Rank')

print("Feature Importances with Ranks:")
print(feature_importance_df)

# Optional: Display results
print(df[['Project URL', 'days_since_last_update', 'abandoned']])


Please upload your .xlsx file.


Saving features_2024-09-24_H01-M36-S00.xlsx to features_2024-09-24_H01-M36-S00 (13).xlsx
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1427
           1       0.67      0.42      0.52       647

    accuracy                           0.76      2074
   macro avg       0.72      0.66      0.68      2074
weighted avg       0.74      0.76      0.74      2074

AUC-ROC Score: 0.81
Feature Importances with Ranks:
                   Feature  Importance  Rank
1          Number of Stars    0.113380   1.0
4          Number of forks    0.076559   2.0
16        Number of Labels    0.064560   3.0
9        Number of Commits    0.055852   4.0
0                     Size    0.052097   5.0
21         Number of Files    0.048919   6.0
11          Repos of Owner    0.048767   7.0
8       Followers of Owner    0.047037   8.0
2      Number of Watches_x    0.045490   9.0
15           Closed Issues    0.042276  10.0
27         Num

This script compares the feature importance from the logistic regressions using Abandoned and Not Abandoned as the targets.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from datetime import datetime
import re
from google.colab import files
import numpy as np

# Function to extract the timestamp from a filename
def extract_datetime_from_filename(filename):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if date_match:
        return datetime.strptime(date_match.group(), '%Y-%m-%d')
    else:
        raise ValueError(f"Invalid filename format: {filename}")

# Function to convert strings with 'k', 'M', 'B' to numeric values
def convert_to_numeric(value, column_name):
    value = str(value).replace(',', '')  # Remove commas
    if value.lower() in ['true', 'yes']:
        return 1.0
    elif value.lower() in ['false', 'no']:
        return 0.0
    try:
        if 'k' in value:
            return float(value.replace('k', '')) * 1e3
        elif 'M' in value:
            return float(value.replace('M', '')) * 1e6
        elif 'B' in value:
            return float(value.replace('B', '')) * 1e9
        elif re.match(r'^[-+]?[0-9]*\.?[0-9]+$', value):
            return float(value)
        else:
            return np.nan
    except Exception as e:
        print(f"Error in column '{column_name}' with value '{value}': {e}")
        return np.nan

# Step 1: Upload the .xlsx file
print("Please upload your .xlsx file.")
uploaded = files.upload()

# Step 2: Load the uploaded file into a DataFrame and extract the timestamp
for file_name, content in uploaded.items():
    df = pd.read_excel(file_name)
    current_date = extract_datetime_from_filename(file_name)

# Step 3: Process the DataFrame
df['last_update'] = pd.to_datetime(df['Last Update'], errors='coerce').dt.tz_localize(None)
df['days_since_last_update'] = (current_date - df['last_update']).dt.days
df['not_abandoned'] = df['days_since_last_update'].apply(lambda x: 1 if x <= 30 else 0)
df['abandoned'] = df['days_since_last_update'].apply(lambda x: 1 if x > 30 else 0)

# Define relevant columns
relevant_columns = [
    'Size', 'Number of Stars', 'Number of Watches_x', 'Number of Open Issues',
    'Number of forks', 'Topics', 'Open Pull Requests', 'Closed Pull Requests',
    'Followers of Owner', 'Number of Commits', 'Members of Owner',
    'Repos of Owner', 'Active Pull Requests', 'Active Issues',
    'Open Issues', 'Closed Issues', 'Number of Labels', 'Current Sponsors',
    'Sponsored', 'Number of Milestones', 'Number of Dependents',
    'Number of Files', 'Depth of Files', 'Number of Contributors',
    'Number of Merges', 'Number of Branches', 'Number of Tags',
    'Number of Links', 'Verified Owner', 'Has a Wiki', 'Has Discussions',
    'Has Projects', 'Has Pages', 'Archived', 'Has README', 'Has SECURITY',
    'Has Conduct', 'Has Contributing', 'Has ISSUE_TEMPLATE', 'Has PULL_TEMPLATE'
]

# Apply numeric conversion to relevant columns
df[relevant_columns] = df[relevant_columns].apply(lambda col: col.apply(lambda x: convert_to_numeric(x, col.name)))

# Prepare data for logistic regression
temp = df[relevant_columns + ['not_abandoned', 'abandoned']]
temp = temp.dropna()

# Separate features (X) and targets (y) for both models
X = temp[relevant_columns]

# Train model with "Not Abandoned" as the target
y_not_abandoned = temp['not_abandoned']
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(X, y_not_abandoned, test_size=0.3, random_state=42)
model_na = LogisticRegression(max_iter=1000)
model_na.fit(X_train_na, y_train_na)
weights_na = np.abs(model_na.coef_[0])  # Absolute weights for Not Abandoned

# Train model with "Abandoned" as the target
y_abandoned = temp['abandoned']
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_abandoned, test_size=0.3, random_state=42)
model_a = LogisticRegression(max_iter=1000)
model_a.fit(X_train_a, y_train_a)
weights_a = np.abs(model_a.coef_[0])  # Absolute weights for Abandoned

# Calculate absolute differences between weights
abs_diff_weights = np.abs(weights_a - weights_na)

# Create DataFrames for each table and sort as requested
# Table 1: Abandoned weights (sorted by absolute values)
df_abandoned = pd.DataFrame({'Feature': X.columns, 'Abandoned Weight': weights_a})
df_abandoned = df_abandoned.sort_values(by='Abandoned Weight', ascending=False)
print("Abandoned Weights (sorted):")
print(df_abandoned)

# Table 2: Not Abandoned weights (sorted by absolute values)
df_not_abandoned = pd.DataFrame({'Feature': X.columns, 'Not Abandoned Weight': weights_na})
df_not_abandoned = df_not_abandoned.sort_values(by='Not Abandoned Weight', ascending=False)
print("\nNot Abandoned Weights (sorted):")
print(df_not_abandoned)

# Table 3: Absolute difference between Abandoned and Not Abandoned weights (sorted)
df_abs_diff = pd.DataFrame({'Feature': X.columns, 'Abs Difference': abs_diff_weights})
df_abs_diff = df_abs_diff.sort_values(by='Abs Difference', ascending=False)
print("\nAbsolute Differences between Abandoned and Not Abandoned Weights (sorted):")
print(df_abs_diff)


Please upload your .xlsx file.


Saving features_2024-09-24_H01-M36-S00.xlsx to features_2024-09-24_H01-M36-S00.xlsx


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Abandoned Weights (sorted):
                   Feature  Abandoned Weight
7     Closed Pull Requests      2.091319e-03
15           Closed Issues      1.863380e-03
1          Number of Stars      1.734730e-03
2      Number of Watches_x      1.394566e-03
27         Number of Links      1.294995e-03
26          Number of Tags      1.069456e-03
4          Number of forks      7.764990e-04
23  Number of Contributors      4.934562e-04
14             Open Issues      3.815720e-04
6       Open Pull Requests      2.914556e-04
5                   Topics      2.412890e-04
17        Current Sponsors      1.934109e-04
25      Number of Branches      1.731524e-04
16        Number of Labels      1.583540e-04
3    Number of Open Issues      1.112431e-04
22          Depth of Files      6.757657e-05
10        Members of Owner      4.421334e-05
12    Active Pull Requests      4.173447e-05
13           Active Issues      3.672186e-05
11          Repos of Owner      2.638396e-05
33                Archived 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
