In [17]:
import pandas as pd

# Load the preprocessed dataset
df = pd.read_csv(input_path)

# Display dataset information
df.info()
 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Columns: 1141 entries, branches to desc_trigram_most_common_yet_anoth_upload
dtypes: float64(9), int64(1132)
memory usage: 2.2 MB


In [None]:
# Define corrected feature sets
feature_sets = {
    "repo_meta": [
        "branches", "releases", "forks", "watchers", "stargazers", 
        "contributors", "size"
    ],
    "repo_activity": [
        "totalIssues", "openIssues", "totalPullRequests", "openPullRequests", 
        "commit_count", "issue_count", "network_count", "subscribers_count"
    ],
    "dependency_only": ["dep_count"] + [col for col in df.columns if col.startswith("dep_")],
    "all_but_text": [],  # To be populated dynamically
    "full_features": df.columns.tolist()  # Use all available features
}

# Exclude text-based features from 'all_but_text'
text_features = [col for col in df.columns if col.startswith(("desc_bigram_", "desc_trigram_"))]
feature_sets["all_but_text"] = [col for col in df.columns if col not in text_features]

# Verify the corrected feature sets
for key, features in feature_sets.items():
    print(f"{key}: {len(features)} features")


repo_meta: 7 features
repo_activity: 8 features
dependency_only: 769 features
all_but_text: 817 features
full_features: 1141 features


In [19]:
# Define the correct target variable
target_variable = "vp-category-equalfreq"

# Create DataFrames for each feature set, ensuring the target variable is included
df_feature_sets = {
    key: df[features + [target_variable]] for key, features in feature_sets.items()
}

# Display the shape of each feature set DataFrame
for key, df_subset in df_feature_sets.items():
    print(f"{key}: {df_subset.shape}")


repo_meta: (255, 8)
repo_activity: (255, 9)
dependency_only: (255, 770)
all_but_text: (255, 818)
full_features: (255, 1142)


In [20]:
import os

# Define the output directory
output_dir = "datasets"

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Save each feature set as a CSV file
for key, df_subset in df_feature_sets.items():
    file_path = os.path.join(output_dir, f"{key}.csv")
    df_subset.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")


Saved: datasets/repo_meta.csv
Saved: datasets/repo_activity.csv
Saved: datasets/dependency_only.csv
Saved: datasets/all_but_text.csv
Saved: datasets/full_features.csv
