### Data Pre-Processing & Visualization
Given the size of the dataset (50 GB), we cannot load it into memory at once, so we will stream its contents in chunks for each of Q1-Q3.
We will extract the following objects:
* repositories
* users

Each object will have several associated features which will be determined as we conduct EDA (exploratory data analysis).

In [None]:
# Basic utilities and data handling
import json
import numpy as np
import pandas as pd
from IPython.display import display

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning preprocessing and modeling
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Data serialization
import ijson

Loads primary dataset as JSON into a dataframe based on chunk size (e.g. 1000 JSON entries per chunk)

In [None]:
# Debugging purposes; loads & prints the first json entry only
def load_json(file, index=0):
    """Print one sample ``repo_list`` from a JSON-lines file and save it as CSV.

    The file is read in chunks of 1000 lines. From chunk number ``index``
    (0-based) the ``repo_list`` of the first row is printed and written to
    ``sample.csv`` in the current working directory.

    Args:
        file: Path to the JSON-lines input file.
        index: 0-based position of the chunk to sample from.

    Returns:
        None. Problems are reported on stdout instead of being raised.
    """
    try:
        # The context manager closes the underlying file handle even though
        # iteration stops early.
        with pd.read_json(file, lines=True, chunksize=1000) as chunks:
            repo_list = None
            for i, chunk in enumerate(chunks):
                if i == index:
                    repo_list = chunk["repo_list"].iloc[0]
                    print(repo_list)
                    break

        if repo_list is None:
            # The file has fewer than index + 1 chunks (or the entry is null);
            # do not write an empty CSV in that case.
            print(f"No repo_list found in chunk {index}.")
            return

        df = pd.DataFrame(repo_list)

        # Save the DataFrame to a CSV file
        df.to_csv("sample.csv", index=False)
        print("Data saved to sample.csv")

    except FileNotFoundError:
        print(f"File '{file}' not found.")
    except Exception as e:
        print(f"Error: {e}")

In [None]:
# Smoke-test the loader on the first chunk of the primary dataset.
file = "data.json"
load_json(file, index=0)

### Features for Q1:
* repo_list (to access the repos and their nested features)
* all repo features most associated with 'popularity'
* id (user's id)
* login (to identify the user; e.g. I would call Hashem drthetasigma on github, not Hashem)

### Preprocessing for ALL Repository Data, one chunk of 1000 objects at a time

In [None]:
# Function to extract specified features
# (output column, key in the raw repo dict, default used when the key is absent)
_REPO_FIELDS = (
    ('repo_full_name', 'full_name', ''),
    ('repo_id', 'id', ''),
    ('repo_description', 'description', ''),
    ('repo_size', 'size', 0),
    ('repo_license', 'license', ''),
    ('repo_stargazers_count', 'stargazers_count', 0),
    ('repo_fork', 'fork', False),
    ('repo_owner_id', 'owner_id', ''),
    ('repo_created_at', 'created_at', ''),
    ('repo_pushed_at', 'pushed_at', ''),
    ('repo_updated_at', 'updated_at', ''),
    ('repo_has_wiki', 'has_wiki', False),
    ('repo_open_issues', 'open_issues', 0),
    ('repo_language', 'language', ''),
    ('repo_forks_count', 'forks_count', 0),
    ('repo_default_branch', 'default_branch', ''),
)
# Fixed column order so every chunk is written with the same layout.
_REPO_COLUMNS = ['user_login'] + [column for column, _, _ in _REPO_FIELDS]


def preprocess_data(infile, outfile, chunk_size=1000):
    """Flatten the per-user ``repo_list`` entries of a JSON-lines file into a CSV.

    The input is streamed ``chunk_size`` lines at a time. Every repository
    found in a row's ``repo_list`` becomes one CSV row holding the owner's
    ``login`` plus the repository fields listed in ``_REPO_FIELDS``.

    Args:
        infile: Path to the JSON-lines input file.
        outfile: Path of the CSV file to write (overwritten on the first chunk,
            appended to afterwards).
        chunk_size: Number of input lines to process per chunk.
    """
    first_chunk = True  # the header is written only with the first chunk

    for chunk in pd.read_json(infile, lines=True, chunksize=chunk_size):
        repos = []
        for user_login, repo_list in zip(chunk['login'], chunk['repo_list']):
            # A row without a repo_list shows up as None/NaN; NaN is truthy,
            # so test the type instead of relying on truthiness.
            if not isinstance(repo_list, list):
                continue

            for repo in repo_list:
                repo_data = {'user_login': user_login}
                for column, key, default in _REPO_FIELDS:
                    repo_data[column] = repo.get(key, default)
                repos.append(repo_data)

        # Passing the columns explicitly keeps the header correct even when a
        # chunk (in particular the first one) contains no repositories.
        df_processed = pd.DataFrame(repos, columns=_REPO_COLUMNS)
        write_mode = 'w' if first_chunk else 'a'
        df_processed.to_csv(outfile, mode=write_mode, index=False, header=first_chunk)
        first_chunk = False
    
        

In [None]:
# Flatten every repository in the primary dataset into one CSV file.
preprocess_data(infile='data.json', outfile='./datasets/repos.csv')

### Relevant Features

* Repo Name
* Repo Size
* Repo Star Count
* Repo Fork Count
* Repo Fork (1- True, 0 - False)
* Repo Age (Use created_at and pushed_at/updated_at to create this new feature)
* Repo Open Issues
* Repo Language


#### But that leaves the question of, which metrics do we want to train our models to predict for 'popularity'?
1. Star Count
2. Fork Count
3. Open Issue Count

### Let's extract the *important features* first!

How:
* Read 1000 rows of the CSV file per iteration.
* Manually create 'age' feature from existing columns.
* Append modified rows to chunk_list.

In [None]:
# Columns to load from the flattened repository CSV
columns_to_keep = [
    'repo_full_name', 'repo_size', 'repo_stargazers_count',
    'repo_forks_count', 'repo_fork', 'repo_language',
    'repo_created_at', 'repo_open_issues', 'repo_description',
    'repo_updated_at'
]

# Number of CSV rows read per iteration
chunksize = 10**3  # chunk_size = 1000

# Processed chunks are collected here and combined after the loop
chunk_list = []

# Read the CSV file in chunks
for chunk in pd.read_csv('./datasets/repos.csv', usecols=columns_to_keep, chunksize=chunksize):

    # Convert the 'repo_created_at' and 'repo_updated_at' columns to datetime
    chunk['repo_created_at'] = pd.to_datetime(chunk['repo_created_at'])
    chunk['repo_updated_at'] = pd.to_datetime(chunk['repo_updated_at'])

    # 'repo_age' is the number of whole days between creation and last update
    chunk['repo_age'] = (chunk['repo_updated_at'] - chunk['repo_created_at']).dt.days

    # Append the processed chunk to the list
    chunk_list.append(chunk)

# Combine the chunks into the single DataFrame used by the following cells.
# ignore_index=True yields a clean 0..n-1 index, which the later column-wise
# concatenation of one-hot encoded columns relies on.
df = pd.concat(chunk_list, ignore_index=True)

How many values do we need to impute? How will we impute them? Let's observe the count.

In [None]:
# Tally the missing (NaN) entries of every column to see what needs imputing
null_values = df.isna().sum()
print("Null values in each column:\n", null_values)

In [None]:
# Number of rows, i.e. repositories, in the dataset
print(df.shape[0])

## Correlation Matrices (Pearson, Spearman, Kendall) on Target Features

In [None]:
# Restrict the analysis to the three candidate popularity targets
target_columns = ['repo_stargazers_count', 'repo_forks_count', 'repo_open_issues']
target_features = df[target_columns]

# Heading printed for each correlation method, in display order
correlation_headings = {
    'pearson': "Pearson's correlation:",
    'spearman': "Spearman's rank correlation:",
    'kendall': "Kendall's Tau correlation:",
}

# Compute and print one correlation matrix per method, separated by a blank line
correlation_tables = {}
for position, (method, heading) in enumerate(correlation_headings.items()):
    if position:
        print()
    correlation_tables[method] = target_features.corr(method=method)
    print(heading)
    print(correlation_tables[method])

# Expose the individual matrices under their own names
pearson_correlation = correlation_tables['pearson']
spearman_correlation = correlation_tables['spearman']
kendall_correlation = correlation_tables['kendall']

#### Impute missing values within columns: repository language and description.

In [None]:
# Placeholder text used for each text column when the value is missing
placeholders = {
    'repo_language': 'No language',
    'repo_description': 'No description',
}
for column_name, placeholder in placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

#### Save to CSV file.

In [None]:
# Persist the imputed dataset (without the index column) for the later sections
df.to_csv(path_or_buf='./datasets/original_repos.csv', index=False)

## One Hot Encoding
* Using only the top 9 languages by frequency, and grouping all other languages into the 'Other' category
* Binary-encoded columns to represent the presence of a language (1 - in use, 0 - no use)

In [None]:
# Keep positive ages as they are; add 1 to every other (zero or negative) age
current_age = df['repo_age']
df['repo_age'] = current_age.where(current_age > 0, current_age + 1)

In [None]:
# Cast 'repo_fork' to integers; for boolean input this maps True -> 1 and False -> 0
df['repo_fork'] = df['repo_fork'].astype(int)

In [None]:
# Keep the 9 most frequent languages and group every other language as 'Other'
# (10 categories in total)
top_n = 9
top_languages = df['repo_language'].value_counts().nlargest(top_n).index
df['repo_language'] = df['repo_language'].where(df['repo_language'].isin(top_languages), 'Other')


# One-hot encoding for 'repo_language'
ohe = OneHotEncoder(sparse_output=False)
language_encoded = ohe.fit_transform(df[['repo_language']])
# Reuse df's index so the column-wise concat below lines rows up correctly even
# when df does not carry a default 0..n-1 index.
language_encoded_df = pd.DataFrame(
    language_encoded,
    columns=ohe.get_feature_names_out(['repo_language']),
    index=df.index,
)

# Drop the original 'repo_language' column and concatenate the one-hot encoded language dataframe
df_ohe = df.drop('repo_language', axis=1)
df_ohe = pd.concat([df_ohe, language_encoded_df], axis=1)

In [None]:
# Columns that must be strictly positive for a row to be kept
columns_to_check = ['repo_size', 'repo_age', 'repo_stargazers_count', 'repo_open_issues', 'repo_forks_count']

# Keep only the rows where every checked column is > 0. The rows are taken from
# df_ohe (not df) so the result really is the one-hot encoded frame its name promises.
positive_rows = (df_ohe[columns_to_check] > 0).all(axis=1)
filtered_ohe = df_ohe.loc[positive_rows]

In [None]:
# Set up the matplotlib figure (a 3x3 grid; 7 panels are used)
plt.figure(figsize=(15, 10))

numerical_columns = ['repo_size', 'repo_stargazers_count', 'repo_open_issues', 'repo_forks_count', 'repo_age']
categorical_columns = ['repo_fork', 'repo_language']

print(f"Maximum repo size: {df['repo_size'].max()}")
print(f"Maximum Star Count: {df['repo_stargazers_count'].max()}")
print(f"Maximum Issue Count: {df['repo_open_issues'].max()}")


# Plot the numerical columns of the filtered rows after a log1p transformation.
# 'filtered_ohe' is the filtered frame built in the previous cell; the name
# 'filtered_df' used here before was never defined.
for i, column in enumerate(numerical_columns, 1):
    transformed_column = np.log1p(filtered_ohe[column])  # Applying log1p transformation
    plt.subplot(3, 3, i)
    sns.histplot(transformed_column, kde=False, bins=30)
    plt.title(f'Log-transformed Distribution of {column}')

# Categorical columns can have many unique values, so only the 10 most frequent
# categories of each are plotted
for i, column in enumerate(categorical_columns, len(numerical_columns) + 1):
    plt.subplot(3, 3, i)
    top_categories = df[column].value_counts().head(10)
    sns.barplot(x=top_categories.index, y=top_categories)
    plt.title(f'Top categories of {column}')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

#### Top Repositories by Star Count

In [None]:
# The 10 repositories with the highest star count
ranked_by_stars = df.sort_values(by='repo_stargazers_count', ascending=False)
ranked_by_stars.iloc[:10]

#### Top Repositories by Size

In [None]:
# The 10 largest repositories by size
ranked_by_size = df.sort_values(by='repo_size', ascending=False)
ranked_by_size.iloc[:10]

#### Top Repositories by Issue Count

In [None]:
# The 10 repositories with the most open issues
ranked_by_issues = df.sort_values(by='repo_open_issues', ascending=False)
ranked_by_issues.iloc[:10]

#### Top Repositories by Fork Count

In [None]:
# The 10 repositories with the highest fork count
ranked_by_forks = df.sort_values(by='repo_forks_count', ascending=False)
ranked_by_forks.iloc[:10]

#### Percentage Distribution of Forked Repos

In [None]:
# Share of each 'repo_fork' value as a fraction of all rows, scaled to percent
fork_share = df['repo_fork'].value_counts(normalize=True)
percentage_distribution = fork_share * 100

# Show the resulting percentages
print(percentage_distribution)

##### Before we decide to keep/drop this column, let's observe the characteristics of forked vs unforked repos:

In [None]:
# Split the repositories by their fork flag
is_fork = df['repo_fork'] == True
is_not_fork = df['repo_fork'] == False
forked_repos = df[is_fork]
non_forked_repos = df[is_not_fork]

# Print summary statistics for each group under its own heading
summaries = (
    ("Forked Repositories:", forked_repos),
    ("\nNon-Forked Repositories:", non_forked_repos),
)
for heading, subset in summaries:
    print(heading)
    print(subset.describe())

#### Distribution of Star Count (Forked vs Non-Forked)

In [None]:
# Side-by-side histograms of the star count for forked and non-forked repos
plt.figure(figsize=(10, 5))

# (data, bar colour, panel title, whether the panel gets a y-axis label)
panels = [
    (forked_repos, 'blue', 'Forked Repos - Star Count', True),
    (non_forked_repos, 'green', 'Non-Forked Repos - Star Count', False),
]
for position, (subset, colour, heading, label_y) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(subset['repo_stargazers_count'], bins=50, color=colour, alpha=0.7)
    plt.title(heading)
    plt.xlabel('Star Count')
    if label_y:
        plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

#### Displaying all Languages Used:
* Languages at or below 2% will be grouped into 'Other' for visualization purposes.
* Languages within the 'Other' group will be displayed in a separate graphic.

In [None]:
# Count the occurrences of each language and calculate the percentage
language_counts = df['repo_language'].value_counts(normalize=True) * 100

# Identify languages to be grouped into 'Other'
others = language_counts[language_counts <= 2]
# Sum their percentages
others_percentage = others.sum()

# Languages above the threshold keep their own slice
main_languages = language_counts[language_counts > 2].copy()

# Fold the small languages into a single 'Other' slice. If the data already
# contains an 'Other' category above the threshold, add to it rather than
# overwrite it, so no share is lost.
if not others.empty:
    main_languages['Other'] = main_languages.get('Other', 0) + others_percentage

plt.figure(figsize=(10, 8))
plt.pie(main_languages, labels=main_languages.index, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Ensure pie chart is circular
plt.title('Distribution of Repository Languages')
plt.show()

# Convert 'others' Series to DataFrame for a nicer display
others_df = others.reset_index()
others_df.columns = ['Language', 'Percentage']

# Display the DataFrame
display(others_df)

### Feature Selection: One-Hot Encoding + LASSO Regression

* **Why One-Hot Encoding?** Converts categorical variables into a form that could be provided to ML algorithms to do a better job in prediction by creating dummy variables that indicate the presence of an attribute.
* **Why LASSO Regression?** Adds an L1 penalty that shrinks the model's coefficients toward zero. The coefficients of less important features are driven exactly to zero, which effectively removes those features from the equation and makes LASSO useful for feature selection.
* This method assists in preserving important features and minimizing/discarding unnecessary features.

In [None]:
# Rows read per chunk; tune this to the memory available on your system
chunk_size = 10000

# Read the saved dataset chunk by chunk and keep every chunk
chunks = list(pd.read_csv('./datasets/original_repos.csv', chunksize=chunk_size))

# Stitch the chunks together into the full DataFrame with a fresh index
df_orig = pd.concat(chunks, ignore_index=True)

In [None]:
# Cast 'repo_fork' to integers; for boolean input this maps True -> 1 and False -> 0
df_orig['repo_fork'] = df_orig['repo_fork'].astype(int)

In [None]:
# Keep the 9 most frequent languages and group every other language as 'Other'
# (10 categories in total)
top_n = 9
top_languages = df_orig['repo_language'].value_counts().nlargest(top_n).index
df_orig['repo_language'] = df_orig['repo_language'].where(
    df_orig['repo_language'].isin(top_languages), 'Other'
)

# Creating a copy for the non-one-hot encoded version
df_non_ohe = df_orig.copy()

# One-hot encoding for 'repo_language'
ohe = OneHotEncoder(sparse_output=False)
language_encoded = ohe.fit_transform(df_orig[['repo_language']])
# Reuse df_orig's index so the column-wise concat below lines rows up correctly
# even when df_orig does not carry a default 0..n-1 index.
language_encoded_df = pd.DataFrame(
    language_encoded,
    columns=ohe.get_feature_names_out(['repo_language']),
    index=df_orig.index,
)

# Drop the original 'repo_language' column and concatenate the one-hot encoded language dataframe
df_ohe = df_orig.drop('repo_language', axis=1)
df_ohe = pd.concat([df_ohe, language_encoded_df], axis=1)

In [None]:
# Preview the first five rows of the one-hot encoded frame
df_ohe.head(5)

### Feature Selection Method

In [None]:
def feature_selection(X, y, target_name, model_params=None, classification=False):
    """Rank features with a random forest and report the ones it selects.

    A random forest is fitted on ``X``/``y``; its feature importances are
    printed and shown as a bar chart, and the features kept by
    ``SelectFromModel`` (with its default threshold) are printed.

    Args:
        X: DataFrame of candidate features; its column names label the output.
        y: Target values with one entry per row of ``X``.
        target_name: Name of the target, used only in printed text and titles.
        model_params: Keyword arguments for the forest. Defaults to
            ``{'n_estimators': 50, 'random_state': 42}``.
        classification: Use ``RandomForestClassifier`` when True, otherwise
            ``RandomForestRegressor``.

    Returns:
        The column labels of ``X`` selected by ``SelectFromModel``, so callers
        can reuse the selection instead of reading it off the printed output.
    """
    if model_params is None:
        model_params = {'n_estimators': 50, 'random_state': 42} # n_estimators = 100 for other instances, set to 50 for target encoding
    if classification:
        model = RandomForestClassifier(**model_params)
    else:
        model = RandomForestRegressor(**model_params)
    model.fit(X, y)

    # Importances sorted from most to least important, labelled by column name
    feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    print(f"Feature importances for predicting {target_name}:\n{feature_importances}\n")
    feature_importances.plot(kind='bar')
    plt.title(f"Feature Importances for {target_name}")
    plt.ylabel('Importance')
    plt.xlabel('Features')
    plt.show()

    # prefit=True reuses the forest fitted above instead of fitting a new one
    selector = SelectFromModel(model, prefit=True)
    selected_features = X.columns[selector.get_support()]
    print(f"Selected features for predicting {target_name}:", selected_features)
    return selected_features

In [None]:
# df_orig is the reloaded dataset and df_ohe its one-hot encoded counterpart
normalized_features = ['repo_stargazers_count', 'repo_open_issues', 'repo_forks_count']

# 'repo_age' is measured in days, so convert it to years before normalising
DAYS_PER_YEAR = 365.25

# Keep only rows with a usable, non-zero age. The same rows are kept in BOTH
# frames; filtering df_orig alone would leave NaN targets in df_ohe for the
# dropped rows once the columns below are aligned on the index.
has_age = df_orig['repo_age'].notna() & (df_orig['repo_age'] != 0)
df_orig = df_orig[has_age]
df_ohe = df_ohe.loc[df_orig.index].copy()

age_in_years = df_orig['repo_age'] / DAYS_PER_YEAR
for feature in normalized_features:
    df_ohe[f"{feature}_per_year"] = df_orig[feature] / age_in_years

# Drop the identifier/text columns, the raw counts that were just normalised,
# and the raw timestamp columns: the timestamps are strings after the CSV
# round-trip, cannot be fed to the estimators, and are summarised by 'repo_age'.
columns_to_drop = (
    ['repo_full_name', 'repo_description', 'repo_created_at', 'repo_updated_at']
    + normalized_features
)
df_modified = df_ohe.drop(columns=columns_to_drop, errors='ignore')

normalized_targets = [f"{feature}_per_year" for feature in normalized_features]

In [None]:
# Discard the rows whose age is exactly zero, then preview the result
nonzero_age = df_ohe['repo_age'].ne(0)
df_ohe = df_ohe[nonzero_age]
df_ohe.head(5)

In [None]:
for target in normalized_targets:
    print(f"Feature selection for: {target}")
    # Use every column except the normalized targets (the current one included)
    # as features; 'repo_age' therefore stays in the feature set.
    X_columns = [col for col in df_modified.columns if col not in normalized_targets]
    print(X_columns)

    # Take X and y from the same frame and the same rows. Dropping missing
    # targets from y alone would leave X with more rows than y.
    valid_rows = df_modified[target].notna()
    X = df_modified.loc[valid_rows, X_columns]
    y = df_modified.loc[valid_rows, target]
    feature_selection(X, y, target)

## LASSO Regression

In [None]:
# Sanity check before modelling: preview the first five rows and their columns
df_modified.head(5)

In [None]:
# Rebuild the target names from the raw count columns
normalized_features = ['repo_stargazers_count', 'repo_open_issues', 'repo_forks_count']
normalized_targets = [feature + "_per_year" for feature in normalized_features]

# The feature matrix is df_modified without any of the target columns
X = df_modified.drop(columns=normalized_targets)

In [None]:
# Preview the first five rows of the modelling frame
df_modified.head(5)

In [None]:
# Debugging aid: show the column of the first normalized target
first_target = normalized_targets[0]
df_modified[first_target]

In [None]:
# 1. Scale X once, outside the loop, since it is shared by every target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dictionary to store optimal alphas and selected strong features
optimal_alphas = {}
selected_features_per_target = {}

for normalized_target in normalized_targets:
    y = df_modified[normalized_target]

    # 2. Split the data into training and testing sets
    X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 3. LassoCV for automatic alpha selection, using 5-fold cross-validation
    lasso_cv = LassoCV(cv=5, random_state=42, max_iter=10000)
    lasso_cv.fit(X_train_scaled, y_train)

    # mse_path_ has one row per alpha and one column per fold, so averaging
    # over axis=1 gives the mean (and spread) of the CV error for each alpha
    cv_means = lasso_cv.mse_path_.mean(axis=1)
    cv_stds = lasso_cv.mse_path_.std(axis=1)
    min_error = np.min(cv_means)
    min_error_std = cv_stds[np.argmin(cv_means)]

    # Select the largest alpha with mean error within one std of the minimum.
    # alphas_ is ordered from largest to smallest, so taking the last eligible
    # entry would pick the SMALLEST alpha; max() is correct regardless of order.
    eligible_alphas = lasso_cv.alphas_[cv_means <= min_error + min_error_std]
    target_alpha = eligible_alphas.max()

    optimal_alphas[normalized_target] = target_alpha

    # Fit Lasso model with the selected alpha on the training data
    lasso = Lasso(alpha=target_alpha, max_iter=10000, random_state=42)
    lasso.fit(X_train_scaled, y_train)

    # 4. Determine a threshold for selecting features, e.g., the 75th percentile of the absolute coefficients
    coefs = lasso.coef_
    threshold = np.percentile(np.abs(coefs), 75)  # Adjust the percentile as needed

    # Select features where the absolute coefficient is above the threshold
    strong_features = X.columns[np.abs(coefs) > threshold].tolist()

    # Store the selected strong features
    selected_features_per_target[normalized_target] = strong_features

In [None]:
# Report the alpha chosen for every target
print("Optimal Alphas:")
for target in optimal_alphas:
    print(f"Optimal alpha for {target}: {optimal_alphas[target]}")

# Report the features that passed the coefficient threshold for every target
print("\nSelected Strong Features:")
for target in selected_features_per_target:
    print(f"Selected strong features for {target}: {selected_features_per_target[target]}")