### Code

#### Import Necessary Packages

In [2]:
%pip install --q pandas
%pip install --q numpy
%pip install --q matplotlib
%pip install --q seaborn
%pip install --q scikit-learn
%pip install --q xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


#### Load Dataset

In [19]:
# Raw inputs, read from paths relative to the notebook's working directory
projects_df = pd.read_csv('Datasets/projects.csv')
outcomes_df = pd.read_csv('Datasets/outcomes.csv')

# Columns removed right after the merge.
# NOTE(review): 'school_metro' is removed here but is still listed in
# columns_to_encode in the preprocessing cell, where it gets skipped.
columns_to_drop = [
    'school_ncesid', 'school_latitude', 'school_longitude', 'school_city',
    'school_state', 'school_metro', 'school_district', 'school_county',
    'total_price_including_optional_support', 'teacher_prefix',
    'teacher_teach_for_america','teacher_ny_teaching_fellow',
    'secondary_focus_subject', 'secondary_focus_area', 'primary_focus_subject',
    'fulfillment_labor_materials'
    ]

# Attach the fully_funded label to each project (inner join on projectid, so
# only projects present in both files survive), then discard unused columns.
df = (
    projects_df
    .merge(outcomes_df[['projectid', 'fully_funded']], on='projectid', how='inner')
    .drop(columns=columns_to_drop)
)

# Quick look at what was loaded
print("Dataset shape:", df.shape)
print(f"\nColumn names ({len(df.columns)}):")
print(df.columns.tolist())

# Transposed so the first three records read top-to-bottom
print("\nFirst few rows:")
df.head(3).T

Dataset shape: (619326, 20)

Column names (20):
['projectid', 'teacher_acctid', 'schoolid', 'school_zip', 'school_charter', 'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp', 'school_charter_ready_promise', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_excluding_optional_support', 'students_reached', 'eligible_double_your_impact_match', 'eligible_almost_home_match', 'date_posted', 'fully_funded']

First few rows:


Unnamed: 0,0,1,2
projectid,62526d85d2a1818432d03d600969e99c,33d59ac771b80222ad63ef0f4ac47ade,1a3aaeffc56dd2a421e37d8298024c0a
teacher_acctid,ebc7c90b6c92a069432e0714b8d93dfd,de83b4c1f6428a15032c207c1d5e572a,f4c9ed095b85458dcf858e25f203af00
schoolid,5aca9711ff0e4b37db48701f46f73036,d91a805b213bf74ae77b94e0de2b73ad,9310d3eb447a4e46bc5fc31ed007ceac
school_zip,60103.0,83402.0,3038.0
school_charter,f,f,f
school_magnet,f,f,f
school_year_round,f,f,f
school_nlns,f,f,f
school_kipp,f,f,f
school_charter_ready_promise,f,f,f


#### Initial EDA

In [20]:
# Initial data exploration: every section prints a heading, then its summary

# Count of missing entries in each column
print("Missing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Storage type pandas assigned to each column
print("\nData types:")
column_dtypes = df.dtypes
print(column_dtypes)

# normalize=True reports each class as a share of rows rather than a raw count
print("\nTarget variable distribution:")
target_shares = df['fully_funded'].value_counts(normalize=True)
print(target_shares)

# describe() with default arguments summarises the numeric columns
print("\nNumerical features statistics:")
numeric_summary = df.describe()
print(numeric_summary)

Missing values per column:
projectid                                   0
teacher_acctid                              0
schoolid                                    0
school_zip                                  4
school_charter                              0
school_magnet                               0
school_year_round                           0
school_nlns                                 0
school_kipp                                 0
school_charter_ready_promise                0
primary_focus_area                         36
resource_type                              43
poverty_level                               0
grade_level                                 7
total_price_excluding_optional_support      0
students_reached                          144
eligible_double_your_impact_match           0
eligible_almost_home_match                  0
date_posted                                 0
fully_funded                                0
dtype: int64

Data types:
projectid                  

#### Data Preprocessing

In [21]:
def preprocess_data(df, columns_to_encode):
    """
    Preprocess data with one-hot encoding for specified columns.

    Steps, in order:
      1. Copy ``df`` so the caller's frame is left untouched.
      2. Drop every row that has a missing value in any column.
      3. One-hot encode each requested column (first level dropped, dummies
         stored as integers, original column removed).
      4. Map the 't'/'f' flag columns that are present to 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_encode : iterable of str
        Names of categorical columns to one-hot encode. A requested column
        that is not in the frame is skipped and a notice is printed, so a
        stale or misspelled name does not go unnoticed.

    Returns
    -------
    pandas.DataFrame
        The processed copy. Dummy columns are appended on the right, in the
        order the columns were requested.
    """
    # Create a copy to avoid modifying original dataframe
    processed_df = df.copy()

    # Drop all rows with missing values (TODO reconsider, but I think we have enough data to do this)
    processed_df = processed_df.dropna()

    # One-hot encode specified columns
    for column in columns_to_encode:
        if column not in processed_df.columns:
            # Report the skip instead of silently ignoring the request
            print(f"Skipped {column}: column not found in dataframe")
            continue

        # drop_first=True leaves out the first level, which becomes the
        # implicit baseline category
        dummies = pd.get_dummies(
            processed_df[column],
            prefix=column.lower().replace(' ', '_'),
            drop_first=True
        ).astype(int)

        # Replace the original column with its encoded columns
        processed_df = pd.concat(
            [processed_df.drop(column, axis=1), dummies], axis=1
        )

        print(f"Encoded {column} into {dummies.shape[1]} dummy variables")

    # Flag columns stored as the strings 't' / 'f'
    bool_columns = [
        'fully_funded',
        'school_charter',
        'school_magnet',
        'school_year_round',
        'school_nlns',
        'school_kipp',
        'school_charter_ready_promise',
        'teacher_teach_for_america',
        'teacher_ny_teaching_fellow',
        'eligible_double_your_impact_match',
        'eligible_almost_home_match'
    ]

    for col in bool_columns:
        if col in processed_df.columns:
            # Series.map turns any value other than 't' / 'f' into NaN
            processed_df[col] = processed_df[col].map({'t': 1, 'f': 0})

    return processed_df

# Apply preprocessing with specific columns
try:
    print("Starting preprocessing...")

    # Categorical columns to one-hot encode.
    # NOTE(review): 'school_metro' is dropped from df in the load cell, so it
    # is not actually encoded here -- either stop dropping it or remove it
    # from this list.
    columns_to_encode = [
        'primary_focus_area',
        'school_metro',
        'grade_level',
        'poverty_level',
        'resource_type'
    ]

    # Process the data (df itself is left unchanged)
    processed_df = preprocess_data(df, columns_to_encode)

    # Display sample of processed data
    print("\nSample of processed data (first 5 rows):")
    display(processed_df.head())

    # Show shape before and after
    print(f"\nOriginal shape: {df.shape}")
    print(f"Processed shape: {processed_df.shape}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    # Re-raise: every later cell needs processed_df, so continuing after a
    # failure would either hit a NameError or silently reuse a stale frame
    # left in the kernel by an earlier run.
    raise

Starting preprocessing...
Encoded primary_focus_area into 6 dummy variables
Encoded grade_level into 3 dummy variables
Encoded poverty_level into 3 dummy variables
Encoded resource_type into 5 dummy variables

Sample of processed data (first 5 rows):


Unnamed: 0,projectid,teacher_acctid,schoolid,school_zip,school_charter,school_magnet,school_year_round,school_nlns,school_kipp,school_charter_ready_promise,...,grade_level_Grades 9-12,grade_level_Grades PreK-2,poverty_level_highest poverty,poverty_level_low poverty,poverty_level_moderate poverty,resource_type_Other,resource_type_Supplies,resource_type_Technology,resource_type_Trips,resource_type_Visitors
0,62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,60103.0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1,33d59ac771b80222ad63ef0f4ac47ade,de83b4c1f6428a15032c207c1d5e572a,d91a805b213bf74ae77b94e0de2b73ad,83402.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,3038.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,33aa19ee4da4c5adf47d0dfb84fab5ef,17768031eb40de8d4497dbb54df48742,9ac70da58322783f82152eecc140a812,23224.0,0,0,0,0,0,0,...,0,1,1,0,0,1,0,0,0,0
4,e31c0ea8b68f404699dfb0d39e9bc99b,0f1bc5b4700fd33383be104442660178,cb9f688cf59e3ee22a087d616ca8f5d7,60613.0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0



Original shape: (619326, 20)
Processed shape: (619145, 33)


In [6]:
# # TODO learn feature selection

# df_heatmap = processed_df.drop(columns=['school_ncesid', 'school_latitude', 'school_longitude'])
# df_heatmap = df_heatmap.drop(columns=['total_price_excluding_optional_support', 'total_price_including_optional_support',
#                                       'fulfillment_labor_materials'])

# # Create correlation heatmap
# plt.figure(figsize=(15, 10))

# # Select numerical columns and convert 't'/'f' to 1/0 for fully_funded
# numerical_df = df_heatmap.select_dtypes(include=['int64', 'float64']).copy()

# # Create correlation matrix
# corr_matrix = numerical_df.corr()

# mask = np.tril(np.ones_like(corr_matrix, dtype=bool), k=-1)

# # Create heatmap
# sns.heatmap(
#     corr_matrix, mask=mask, #annot=True,fmt='.2f',
#     cmap='PuOr', center=0, square=True
#     )

# plt.title('Correlation Heatmap of Numerical Features')
# plt.tight_layout()
# plt.show()

#### Feature Selection and Train/Test Split

In [22]:
# Target vector
y = processed_df['fully_funded']

# Candidate features: everything except the target and the three ID columns,
# keeping only columns whose dtype is neither object nor bool
X = (
    processed_df
    .drop(columns=['fully_funded', 'projectid', 'teacher_acctid', 'schoolid'])
    .select_dtypes(exclude=['object', 'bool'])
)

# Stratified split: 100000 training rows, 20% of all rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=100000,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Score features with the ANOVA F-test on the training rows only, keep the
# 20 best, and apply the same column mask to the test rows
selector = SelectKBest(score_func=f_classif, k=20)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Names of the columns that survived selection
selected_features = X.columns[selector.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['school_zip', 'school_charter', 'school_magnet', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise',
       'total_price_excluding_optional_support',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'primary_focus_area_Literacy & Language',
       'primary_focus_area_Math & Science',
       'primary_focus_area_Music & The Arts',
       'primary_focus_area_Special Needs', 'grade_level_Grades 9-12',
       'poverty_level_highest poverty', 'poverty_level_low poverty',
       'poverty_level_moderate poverty', 'resource_type_Supplies',
       'resource_type_Technology', 'resource_type_Trips'],
      dtype='object')


In [23]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """
    Fit ``model`` on the training split and print its test-split metrics.

    Prints a classification report, a confusion matrix and the ROC AUC score.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels used for the metrics

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)

    # Hard labels feed the report and confusion matrix; the second
    # predict_proba column is what ROC AUC is computed from.
    predicted_labels = model.predict(X_test)
    second_column_scores = model.predict_proba(X_test)[:, 1]

    # Text before the first '(' of the estimator's string form
    model_label = str(model).split('(')[0]

    print(f"Classification Report for {model_label} model:")
    report_text = classification_report(y_test, predicted_labels)
    print(report_text)

    print("\nConfusion Matrix:")
    matrix = confusion_matrix(y_test, predicted_labels)
    print(matrix)

    auc_value = roc_auc_score(y_test, second_column_scores)
    print("\nROC AUC Score:", auc_value)

    return model

#### Train and Evaluate Models

In [24]:
# Train and evaluate different models
models = {
    'Logistic Regression': LogisticRegression(max_iter=100000),
    'Naive Bayes': GaussianNB(),
    # The boosted models are seeded (same seed as the train/test split) so a
    # restart-and-run-all reproduces the same fitted models and scores.
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}

# Every model is fitted on, and evaluated against, the same selected features
for name, model in models.items():
    print(f"\n{name} Results:")
    train_and_evaluate_model(model, X_train_selected, X_test_selected, y_train, y_test)

# Size and class balance of the training split ([count of 0s, count of 1s])
print(f"\nBefore SMOTE - Training examples: {len(X_train_selected)}")
print(f"Class distribution: {np.bincount(y_train)}")

# TODO undersample majority class


Logistic Regression Results:
Classification Report for LogisticRegression model:
              precision    recall  f1-score   support

           0       0.58      0.08      0.14     37715
           1       0.71      0.97      0.82     86114

    accuracy                           0.70    123829
   macro avg       0.64      0.53      0.48    123829
weighted avg       0.67      0.70      0.61    123829


Confusion Matrix:
[[ 2983 34732]
 [ 2167 83947]]

ROC AUC Score: 0.6626875967094308

Naive Bayes Results:
Classification Report for GaussianNB model:
              precision    recall  f1-score   support

           0       0.56      0.04      0.07     37715
           1       0.70      0.99      0.82     86114

    accuracy                           0.70    123829
   macro avg       0.63      0.51      0.44    123829
weighted avg       0.66      0.70      0.59    123829


Confusion Matrix:
[[ 1349 36366]
 [ 1075 85039]]

ROC AUC Score: 0.6209736672559177

Gradient Boosting Results:


#### Identify high-risk projects

In [11]:
def identify_high_risk_projects(model, X, test_indices, original_df, threshold=0.1):
    """
    Return the projects with the lowest predicted funding probability.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
        The second probability column is used as the funding probability.
    X : array-like
        Feature rows to score; row i must correspond to ``test_indices[i]``.
    test_indices : pandas.Index or array-like
        Index labels into ``original_df``, one per row of ``X``.
    original_df : pandas.DataFrame
        Frame holding a 'projectid' column, addressed by those labels.
    threshold : float, default 0.1
        Fraction of the scored rows to flag (0.1 = bottom 10%). This is a
        share of rows, not a probability cut-off. The row count is
        ``int(len(X) * threshold)``, i.e. truncated.

    Returns
    -------
    pandas.DataFrame
        Columns 'projectid' and 'funding_probability', sorted ascending by
        probability (most at-risk first) and indexed by the original labels.

    Raises
    ------
    ValueError
        If ``threshold`` is not within [0, 1]. A negative value used to slice
        from the wrong end and silently return the wrong rows.
    """
    if not 0 <= threshold <= 1:
        raise ValueError(
            f"threshold must be a fraction between 0 and 1, got {threshold!r}"
        )

    # Get probability predictions
    proba = model.predict_proba(X)[:, 1]

    # Number of rows that make up the requested bottom fraction
    n_high_risk = int(len(proba) * threshold)

    # Stable sort: tied probabilities keep their row order, so both the
    # selection at the cut-off and the output order are reproducible.
    high_risk_indices = np.argsort(proba, kind='stable')[:n_high_risk]

    # Map positions back to original index labels; np.asarray lets a plain
    # list work as well as a pandas Index.
    original_indices = np.asarray(test_indices)[high_risk_indices]

    # Rows are already in ascending probability order, so no second sort is needed
    high_risk_df = pd.DataFrame({
        'projectid': original_df.loc[original_indices, 'projectid'],
        'funding_probability': proba[high_risk_indices]
    })

    return high_risk_df

# XGBoost is hard-coded as the model used for risk scoring
best_model = models['XGBoost']

# Index labels of the test rows, used to look project IDs up in processed_df
test_indices = y_test.index

# Score the test rows and keep the lowest-probability share (default threshold)
high_risk_df = identify_high_risk_projects(
    best_model, X_test_selected, test_indices, processed_df
)

# Summary line, then the table renumbered from 0
print(f"Identified {len(high_risk_df)} high-risk projects")
print("\nMost at-risk projects (lowest funding probability):")
high_risk_df = high_risk_df.reset_index(drop=True)
high_risk_df

Identified 8128 high-risk projects

Most at-risk projects (lowest funding probability):


Unnamed: 0,projectid,funding_probability
0,c9f5ac689d60d4c8e23efcdf6a138e89,0.057517
1,8113ac9993b2e4d771d7f962fd484976,0.075700
2,fdddaab9d1fac3ff9f32f783bd4e377e,0.102794
3,d0bc01ce2d1967fb3189cc7b243bcc65,0.108868
4,10d86601dbe0d0a411840971c5a0fdcd,0.111939
...,...,...
8123,4c7d72655bb1fa04083ba7dbcdf9f0d1,0.479142
8124,3575c22cecf77f78020afa2ea2482933,0.479145
8125,ce50a938dea441998954a79b6d85aa53,0.479156
8126,966a6d2198257a551ec1492d423d7ad1,0.479167


In [None]:
# TODO upload a day's worth of uploads and run this model
# TODO cross validation