### Code

#### Import Necessary Packages

In [1]:
%pip install --q pandas
%pip install --q numpy
%pip install --q matplotlib
%pip install --q seaborn
%pip install --q scikit-learn
%pip install --q xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


#### Load Dataset

In [4]:
projects_df = pd.read_csv('Datasets/projects.csv')
outcomes_df = pd.read_csv('Datasets/outcomes.csv')

# Merge datasets on project ID
df = pd.merge(projects_df, outcomes_df[['projectid', 'fully_funded']], on='projectid', how='inner')

# Display basic information
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()

Dataset shape: (619326, 36)

First few rows:


Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,...,poverty_level,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted,fully_funded
0,62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,suburban,...,moderate poverty,Grades 3-5,30.0,444.36,522.78,7.0,f,f,2013-12-31,t
1,33d59ac771b80222ad63ef0f4ac47ade,de83b4c1f6428a15032c207c1d5e572a,d91a805b213bf74ae77b94e0de2b73ad,160153000000.0,43.501154,-112.05678,Idaho Falls,ID,83402.0,urban,...,high poverty,Grades 3-5,30.0,233.24,274.4,30.0,f,f,2013-12-31,f
2,1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,330261000000.0,42.888244,-71.320224,Derry,NH,3038.0,suburban,...,moderate poverty,Grades 6-8,30.0,285.09,335.4,230.0,f,f,2013-12-31,f
3,33aa19ee4da4c5adf47d0dfb84fab5ef,17768031eb40de8d4497dbb54df48742,9ac70da58322783f82152eecc140a812,510324000000.0,37.476158,-77.488397,Richmond,VA,23224.0,urban,...,highest poverty,Grades PreK-2,30.0,232.94,274.05,18.0,f,f,2013-12-31,f
4,e31c0ea8b68f404699dfb0d39e9bc99b,0f1bc5b4700fd33383be104442660178,cb9f688cf59e3ee22a087d616ca8f5d7,170993000000.0,41.952851,-87.650233,Chicago,IL,60613.0,urban,...,highest poverty,Grades 6-8,30.0,513.41,604.01,70.0,t,f,2013-12-31,t


#### Initial EDA

In [5]:
# Initial data exploration
print("Missing values per column:")
print(df.isnull().sum())
print("\nData types:")
print(df.dtypes)

print("\nTarget variable distribution:")
print(df['fully_funded'].value_counts(normalize=True))

# Basic statistics
print("\nNumerical features statistics:")
print(df.describe())

Missing values per column:
projectid                                      0
teacher_acctid                                 0
schoolid                                       0
school_ncesid                              38848
school_latitude                                0
school_longitude                               0
school_city                                    0
school_state                                   0
school_zip                                     4
school_metro                               75488
school_district                              922
school_county                                 17
school_charter                                 0
school_magnet                                  0
school_year_round                              0
school_nlns                                    0
school_kipp                                    0
school_charter_ready_promise                   0
teacher_prefix                                 0
teacher_teach_for_america                 

#### Data Preprocessing

In [6]:
def preprocess_data(df, columns_to_encode):
    """
    Preprocess data with one-hot encoding for specified columns
    """    
    # Create a copy to avoid modifying original dataframe
    processed_df = df.copy()

    # Drop all rows with missing values (TODO reconsider, but I think we have enough data to do this)
    processed_df = processed_df.dropna()
    
    # One-hot encode specified columns
    for column in columns_to_encode:
        if column in processed_df.columns:
            # Create dummy variables
            dummies = pd.get_dummies(
                processed_df[column], 
                prefix=column.lower().replace(' ', '_'),
                drop_first=True
            ).astype(int)
            
            # Drop original column and add encoded columns
            processed_df = processed_df.drop(column, axis=1)
            processed_df = pd.concat([processed_df, dummies], axis=1)
            
            print(f"Encoded {column} into {dummies.shape[1]} dummy variables")
    
    bool_columns = [
        'fully_funded',
        'school_charter',
        'school_magnet',
        'school_year_round',
        'school_nlns',
        'school_kipp',
        'school_charter_ready_promise',
        'teacher_teach_for_america',
        'teacher_ny_teaching_fellow',
        'eligible_double_your_impact_match',
        'eligible_almost_home_match',
        'great_chat',
        'at_least_1_teacher_referred_donor',
        'at_least_1_green_donation',
        'three_or_more_non_teacher_referred_donors',
        'one_non_teacher_referred_donor_giving_100_plus',
        'donation_from_thoughtful_donor'
    ]
    
    for col in bool_columns:
        if col in processed_df.columns:
            processed_df[col] = processed_df[col].map({'t': 1, 'f': 0})

    return processed_df

# Apply preprocessing with specific columns
try:
    print("Starting preprocessing...")
    
    columns_to_encode = [
        'primary_focus_area',
        'school_metro',
        'grade_level',
        'poverty_level',
        'resource_type'
    ]
    
    # Process the data
    processed_df = preprocess_data(df, columns_to_encode)
    
    # Display sample of processed data
    print("\nSample of processed data (first 5 rows):")
    display(processed_df.head())
    
    # Show shape before and after
    print(f"\nOriginal shape: {df.shape}")
    print(f"Processed shape: {processed_df.shape}")
    
except Exception as e:
    print(f"An error occurred: {str(e)}")

Starting preprocessing...
Encoded primary_focus_area into 6 dummy variables
Encoded school_metro into 2 dummy variables
Encoded grade_level into 3 dummy variables
Encoded poverty_level into 3 dummy variables
Encoded resource_type into 5 dummy variables

Sample of processed data (first 5 rows):


Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_district,...,grade_level_Grades 9-12,grade_level_Grades PreK-2,poverty_level_highest poverty,poverty_level_low poverty,poverty_level_moderate poverty,resource_type_Other,resource_type_Supplies,resource_type_Technology,resource_type_Trips,resource_type_Visitors
0,62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,Elgin School District U-46,...,0,0,0,0,1,1,0,0,0,0
2,1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,330261000000.0,42.888244,-71.320224,Derry,NH,3038.0,School Administrative Unit 10,...,0,0,0,0,1,0,0,1,0,0
6,a4b234feb2b72921ed59850d5c873d62,620982375045fa11d872702f26ab98bb,da1985df161ba5c3842fc99579b6e4b1,210299000000.0,38.249919,-85.72231,Louisville,KY,40206.0,Jefferson Co School District,...,0,1,1,0,0,0,0,1,0,0
7,0ff5dec32bf793243a8b0b2c023a81f0,ec5b110df87bd511b508961676d08b6c,72e2b0cb2eecfdc37e67e0eaf10da07b,120198000000.0,30.507978,-86.132003,Freeport,FL,32439.0,Walton Co School District,...,0,1,1,0,0,0,0,1,0,0
9,72d58b8e22adbe6e6aab65d8b2ab4973,0793851e2f40ecc28e40513622aea670,a180e99ef60be5a9c5f96e21f9868f13,560609000000.0,44.097156,-104.622147,Upton,WY,82730.0,Weston Co School District 7,...,0,1,0,0,1,0,1,0,0,0



Original shape: (619326, 36)
Processed shape: (340210, 50)


In [5]:
# # TODO learn feature selection

# df_heatmap = processed_df.drop(columns=['school_ncesid', 'school_latitude', 'school_longitude'])
# df_heatmap = df_heatmap.drop(columns=['total_price_excluding_optional_support', 'total_price_including_optional_support',
#                                       'fulfillment_labor_materials'])

# # Create correlation heatmap
# plt.figure(figsize=(15, 10))

# # Select numerical columns and convert 't'/'f' to 1/0 for fully_funded
# numerical_df = df_heatmap.select_dtypes(include=['int64', 'float64']).copy()

# # Create correlation matrix
# corr_matrix = numerical_df.corr()

# mask = np.tril(np.ones_like(corr_matrix, dtype=bool), k=-1)

# # Create heatmap
# sns.heatmap(
#     corr_matrix, mask=mask, #annot=True,fmt='.2f',
#     cmap='PuOr', center=0, square=True
#     )

# plt.title('Correlation Heatmap of Numerical Features')
# plt.tight_layout()
# plt.show()

#### Feature Selection and Train/Test

In [7]:
X = processed_df.drop(['fully_funded','projectid','teacher_acctid','schoolid','school_ncesid'], axis=1)

X = X.select_dtypes(exclude=['object', 'bool'])
y = processed_df['fully_funded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=100000, test_size=0.2,
    random_state=42, stratify=y
    )

# Feature selection
selector = SelectKBest(f_classif, k=20)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['school_latitude', 'school_charter', 'school_magnet', 'school_nlns',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
       'total_price_excluding_optional_support',
       'total_price_including_optional_support',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'primary_focus_area_Literacy & Language',
       'primary_focus_area_Math & Science',
       'primary_focus_area_Music & The Arts', 'school_metro_suburban',
       'school_metro_urban', 'grade_level_Grades 9-12',
       'poverty_level_highest poverty', 'poverty_level_moderate poverty',
       'resource_type_Supplies', 'resource_type_Technology'],
      dtype='object')


In [None]:
tscv = TImeSeriesSplit(n_splits=5)

NameError: name 'TImeSeriesSplit' is not defined

In [8]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Evaluate model
    print(f"Classification Report for {str(model).split('(')[0]} model:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
    
    return model

#### Run

In [9]:
# Train and evaluate different models
models = {
    'Logistic Regression': LogisticRegression(max_iter=100000),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier()
}

for name, model in models.items():
    print(f"\n{name} Results:")
    train_and_evaluate_model(model, X_train_selected, X_test_selected, y_train, y_test)


Logistic Regression Results:
Classification Report for LogisticRegression model:
              precision    recall  f1-score   support

           0       0.59      0.11      0.19     20436
           1       0.72      0.97      0.82     47606

    accuracy                           0.71     68042
   macro avg       0.65      0.54      0.51     68042
weighted avg       0.68      0.71      0.63     68042


Confusion Matrix:
[[ 2282 18154]
 [ 1616 45990]]

ROC AUC Score: 0.6768583244921265

Naive Bayes Results:
Classification Report for GaussianNB model:
              precision    recall  f1-score   support

           0       0.48      0.14      0.22     20436
           1       0.72      0.93      0.81     47606

    accuracy                           0.70     68042
   macro avg       0.60      0.54      0.51     68042
weighted avg       0.64      0.70      0.63     68042


Confusion Matrix:
[[ 2900 17536]
 [ 3203 44403]]

ROC AUC Score: 0.6279835378358144

Gradient Boosting Results:


#### Identify high-risk projects

In [10]:
def identify_high_risk_projects(model, X, test_indices, original_df, threshold=0.1):
    # Get probability predictions
    proba = model.predict_proba(X)[:, 1]
    
    # Get indices of bottom 10% projects based on probability
    n_high_risk = int(len(proba) * threshold)
    high_risk_indices = np.argsort(proba)[:n_high_risk]
    
    # Map back to original indices
    original_indices = test_indices[high_risk_indices]
    
    # Get project IDs and probabilities
    high_risk_df = pd.DataFrame({
        'projectid': original_df.loc[original_indices, 'projectid'],
        'funding_probability': proba[high_risk_indices]
    })
    
    # Sort by probability (most risky first)
    high_risk_df = high_risk_df.sort_values('funding_probability')
    
    return high_risk_df

best_model = models['XGBoost']
test_indices = y_test.index

# Get high risk projects
high_risk_df = identify_high_risk_projects(
    model=best_model,
    X=X_test_selected,
    test_indices=test_indices,
    original_df=processed_df
)

# Display results
print(f"Identified {len(high_risk_df)} high-risk projects")
print("\nMost at-risk projects (lowest funding probability):")
high_risk_df = pd.DataFrame(high_risk_df).reset_index(drop=True)
high_risk_df

Identified 6804 high-risk projects

Most at-risk projects (lowest funding probability):


Unnamed: 0,projectid,funding_probability
0,cb619835d6681b38e5d30e232d66eb81,0.062474
1,4eaf3ed0a910bd19bee9551c60b4af9f,0.092377
2,daf15087b1371d03fcb1b7b7c989ae63,0.093376
3,59ddf1d14e5bdc856e1a5d86b87c65c9,0.096766
4,23d492fb5baa2bb5c9edb6f6cb177239,0.099554
...,...,...
6799,d09304b604724b9f5f056f4a6e1f34e4,0.461305
6800,f949feb07c5bf8d656e033750f473db7,0.461312
6801,e3efb9c90ef37a1ce40d033c515e5a28,0.461354
6802,3039dbe4417e9a96318df3fb37c84f16,0.461357


In [None]:
# TODO upload a day's worth of uploads and run this model
# TODO cross validation