In [21]:
import pandas as pd
import joblib
import json
from IPython.display import display, Markdown

def load_config(config_file="../config.json"):
    """Load configuration from a JSON file."""
    with open(config_file, 'r') as file:
        config = json.load(file)
    return config

In [22]:
test_df = pd.read_csv('../outputs/test_df.csv')

In [23]:
test_df.columns

Index(['projectid', 'teacher_referred_count',
       'total_price_excluding_optional_support', 'students_reached',
       'fully_funded', 'date_posted', 'total_projects_in_city',
       'total_projects_in_state', 'poverty_level_high poverty',
       'poverty_level_highest poverty', 'poverty_level_low poverty',
       'poverty_level_moderate poverty', 'school_metro_rural',
       'school_metro_suburban', 'school_metro_urban', 'grade_level_Grades 3-5',
       'grade_level_Grades 6-8', 'grade_level_Grades 9-12',
       'grade_level_Grades PreK-2', 'primary_focus_subject_Applied Sciences',
       'primary_focus_subject_Character Education',
       'primary_focus_subject_Civics & Government',
       'primary_focus_subject_College & Career Prep',
       'primary_focus_subject_Community Service', 'primary_focus_subject_ESL',
       'primary_focus_subject_Early Development',
       'primary_focus_subject_Economics',
       'primary_focus_subject_Environmental Science',
       'primary_focus_su

In [24]:
test_df['date_posted'] = pd.to_datetime(test_df['date_posted'])

In [25]:
# Set max_date to filter projects posted for at least 3 months
fixed_max_date = pd.to_datetime('2013-12-31')
test_df['months_since_posted'] = ((fixed_max_date - test_df['date_posted']) / pd.Timedelta(days=30)).astype(int)
eligible_projects = test_df[test_df['months_since_posted'] >= 3].copy()

In [26]:
test_df

Unnamed: 0,projectid,teacher_referred_count,total_price_excluding_optional_support,students_reached,fully_funded,date_posted,total_projects_in_city,total_projects_in_state,poverty_level_high poverty,poverty_level_highest poverty,...,primary_focus_subject_Special Needs,primary_focus_subject_Sports,primary_focus_subject_Visual Arts,resource_type_Books,resource_type_Other,resource_type_Supplies,resource_type_Technology,resource_type_Trips,resource_type_Visitors,months_since_posted
0,a11d5c86692ef20e9f702094504bb5db,0.0,387.13,34.0,1.0,2013-01-01,290,126242,False,False,...,False,False,False,False,False,True,False,False,False,12
1,8bdc6b4cfc850f6f7cc3963c48ba454b,0.0,1045.00,120.0,0.0,2013-01-01,172,2127,False,True,...,False,False,False,False,False,False,False,False,True,12
2,88f263f1cd657bccce254ffc0b79df4d,2.0,868.02,28.0,1.0,2013-01-01,102,126242,True,False,...,False,False,False,False,False,True,False,False,False,12
3,a8a6e08b6ae7ab1a88c16ba82ac8e9da,0.0,515.00,96.0,1.0,2013-01-01,206,14853,False,True,...,False,False,False,False,True,False,False,False,False,12
4,be21151602e4fd47dd4012114fd519c6,0.0,238.56,25.0,1.0,2013-01-01,337,18615,False,True,...,False,False,False,False,False,True,False,False,False,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114032,a0e839f24645e3d6dcbd327f8441b043,0.0,567.18,60.0,1.0,2013-12-31,20364,73182,False,True,...,False,False,True,False,False,True,False,False,False,0
114033,f820ef3537f4445b0716244fae36f763,0.0,277.37,30.0,0.0,2013-12-31,55,2030,True,False,...,False,False,False,True,False,False,False,False,False,0
114034,95ee208a51831edffa7cc2e0aa3e83cd,0.0,202.34,98.0,1.0,2013-12-31,2945,9837,False,True,...,False,False,False,True,False,False,False,False,False,0
114035,1aa5e1d739a40c2763da1a0bb0f0f335,2.0,177.18,23.0,1.0,2013-12-31,855,17299,False,True,...,False,False,False,False,False,False,True,False,False,0


In [27]:
config = load_config()
poverty_levels = ["low", "moderate", "high", "highest"]
models = ["random_forest", "logistic_regression"]
recommendations = {}

In [28]:
recommendations = {}

poverty_levels = ["low", "moderate", "high", "highest"]
models = ["random_forest", "logistic_regression"]

for model_type in models:
    for pov_level in poverty_levels:
        pov_column = f"poverty_level_{pov_level} poverty"
        pov_projects = eligible_projects[eligible_projects[pov_column] == 1].copy()
        
        # Load the classifier
        classifier = joblib.load(f"../outputs/{model_type}_{pov_level}_poverty.pkl")
        
        X_test_filtered = pov_projects.drop(columns=['fully_funded', 'date_posted', 'months_since_posted', 'projectid'], errors='ignore')
        X_test_filtered = X_test_filtered.reindex(columns=classifier.feature_names_in_, fill_value=0)
        
        pov_projects[f'probability_fully_funded_{model_type}'] = classifier.predict_proba(X_test_filtered)[:, 1]
        
        pov_projects_sorted = pov_projects.sort_values(by=f'probability_fully_funded_{model_type}', ascending=False)

        if pov_level in ["high", "highest"]:
            top_recommendations = pov_projects_sorted.head(10)
        elif pov_level in ["low", "moderate"]:
            top_recommendations = pov_projects_sorted.head(3)

        recommendations[(model_type, pov_level)] = top_recommendations[['projectid', 'date_posted', f'probability_fully_funded_{model_type}']]

In [29]:
list(recommendations.keys())

[('random_forest', 'low'),
 ('random_forest', 'moderate'),
 ('random_forest', 'high'),
 ('random_forest', 'highest'),
 ('logistic_regression', 'low'),
 ('logistic_regression', 'moderate'),
 ('logistic_regression', 'high'),
 ('logistic_regression', 'highest')]

In [30]:
for key, recs in recommendations.items():
    model_type, pov_level = key  # Unpack manually
    # print(f"\nTop recommendations for {model_type.capitalize()} model, poverty level '{pov_level}':")
    display(key, recs)

('random_forest', 'low')

Unnamed: 0,projectid,date_posted,probability_fully_funded_random_forest
72437,f4064b13c27c1fbdc31b151719ba9f08,2013-09-25,1.0
14699,4d2509f0f54df77ab927609b49749094,2013-03-01,1.0
24210,71de20531b97210cbe5f1fa8d9c6c148,2013-04-19,1.0


('random_forest', 'moderate')

Unnamed: 0,projectid,date_posted,probability_fully_funded_random_forest
54870,b736a74ca2c9b8a6cc8d1d9ee1e4c2b2,2013-08-25,1.0
68404,8e4065284366cb9f7911e4f72de60add,2013-09-17,1.0
33098,7cec345ecf8bbf5457449a501ee97b88,2013-06-08,1.0


('random_forest', 'high')

Unnamed: 0,projectid,date_posted,probability_fully_funded_random_forest
16897,f5c1c43b25747deee09bdc0df6852e15,2013-03-09,1.0
4744,fc00aeeb883f2e406d20a3d318311204,2013-01-22,1.0
56225,21463e8eb4162599733b866fd63e8330,2013-08-29,1.0
5738,b122b0f1e82589ae076eefcb05dfdac5,2013-01-26,1.0
52032,a073eb5fcf7d7df4e62a48683c986de9,2013-08-19,1.0
5549,343c4906f8ba5149ab919206939030bc,2013-01-25,1.0
5472,1afb141002c832acfdde7e95047056f3,2013-01-25,1.0
61031,b93d387e199d6580803e7c57e2fa8da1,2013-09-04,1.0
52767,20e5fa11b99a27e6afd86beff29bca67,2013-08-21,1.0
39097,ed59d86f60de5f62bf58f2839d470062,2013-07-07,1.0


('random_forest', 'highest')

Unnamed: 0,projectid,date_posted,probability_fully_funded_random_forest
34044,924524717941a48039f1669e1ed75a45,2013-06-13,1.0
24567,a9b321ce37334e4a51d326ea6bd837cf,2013-04-21,1.0
6326,d6a288c45dc6137097dd0a599bdfb797,2013-01-27,1.0
73355,b6b9c69899f58e216fe5c2bb09be2696,2013-09-27,1.0
6348,49ca3027ed636f35dc2278d51865b96e,2013-01-27,1.0
17826,e20c432e34d240c658a9199e4b84f9be,2013-03-14,1.0
46175,90baf355397c227003d45df0b1019a7f,2013-08-05,1.0
24562,a54842e48e93677e350fcbc1350ae98a,2013-04-21,1.0
46184,222abfb6618b575f88768f423a3b2ea8,2013-08-05,1.0
6367,cb9931e5beb6f1279022e684bfb488e6,2013-01-27,1.0


('logistic_regression', 'low')

Unnamed: 0,projectid,date_posted,probability_fully_funded_logistic_regression
9079,b00faec78dcbf574752354bd418b9448,2013-02-08,1.0
71056,9e7ab5c57e54e4c4e6d75e86c7e71a1f,2013-09-22,1.0
67342,469d2fc9421faf51530fcf25c80c154d,2013-09-15,0.999478


('logistic_regression', 'moderate')

Unnamed: 0,projectid,date_posted,probability_fully_funded_logistic_regression
9200,eccaded79e062ee44c74f229a5f71367,2013-02-09,1.0
8674,55a8875e022b4cdab5d12ad81231696a,2013-02-07,1.0
36806,bb481f89c2caf2558fc219d41f328931,2013-06-27,1.0


('logistic_regression', 'high')

Unnamed: 0,projectid,date_posted,probability_fully_funded_logistic_regression
62786,60b5e96c664651ed0f46a6f9c3a164cb,2013-09-07,1.0
64221,16137012e6bb28e5f2675a13778f6fdd,2013-09-09,1.0
15548,d696d7a8a4654156a2fac41c6fbd72b1,2013-03-04,1.0
7026,63783da406806d5025e539b58b10a1b7,2013-01-31,1.0
10150,71ad58a7eccf5c3ac402f8e06d9965cb,2013-02-11,1.0
6938,8d6fdceaf067517eb2e446c5036a5134,2013-01-29,1.0
72319,f71b0045e9861618c587caf94211d2b7,2013-09-25,1.0
60075,61b0b7269de78fe9375ace64e2e53004,2013-09-03,1.0
49680,26a46ded38b21614030ef8abe40d4879,2013-08-13,1.0
11724,dea5ebd7921e9433a9b418c4cd35d884,2013-02-15,1.0


('logistic_regression', 'highest')

Unnamed: 0,projectid,date_posted,probability_fully_funded_logistic_regression
24288,35945268966f1c97ef091d943ddf15f9,2013-04-19,0.777195
27144,87bb2dacd8d43a84f2a4160b00d2615f,2013-05-06,0.77706
23098,55f36bf7347af7c4d9c93bb0c56a4ae2,2013-04-13,0.77705
6,0b2a1dfb1fb9b45fa473a525d81b5d58,2013-01-01,0.776668
9096,5a92f3e1f3ce98cbcefe9ec829fd5d17,2013-02-08,0.776583
42016,8f9b2542712dfe7fe51be67d331d2438,2013-07-21,0.776501
23520,5144b1e86585cef363a24e24590004a7,2013-04-14,0.776456
53759,9fbc5b978062ad84c2f9967f99366854,2013-08-23,0.776431
59673,f48bc6277416f8138df2b58d874b1a15,2013-09-03,0.776406
69385,b0f6fc8f51cb235a42a71e649983db60,2013-09-19,0.776404


In [31]:
for key, recs in recommendations.items():
    model_type, pov_level = key  # Unpack model type and poverty level
    
    # Sort projects by probability to create a ranking
    recs_sorted = recs.sort_values(f'probability_fully_funded_{model_type}', ascending=False).reset_index(drop=True)
    recs_sorted['rank'] = recs_sorted.index + 1  # Add rank column
    
    # Add model type and poverty level columns
    recs_sorted['model_type'] = model_type.capitalize()
    recs_sorted['poverty_level'] = pov_level.capitalize()
    
    # Select only relevant columns and rename for consistency
    table = recs_sorted[['rank', 'projectid', 'model_type', 'poverty_level', f'probability_fully_funded_{model_type}']]
    table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)
    
    # Display table with a header for context
    display(Markdown(f"### {model_type.capitalize()} Model - {pov_level.capitalize()} Poverty Level"))
    display(table)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Random_forest Model - Low Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,f4064b13c27c1fbdc31b151719ba9f08,Random_forest,Low,1.0
1,2,4d2509f0f54df77ab927609b49749094,Random_forest,Low,1.0
2,3,71de20531b97210cbe5f1fa8d9c6c148,Random_forest,Low,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Random_forest Model - Moderate Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,b736a74ca2c9b8a6cc8d1d9ee1e4c2b2,Random_forest,Moderate,1.0
1,2,8e4065284366cb9f7911e4f72de60add,Random_forest,Moderate,1.0
2,3,7cec345ecf8bbf5457449a501ee97b88,Random_forest,Moderate,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Random_forest Model - High Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,f5c1c43b25747deee09bdc0df6852e15,Random_forest,High,1.0
1,2,fc00aeeb883f2e406d20a3d318311204,Random_forest,High,1.0
2,3,21463e8eb4162599733b866fd63e8330,Random_forest,High,1.0
3,4,b122b0f1e82589ae076eefcb05dfdac5,Random_forest,High,1.0
4,5,a073eb5fcf7d7df4e62a48683c986de9,Random_forest,High,1.0
5,6,343c4906f8ba5149ab919206939030bc,Random_forest,High,1.0
6,7,1afb141002c832acfdde7e95047056f3,Random_forest,High,1.0
7,8,b93d387e199d6580803e7c57e2fa8da1,Random_forest,High,1.0
8,9,20e5fa11b99a27e6afd86beff29bca67,Random_forest,High,1.0
9,10,ed59d86f60de5f62bf58f2839d470062,Random_forest,High,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Random_forest Model - Highest Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,924524717941a48039f1669e1ed75a45,Random_forest,Highest,1.0
1,2,a9b321ce37334e4a51d326ea6bd837cf,Random_forest,Highest,1.0
2,3,d6a288c45dc6137097dd0a599bdfb797,Random_forest,Highest,1.0
3,4,b6b9c69899f58e216fe5c2bb09be2696,Random_forest,Highest,1.0
4,5,49ca3027ed636f35dc2278d51865b96e,Random_forest,Highest,1.0
5,6,e20c432e34d240c658a9199e4b84f9be,Random_forest,Highest,1.0
6,7,90baf355397c227003d45df0b1019a7f,Random_forest,Highest,1.0
7,8,a54842e48e93677e350fcbc1350ae98a,Random_forest,Highest,1.0
8,9,222abfb6618b575f88768f423a3b2ea8,Random_forest,Highest,1.0
9,10,cb9931e5beb6f1279022e684bfb488e6,Random_forest,Highest,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Logistic_regression Model - Low Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,b00faec78dcbf574752354bd418b9448,Logistic_regression,Low,1.0
1,2,9e7ab5c57e54e4c4e6d75e86c7e71a1f,Logistic_regression,Low,1.0
2,3,469d2fc9421faf51530fcf25c80c154d,Logistic_regression,Low,0.999478


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Logistic_regression Model - Moderate Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,eccaded79e062ee44c74f229a5f71367,Logistic_regression,Moderate,1.0
1,2,55a8875e022b4cdab5d12ad81231696a,Logistic_regression,Moderate,1.0
2,3,bb481f89c2caf2558fc219d41f328931,Logistic_regression,Moderate,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Logistic_regression Model - High Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,60b5e96c664651ed0f46a6f9c3a164cb,Logistic_regression,High,1.0
1,2,16137012e6bb28e5f2675a13778f6fdd,Logistic_regression,High,1.0
2,3,d696d7a8a4654156a2fac41c6fbd72b1,Logistic_regression,High,1.0
3,4,63783da406806d5025e539b58b10a1b7,Logistic_regression,High,1.0
4,5,71ad58a7eccf5c3ac402f8e06d9965cb,Logistic_regression,High,1.0
5,6,8d6fdceaf067517eb2e446c5036a5134,Logistic_regression,High,1.0
6,7,f71b0045e9861618c587caf94211d2b7,Logistic_regression,High,1.0
7,8,61b0b7269de78fe9375ace64e2e53004,Logistic_regression,High,1.0
8,9,26a46ded38b21614030ef8abe40d4879,Logistic_regression,High,1.0
9,10,dea5ebd7921e9433a9b418c4cd35d884,Logistic_regression,High,1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table.rename(columns={f'probability_fully_funded_{model_type}': 'probability_fully_funded'}, inplace=True)


### Logistic_regression Model - Highest Poverty Level

Unnamed: 0,rank,projectid,model_type,poverty_level,probability_fully_funded
0,1,35945268966f1c97ef091d943ddf15f9,Logistic_regression,Highest,0.777195
1,2,87bb2dacd8d43a84f2a4160b00d2615f,Logistic_regression,Highest,0.77706
2,3,55f36bf7347af7c4d9c93bb0c56a4ae2,Logistic_regression,Highest,0.77705
3,4,0b2a1dfb1fb9b45fa473a525d81b5d58,Logistic_regression,Highest,0.776668
4,5,5a92f3e1f3ce98cbcefe9ec829fd5d17,Logistic_regression,Highest,0.776583
5,6,8f9b2542712dfe7fe51be67d331d2438,Logistic_regression,Highest,0.776501
6,7,5144b1e86585cef363a24e24590004a7,Logistic_regression,Highest,0.776456
7,8,9fbc5b978062ad84c2f9967f99366854,Logistic_regression,Highest,0.776431
8,9,f48bc6277416f8138df2b58d874b1a15,Logistic_regression,Highest,0.776406
9,10,b0f6fc8f51cb235a42a71e649983db60,Logistic_regression,Highest,0.776404
