In [48]:
import numpy as np
import pandas as pd
# Raise the column display limit so wide frames are shown without truncation
pd.options.display.max_columns = 70

# Compile Analysis Frame

We start with the outcomes DataFrame, which holds the "ground truth" of exciting projects from a business perspective, and apply some cleaning:

In [2]:
# The first CSV column (projectid) becomes the index so later merges can align on it
dfout = pd.read_csv(
    'Data/outcomes.csv',
    index_col=[0],
)

In [3]:
# The three trailing columns are the numeric ones containing missing values
# (identified in the exploration notebook)
null_values_columns = dfout.columns[-3:]

# 't' / 'f' flag columns -> 1 / 0 unsigned ints; a missing flag counts as 0
flags_as_int = (
    dfout.select_dtypes(include='object')
         .replace({'f': 0, 't': 1})
         .fillna(0)
         .astype('uint')
)

# Numeric columns: a missing value is replaced by 0
counts_filled = dfout[null_values_columns].fillna(0)

# Overwrite the original columns with their cleaned versions
dfout = dfout.assign(**flags_as_int, **counts_filled)

dfout

Unnamed: 0_level_0,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,great_messages_proportion,teacher_referred_count,non_teacher_referred_count
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ffffc4f85b60efc5b52347df489d0238,0,0,0,0,0,0,0,0,0.0,0.0,0.0
ffffac55ee02a49d1abc87ba6fc61135,0,0,1,1,0,1,0,0,57.0,0.0,7.0
ffff97ed93720407d70a2787475932b0,0,0,1,1,1,1,1,0,100.0,0.0,3.0
ffff418bb42fad24347527ad96100f81,0,0,0,1,1,0,0,0,100.0,0.0,1.0
ffff2d9c769c8fb5335e949c615425eb,1,1,1,1,1,0,1,0,63.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
0000ee613c92ddc5298bf63142996a5c,0,1,1,1,0,1,1,0,0.0,2.0,4.0
0000b38bbc7252972f7984848cf58098,0,0,1,1,0,0,1,0,50.0,0.0,2.0
00002d691c05c51a5fdfbb2baef0ba25,0,0,0,0,1,1,0,0,100.0,0.0,5.0
00002bff514104264a6b798356fdd893,0,0,1,1,0,0,1,0,50.0,0.0,2.0


At this stage the outcomes DataFrame is cleaned.

Let's add information from the projects table. The exploration notebook notes that there are more projects in the _projects table_ than in the outcomes table. Since we're using outcomes as the ground truth (`is_exciting` column), let's enrich only the projects present in the outcomes table.

Let's import 'projects.csv', keeping only the relevant columns (from exploration), and do some basic cleaning:

In [13]:
# Categorical (text) columns -- one-hot encoded later, in this order
text_relv_col = [
    "poverty_level",
    "resource_type",
    "primary_focus_subject",
    "primary_focus_area",
    "school_metro",
]

# Binary 't' / 'f' flag columns
bin_relv_col = [
    "school_year_round",
    "school_charter",
    "school_magnet",
    "eligible_double_your_impact_match",
    "eligible_almost_home_match",
]

# Continuous numeric columns
cont_relv_col = [
    "fulfillment_labor_materials",
    "total_price_excluding_optional_support",
    "total_price_including_optional_support",
    "students_reached",
]

In [5]:
# Load only the selected columns (plus date_posted), indexed by projectid from the start
prj_usecols = ['projectid'] + text_relv_col + bin_relv_col + cont_relv_col + ['date_posted']
dfprj = pd.read_csv('Data/projects.csv', index_col=[0], usecols=prj_usecols)
dfprj

Unnamed: 0_level_0,school_metro,school_charter,school_magnet,school_year_round,primary_focus_subject,primary_focus_area,resource_type,poverty_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
316ed8fb3b81402ff6ac8f721bb31192,,f,f,f,Literature & Writing,Literacy & Language,Books,highest poverty,30.0,555.81,653.89,32.0,f,f,2014-05-12
90de744e368a7e4883223ca49318ae30,urban,f,f,f,Literacy,Literacy & Language,Books,highest poverty,30.0,296.47,348.79,22.0,f,f,2014-05-12
32943bb1063267de6ed19fc0ceb4b9a7,rural,f,f,f,Literacy,Literacy & Language,Technology,high poverty,30.0,430.89,506.93,17.0,f,f,2014-05-11
bb18f409abda2f264d5acda8cab577a9,urban,f,t,f,Social Sciences,History & Civics,Books,highest poverty,30.0,576.07,677.73,12.0,f,f,2014-05-11
24761b686e18e5eace634607acbcc19f,urban,f,f,f,Mathematics,Math & Science,Other,highest poverty,30.0,408.40,480.47,24.0,f,f,2014-05-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a7236ea96c812895cafc5d700d779147,urban,f,f,f,Environmental Science,Math & Science,Supplies,highest poverty,,231.00,281.71,0.0,f,f,2002-09-17
e02da37beb332eb66c2d2ba989c597ad,urban,f,f,f,Economics,History & Civics,Technology,highest poverty,,1129.00,1376.83,0.0,f,f,2002-09-17
82e536f14eadf2671a70e03416f695a3,urban,f,t,f,Early Development,Applied Learning,Supplies,moderate poverty,,125.00,152.44,0.0,f,f,2002-09-16
e139df754a873a62d93daa56acbf8040,,f,f,f,Literacy,Literacy & Language,Books,highest poverty,,125.00,152.44,0.0,f,f,2002-09-13


---- quick refresh ----

In [58]:
# Missing values per categorical column. school_metro has the most gaps; its NaNs
# are kept later as a category of their own (the author notes the missing count is
# comparable to the 'rural' count).
dfprj[text_relv_col].isnull().sum()

poverty_level                0
resource_type               45
primary_focus_subject       39
primary_focus_area          39
school_metro             81908
dtype: int64

In [59]:
# Missing values per binary flag column
dfprj[bin_relv_col].isnull().sum()

school_year_round                    0
school_charter                       0
school_magnet                        0
eligible_double_your_impact_match    0
eligible_almost_home_match           0
dtype: int64

In [62]:
# Missing values per numeric column -- these are filled with 0 in the cleaning pipeline
dfprj[cont_relv_col].isnull().sum()

fulfillment_labor_materials               35082
total_price_excluding_optional_support        0
total_price_including_optional_support        0
students_reached                            146
dtype: int64

In [50]:
# Applying Cleaning Pipeline for Projects
# Steps: (1) encode binary flags, fill numeric NaNs, derive year_posted;
#        (2) normalise the text categories into snake_case labels;
#        (3) one-hot encode the text categories;
#        (4) drop the original text columns and date_posted.
dfprj2 = (
        dfprj
            .assign(
                **dfprj[bin_relv_col]
                    .replace({'f': 0, 't': 1})
                    .fillna(0)
                    .astype('uint'),  # Binary columns cleaning

                **dfprj[cont_relv_col].fillna(0),  # Numeric columns missing values

                school_metro=dfprj.school_metro.fillna('not_given'),  # keep empty value as new category
                poverty_level=dfprj.poverty_level.replace({' poverty': ''}, regex=True),  # simplify names in column
                # pd.to_datetime instead of astype(np.datetime64): casting to a unit-less
                # datetime64 dtype raises TypeError on pandas >= 2.0
                year_posted=pd.to_datetime(dfprj.date_posted).dt.year,  # YoY increase could drag interest
            )
            .pipe(# Cleaning text data
                lambda _df: _df.assign(
                                        **{
                                            # Literal (non-regex) replacements, then lower-casing
                                            col: _df[col]
                                                    .str.replace('&', 'and', regex=False)
                                                    .str.replace(' ', '_', regex=False)
                                                    .str.lower()
                                            for col in text_relv_col
                                        },
                                    )
            )
            .pipe(# Dummy columns (ignoring NaNs; school_metro NaNs were already turned into the 'not_given' category above)
                # dtype is pinned to uint8 because the default changed to bool in pandas 2.0
                lambda _df: pd.concat(
                    [_df] + [
                        pd.get_dummies(_df[col], prefix=col, dummy_na=False, dtype='uint8')
                        for col in text_relv_col
                    ],
                    axis=1,
                )
            )
            .drop(columns = text_relv_col + ['date_posted'])  # Drop the original text columns and date_posted
        )

dfprj2

Unnamed: 0_level_0,school_charter,school_magnet,school_year_round,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,year_posted,poverty_level_high,poverty_level_highest,poverty_level_low,poverty_level_moderate,resource_type_books,resource_type_other,resource_type_supplies,resource_type_technology,resource_type_trips,resource_type_visitors,primary_focus_subject_applied_sciences,primary_focus_subject_character_education,primary_focus_subject_civics_and_government,primary_focus_subject_college_and_career_prep,primary_focus_subject_community_service,primary_focus_subject_early_development,primary_focus_subject_economics,primary_focus_subject_environmental_science,primary_focus_subject_esl,primary_focus_subject_extracurricular,primary_focus_subject_foreign_languages,primary_focus_subject_gym_and_fitness,primary_focus_subject_health_and_life_science,primary_focus_subject_health_and_wellness,primary_focus_subject_history_and_geography,primary_focus_subject_literacy,primary_focus_subject_literature_and_writing,primary_focus_subject_mathematics,primary_focus_subject_music,primary_focus_subject_nutrition,primary_focus_subject_other,primary_focus_subject_parent_involvement,primary_focus_subject_performing_arts,primary_focus_subject_social_sciences,primary_focus_subject_special_needs,primary_focus_subject_sports,primary_focus_subject_visual_arts,primary_focus_area_applied_learning,primary_focus_area_health_and_sports,primary_focus_area_history_and_civics,primary_focus_area_literacy_and_language,primary_focus_area_math_and_science,primary_focus_area_music_and_the_arts,primary_focus_area_special_needs,school_metro_not_given,school_metro_rural,school_metro_suburban,school_metro_urban
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
316ed8fb3b81402ff6ac8f721bb31192,0,0,0,30.0,555.81,653.89,32.0,0,0,2014,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
90de744e368a7e4883223ca49318ae30,0,0,0,30.0,296.47,348.79,22.0,0,0,2014,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
32943bb1063267de6ed19fc0ceb4b9a7,0,0,0,30.0,430.89,506.93,17.0,0,0,2014,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
bb18f409abda2f264d5acda8cab577a9,0,1,0,30.0,576.07,677.73,12.0,0,0,2014,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
24761b686e18e5eace634607acbcc19f,0,0,0,30.0,408.40,480.47,24.0,0,0,2014,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a7236ea96c812895cafc5d700d779147,0,0,0,0.0,231.00,281.71,0.0,0,0,2002,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
e02da37beb332eb66c2d2ba989c597ad,0,0,0,0.0,1129.00,1376.83,0.0,0,0,2002,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
82e536f14eadf2671a70e03416f695a3,0,1,0,0.0,125.00,152.44,0.0,0,0,2002,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
e139df754a873a62d93daa56acbf8040,0,0,0,0.0,125.00,152.44,0.0,0,0,2002,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [62]:
# Number of columns that still contain at least one null value (expected: 0)
dfprj2.isna().any().sum()

0

Merging Projects information into Outcomes for analysis:

In [63]:
# Left join on the projectid index: keep every outcome row and attach its project features
afr = pd.merge(
    dfout,
    dfprj2,
    how='left',
    left_index=True,
    right_index=True,
)
afr

Unnamed: 0_level_0,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,great_messages_proportion,teacher_referred_count,non_teacher_referred_count,school_charter,school_magnet,school_year_round,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,year_posted,poverty_level_high,poverty_level_highest,poverty_level_low,poverty_level_moderate,resource_type_books,resource_type_other,resource_type_supplies,resource_type_technology,resource_type_trips,resource_type_visitors,primary_focus_subject_applied_sciences,primary_focus_subject_character_education,primary_focus_subject_civics_and_government,primary_focus_subject_college_and_career_prep,primary_focus_subject_community_service,primary_focus_subject_early_development,primary_focus_subject_economics,primary_focus_subject_environmental_science,primary_focus_subject_esl,primary_focus_subject_extracurricular,primary_focus_subject_foreign_languages,primary_focus_subject_gym_and_fitness,primary_focus_subject_health_and_life_science,primary_focus_subject_health_and_wellness,primary_focus_subject_history_and_geography,primary_focus_subject_literacy,primary_focus_subject_literature_and_writing,primary_focus_subject_mathematics,primary_focus_subject_music,primary_focus_subject_nutrition,primary_focus_subject_other,primary_focus_subject_parent_involvement,primary_focus_subject_performing_arts,primary_focus_subject_social_sciences,primary_focus_subject_special_needs,primary_focus_subject_sports,primary_focus_subject_visual_arts,primary_focus_area_applied_learning,primary_focus_area_health_and_sports,primary_focus_area_history_and_civics,primary_focus_area_literacy_and_language,primary_focus_area_math_and_science,primary_focus_area_music_and_t
he_arts,primary_focus_area_special_needs,school_metro_not_given,school_metro_rural,school_metro_suburban,school_metro_urban
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1
ffffc4f85b60efc5b52347df489d0238,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,9.0,703.34,857.73,90.0,0,0,2009,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
ffffac55ee02a49d1abc87ba6fc61135,0,0,1,1,0,1,0,0,57.0,0.0,7.0,0,0,0,35.0,207.43,244.04,8.0,0,0,2011,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
ffff97ed93720407d70a2787475932b0,0,0,1,1,1,1,1,0,100.0,0.0,3.0,0,1,0,35.0,440.01,517.66,25.0,1,0,2010,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
ffff418bb42fad24347527ad96100f81,0,0,0,1,1,0,0,0,100.0,0.0,1.0,0,0,0,17.0,798.85,974.21,20.0,0,0,2009,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
ffff2d9c769c8fb5335e949c615425eb,1,1,1,1,1,0,1,0,63.0,6.0,2.0,0,0,0,30.0,579.74,682.05,25.0,0,0,2013,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0000ee613c92ddc5298bf63142996a5c,0,1,1,1,0,1,1,0,0.0,2.0,4.0,0,1,0,30.0,384.86,452.78,19.0,0,0,2013,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
0000b38bbc7252972f7984848cf58098,0,0,1,1,0,0,1,0,50.0,0.0,2.0,0,0,0,30.0,547.86,644.54,36.0,1,0,2013,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
00002d691c05c51a5fdfbb2baef0ba25,0,0,0,0,1,1,0,0,100.0,0.0,5.0,0,0,0,35.0,892.31,1049.78,250.0,0,0,2010,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
00002bff514104264a6b798356fdd893,0,0,1,1,0,0,1,0,50.0,0.0,2.0,0,0,0,35.0,477.32,561.55,20.0,1,0,2010,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [64]:
# Number of columns in the merged frame containing at least one null value (expected: 0)
afr.isna().any().sum()

0

Fixing data types before saving to disk

In [65]:
# Inspect the per-column dtypes of the merged analysis frame before persisting it
afr.dtypes

is_exciting                          uint32
at_least_1_teacher_referred_donor    uint32
fully_funded                         uint32
at_least_1_green_donation            uint32
great_chat                           uint32
                                      ...  
primary_focus_area_special_needs      uint8
school_metro_not_given                uint8
school_metro_rural                    uint8
school_metro_suburban                 uint8
school_metro_urban                    uint8
Length: 69, dtype: object

There's a uint32 and uint8 discrepancy. Let's verify whether downcasting uint32 to uint8 would end up losing information:

In [77]:
# For each unsigned-int dtype, list the distinct values held by every column of that
# dtype, to confirm that all of them fit into uint8 before downcasting
for DTYPE in ('uint32', 'uint8'):
    print(f"Columns with {DTYPE}:")
    same_dtype_frame = afr.select_dtypes(DTYPE)
    for col, values in same_dtype_frame.items():
        print(f"COLUMN {col} | Values -> {values.unique()}")

Columns with uint32:
COLUMN is_exciting | Values -> [0 1]
COLUMN at_least_1_teacher_referred_donor | Values -> [0 1]
COLUMN fully_funded | Values -> [0 1]
COLUMN at_least_1_green_donation | Values -> [0 1]
COLUMN great_chat | Values -> [0 1]
COLUMN three_or_more_non_teacher_referred_donors | Values -> [0 1]
COLUMN one_non_teacher_referred_donor_giving_100_plus | Values -> [0 1]
COLUMN donation_from_thoughtful_donor | Values -> [0 1]
COLUMN school_charter | Values -> [0 1]
COLUMN school_magnet | Values -> [0 1]
COLUMN school_year_round | Values -> [0 1]
COLUMN eligible_double_your_impact_match | Values -> [0 1]
COLUMN eligible_almost_home_match | Values -> [0 1]
Columns with uint8:
COLUMN poverty_level_high | Values -> [1 0]
COLUMN poverty_level_highest | Values -> [0 1]
COLUMN poverty_level_low | Values -> [0 1]
COLUMN poverty_level_moderate | Values -> [0 1]
COLUMN resource_type_books | Values -> [0 1]
COLUMN resource_type_other | Values -> [0 1]
COLUMN resource_type_supplies | Values

In [78]:
# Preview the uint32 columns, i.e. the candidates for downcasting
afr.select_dtypes(include=['uint32'])

Unnamed: 0_level_0,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,school_charter,school_magnet,school_year_round,eligible_double_your_impact_match,eligible_almost_home_match
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ffffc4f85b60efc5b52347df489d0238,0,0,0,0,0,0,0,0,0,0,0,0,0
ffffac55ee02a49d1abc87ba6fc61135,0,0,1,1,0,1,0,0,0,0,0,0,0
ffff97ed93720407d70a2787475932b0,0,0,1,1,1,1,1,0,0,1,0,1,0
ffff418bb42fad24347527ad96100f81,0,0,0,1,1,0,0,0,0,0,0,0,0
ffff2d9c769c8fb5335e949c615425eb,1,1,1,1,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0000ee613c92ddc5298bf63142996a5c,0,1,1,1,0,1,1,0,0,1,0,0,0
0000b38bbc7252972f7984848cf58098,0,0,1,1,0,0,1,0,0,0,0,1,0
00002d691c05c51a5fdfbb2baef0ba25,0,0,0,0,1,1,0,0,0,0,0,0,0
00002bff514104264a6b798356fdd893,0,0,1,1,0,0,1,0,0,0,0,1,0


In [79]:
# Applying transformation
# Downcast every unsigned-int column wider than uint8. Selecting only 'uint32' would
# silently skip these columns wherever astype('uint') produced uint64 -- the width of
# 'uint' depends on the platform and NumPy version.
wide_uint_cols = afr.select_dtypes(include=['uint16', 'uint32', 'uint64']).columns

# Guard against silent wrap-around: every value being downcast must fit into uint8
assert (afr[wide_uint_cols] <= np.iinfo(np.uint8).max).all().all(), "values would overflow uint8"

afr = afr.astype({col: 'uint8' for col in wide_uint_cols})

In [80]:
# Re-check the dtypes after the downcast to confirm the flag columns are now uint8
afr.dtypes

is_exciting                          uint8
at_least_1_teacher_referred_donor    uint8
fully_funded                         uint8
at_least_1_green_donation            uint8
great_chat                           uint8
                                     ...  
primary_focus_area_special_needs     uint8
school_metro_not_given               uint8
school_metro_rural                   uint8
school_metro_suburban                uint8
school_metro_urban                   uint8
Length: 69, dtype: object

In [81]:
# Persist as pickle so the transformations and column dtypes survive the round trip
pd.to_pickle(afr, 'Data/preprocessed.pkl')