Load all the necessary libraries:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

Start off by defining some conversion functions. Each is designed to reduce the file size a bit by converting text data into only a few bits worth of numerical data.

The data being loaded is straight from Kaggle (except the Projects.csv set, which had to be treated separately due to its size). If you are working with files ending with `_clean`, these steps are irrelevant beyond discovering how the cleaned files were created.

In [2]:
def yes_no_binary_converter(x):
    """Encode a Yes/No text field as an integer flag.

    Returns 1 only when ``x`` equals the exact string 'Yes'; every other
    value (including 'No', empty strings and differently-cased text) is 0.
    """
    return 1 if x == 'Yes' else 0

# For Schools.csv
def metro_type_converter(x):
    """Map a metro-type label to a small integer code.

    'rural' -> 1, 'suburban' -> 2, 'urban' -> 3, 'town' -> 4.
    Any other value is treated as ambiguous and mapped to 0.
    """
    known_codes = (
        ('rural', 1),
        ('suburban', 2),
        ('urban', 3),
        ('town', 4),
    )
    for label, code in known_codes:
        if x == label:
            return code

    # Nothing matched: the metropolitan type is ambiguous.
    return 0

# For Teachers.csv
def teacher_prefix_converter(x):
    """Map a teacher prefix to an integer code.

    'Mrs.' -> 1 and 'Mr.' -> 2. Every other prefix (or a missing one) is
    considered ambiguous and mapped to 0.
    """
    if x == 'Mrs.':
        code = 1
    elif x == 'Mr.':
        code = 2
    else:
        code = 0  # prefix is ambiguous
    return code

# For Projects.csv
def grade_level_converter(x):
    """Map a grade-level label to an integer code based on the digit it contains.

    A label containing '9' -> 4, otherwise one containing '2' -> 1,
    '3' -> 2, '6' -> 3. Anything else (including non-string input such as
    NaN) is treated as unknown/missing and mapped to -1.

    The '9' check must come first: a label such as 'Grades 9-12' also
    contains the character '2', so testing '2' first would misclassify it
    as code 1 and code 4 could never be produced for it.
    """
    if not isinstance(x, str):
        return -1 # missing value (e.g. NaN) rather than a text label

    if '9' in x:
        return 4
    if '2' in x:
        return 1
    if '3' in x:
        return 2
    if '6' in x:
        return 3

    return -1 # if grade is unknown/missing

# For Projects.csv
def project_type_converter(x):
    """Map a project-type label to an integer code.

    'Professional Development' -> 0, 'Student-Led' -> 1, 'Teacher-Led' -> 2.
    Any other value yields None (there is no numeric fallback code).
    """
    ordered_labels = ('Professional Development', 'Student-Led', 'Teacher-Led')
    for code, label in enumerate(ordered_labels):
        if x == label:
            return code
    return None
    
# For Projects.csv
def project_status_converter(x):
    """Map a project-status label to an integer code.

    'Expired' -> 0, 'Fully Funded' -> 1, 'Live' -> 2.
    Any other value yields None (there is no numeric fallback code).
    """
    ordered_labels = ('Expired', 'Fully Funded', 'Live')
    for code, label in enumerate(ordered_labels):
        if x == label:
            return code
    return None

Load *Donors.csv*. Modify headers to exclude spaces. Binary conversion for whether the donor is a teacher. Ensure zip code is of integer type.

In [4]:
# Read Donors.csv, replacing the file's header row with space-free column
# names and encoding the "is teacher" Yes/No field as 1/0 while parsing.
donors_dat = pd.read_csv('data/Donors.csv',
                         header=0,
                         names=['donor_id', 'donor_city', 'donor_state', 'donor_teacher', 'donor_zip'],
                         converters={'donor_teacher': yes_no_binary_converter})

# Cast to a concrete integer dtype. np.integer is an abstract scalar class;
# passing it as a dtype is deprecated in NumPy and resolves to a
# platform-dependent width. Note: astype raises if the column holds
# missing or non-numeric values.
donors_dat['donor_zip'] = donors_dat['donor_zip'].astype(np.int64)
donors_dat.sample(5) # randomly sample 5

Unnamed: 0,donor_id,donor_city,donor_state,donor_teacher,donor_zip
1747238,ea389ef8ac8d378fb65daa7c73bebe45,Morgantown,West Virginia,0,265
1553998,d04bff76bb18d030dffe2bcf896a0996,Danville,California,0,945
766134,66907573d546c1ae558956f06903e3f4,Greensboro,North Carolina,0,274
1260554,a8eac50b9078d87774b5f542b2356f6d,New Albany,Ohio,0,430
747419,640f05d8afda58078cafd6689df4c8e6,New York,New York,0,100


Load *Donations.csv*. Modify headers to exclude spaces. Binary conversion for whether the donation included an optional donation. Ensure timestamps are converted to integers (for easy computation of timedeltas and reverse-conversion).

In [5]:
# Read Donations.csv with space-free column names, a 1/0 flag for the
# optional donation, and the donation date parsed as a datetime.
donations_dat = pd.read_csv(
    'data/Donations.csv',
    header=0,
    names=[
        'project_id', 'donation_id', 'donor_id',
        'opt_donation', 'amount', 'cart_seq', 'date',
    ],
    converters={'opt_donation': yes_no_binary_converter},
    parse_dates=['date'],
)

# Replace each parsed date with its integer timestamp (whole seconds).
donations_dat['date'] = donations_dat['date'].map(lambda stamp: int(stamp.timestamp()))
donations_dat.sample(5)  # randomly sample 5

Unnamed: 0,project_id,donation_id,donor_id,opt_donation,amount,cart_seq,date
4088818,def8affaa4be52fd880e43286016f8e1,9e92710e9d76aa3479016ff34984c1d7,660f1b416793ea81ab79b048ae21b69c,0,50.0,1,1520855534
3445061,bc18ae934f4681c16a954e50ebbed023,95b4030763157142c48c99e1e2825d13,237db43817f34988f9d543ca518be4ee,1,1.0,10211,1514066173
3925515,d6183971404cabe4427f1afbbc2759fa,b99afb0f928780c95c5547b78260c557,b46f032e27b2ef56ae0078e49e070da8,1,1.0,116,1419550881
2676550,9261ebba94911a3f545da34a28049709,9c8adf4e7859e585274509f312da95f1,1a3b44b935e1608fa1446deca6d3724c,1,10.0,5,1523790384
3704674,ca292d2d7ac26ab107d4f80445a626d7,cd8ee25664433dfbb010142b55330358,21390ae66c27c90635df92e9d4e6c32d,0,20.0,28,1414632233


Load *Schools.csv*. Modify headers to exclude spaces. Conversion of metro type to numerical.

In [6]:
# Read Schools.csv with space-free column names; the metro type text is
# converted to its integer code while parsing.
schools_dat = pd.read_csv(
    'data/Schools.csv',
    header=0,
    names=[
        'school_id', 'school_name', 'metro_type', 'free_lunch',
        'school_state', 'school_zip', 'school_city',
        'school_county', 'school_district',
    ],
    converters={'metro_type': metro_type_converter},
)
schools_dat.sample(5)  # randomly sample 5

Unnamed: 0,school_id,school_name,metro_type,free_lunch,school_state,school_zip,school_city,school_county,school_district
31991,6f75cbd5b364eceb21f8ced437af9eab,Bonita Street Elementary School,3,70.0,California,90745,Carson,Los Angeles,Los Angeles Unif Sch Dist
60958,d59e19f1938d61dd5873e9481eefee2e,Chesterfield Technical Center @ Hull,0,34.0,Virginia,23112,Midlothian,Chesterfield,Chesterfield Co Public Schools
44457,9bad094dde23f26cb7f905c1a99230c5,El Camino Real Academy,3,95.0,New Mexico,87507,Santa Fe,Santa Fe,Santa Fe Public School Dist
37777,840d631b25316ede91b5ca2da1bf4808,Potomac Elementary School,4,41.0,Virginia,22448,Dahlgren,King George,King George Co School District
52653,b87c4ca01b70746dacc128151fea7c45,Roscoe Elementary School,3,86.0,California,91352,Sun Valley,Los Angeles,Los Angeles Unif Sch Dist


Load *Teachers.csv*. Modify headers to exclude spaces. Conversion of teacher prefix to numerical. Dates to integer (see above).

In [7]:
# Read Teachers.csv with space-free column names; the prefix is converted
# to its integer code and the first-posted date is parsed as a datetime.
teachers_dat = pd.read_csv(
    'data/Teachers.csv',
    header=0,
    names=['teacher_id', 'prefix', 'first_project_posted'],
    converters={'prefix': teacher_prefix_converter},
    parse_dates=['first_project_posted'],
)

# Replace each parsed date with its integer timestamp (whole seconds).
first_posted = teachers_dat['first_project_posted']
teachers_dat['first_project_posted'] = first_posted.apply(lambda stamp: int(stamp.timestamp()))
teachers_dat.sample(5)  # randomly sample 5

Unnamed: 0,teacher_id,prefix,first_project_posted
269954,ab77cfa616357ae1329eabb2c962c5dd,1,1415664000
355971,e22f537fac501c52232b8e29dada6e0f,1,1381968000
376159,eef3047adfea55b2c6f88a0e25c2383d,2,1410480000
235245,959a6902c5001039c1a8d0641949aa5d,0,1365206400
338210,d6e0fb4bc945f2e16f8409d40c2a2784,1,1480809600


Load *Resources.csv*. Modify headers to exclude spaces. Remove any rows without `quantity` or `price`, two key variables of the dataset.

In [8]:
# Read Resources.csv with space-free column names, then discard every row
# that is missing either of the two key variables, quantity or price.
resources_dat = pd.read_csv(
    'data/Resources.csv',
    header=0,
    names=['project_id', 'resource', 'quantity', 'price', 'vendor'],
).dropna(axis=0, subset=['quantity', 'price'])

resources_dat.sample(5)  # randomly sample 5

Unnamed: 0,project_id,resource,quantity,price,vendor
6133121,d985d3cfd05a2ad1ec79dcb6ffab4384,lego(r) classic green baseplate (10700),16.0,8.46,Kaplan Early Learning Company
3415034,790a01d6ab9becef9f521b5f68c357e8,lego: the lego movie: junior novel,3.0,5.99,Amazon Business
1757975,3e78fc016185e2094a4b2cb4fff8eea3,"cricut standardgrip adhesive cutting mat, 12 b...",1.0,11.86,Amazon Business
4248468,96d58d2c0f9ff35c8ca7f5746d1d8f6a,"bintiva inflated stability wobble cushion, inc...",1.0,15.99,Amazon Business
5720770,cac1c5ace238b4ed6260a2774f43ea55,"otterbox defender series case for ipad mini 3,...",1.0,69.95,Amazon Business


Load two partitions of *Projects.csv* (previously split, code not included here). Conversion of text to integer (see above). Date values either remain null or are converted to integer (see above).

In [9]:
# LARGE FILE!
# Text-only partition of Projects.csv; the file's first column is used as
# the row index.
projects_textonly_dat = pd.read_csv(
    'data/Projects_textonly.csv',
    index_col=0,
)
projects_textonly_dat.sample(5)  # randomly sample 5

  mask |= (ar1 == a)


Unnamed: 0,project_id,title,essay,description,need_stmt
617671,5ccd8f37174c91fe183ed86476e1b444,Spread the Love of Reading to Oakland Youth,This year I have the privilege of working with...,This year I have the privilege of working with...,My students need leveled books in order to que...
586596,b9fc4dc1969d8bd2cfe55e5593e64e49,Hit The Target At Recess,"I teach the best students in the world. They ""...","I teach the best students in the world. They ""...","My students need elementary ball pack, scoop b..."
966511,06e91f3c7709fc90633c97f72ec6743c,One New iPad Needed,"My school is 98% low socio-economic school, bu...","My school is 98% low socio-economic school, bu...",My students need iPads to do all the things th...
710942,335810386e9d2b5c78fc8b8cfd8644f6,Full STEAM Ahead...iPads for Success!,I have a diverse class of students eager to ex...,I have a diverse class of students eager to ex...,My students need tablets to support their earl...
104012,bbc2ead20a7f89e6ebbcd8870c8bcdf0,Ensuring Science Success Through Intervention ...,The challenge facing my students is one of lan...,The challenge facing my students is one of lan...,My students need a Microsoft - Surface with 32...


In [10]:
# LARGE FILE!
# Non-text partition of Projects.csv: the three date columns are parsed as
# datetimes and the categorical text columns are converted to integer codes.
projects_notext_dat = pd.read_csv(
    'data/Projects_notext.csv',
    parse_dates=['posted', 'expire', 'funded'],
    converters={
        'proj_type': project_type_converter,
        'grade_level': grade_level_converter,
        'status': project_status_converter,
    },
)

# Convert each date column to integer timestamps (whole seconds), leaving
# missing dates as NaN.
for date_col in ('posted', 'expire', 'funded'):
    projects_notext_dat[date_col] = projects_notext_dat[date_col].apply(
        lambda t: np.nan if pd.isnull(t) else int(t.timestamp())
    )
projects_notext_dat.sample(5)  # randomly sample 5

Unnamed: 0,project_id,school_id,teacher_id,proj_post_seq,proj_type,cat_tree,subcat_tree,grade_level,resource_cat,cost,posted,expire,status,funded
237829,4665f611c5e11b6edd6d8d7c1233dbdb,cbf0e3dcdfc9e00835dd70c7ae0a585c,e23cfc3209544c41bffbc1b692676b91,1,2,"Literacy & Language, Applied Learning","ESL, Early Development",1,Technology,2046.52,1411171200,1421453000.0,0,
891774,ffc601160737b36270ded7a53abfd314,e055e291e471f5dc83b81757f01533a5,ddacf3f341f3dd733525c84fb51997b0,1,2,Applied Learning,Character Education,2,Supplies,553.26,1505606400,1515456000.0,1,1505606000.0
143450,ba2196ceb554ea70f4c65770a1214404,1dbd07e65359ef5106f9483ca1a12465,d8a2eb8d22cf8ba4385927a0158cc91f,1,2,"Literacy & Language, Math & Science","Literacy, Mathematics",2,Technology,592.31,1393113600,1403309000.0,1,1402963000.0
994792,1798a66df38545337b222b90cf3012a8,97e5edd6f23e6d3fef2f13a2f50868c9,6a6a4356296bfc7078217c8205d6a67a,6,2,Math & Science,Mathematics,2,Educational Kits & Games,591.79,1516579200,1526861000.0,1,1516666000.0
680131,3de384121d74c739d6889e18848f6dd0,be7187d1c5dc0caaba2e309b41d64517,f0ca7d97ba6ed6c054fea0b3c0295268,2,2,Health & Sports,Team Sports,3,Supplies,2559.4,1479254400,1489536000.0,1,1485389000.0


# Explore

What are the unique resources a project can be categorized with? From docs:

> **Project Resource Category:** The types of items being requested by a teacher. There were five resource categories prior to October 10, 2017. After this date, there are fourteen. Prior to October 10, 2017, these categories were selected by teachers during project creation. After October 10, 2017, these categories were predicted via algorithm at roughly 95% accuracy.

In [36]:
# Distinct resource-category values, in order of first appearance.
pd.unique(projects_notext_dat['resource_cat'])

array(['Technology', 'Supplies', 'Books', 'Other', 'Trips', 'Visitors',
       nan, 'Sports & Exercise Equipment', 'Computers & Tablets',
       'Flexible Seating', 'Reading Nooks, Desks & Storage',
       'Educational Kits & Games', 'Instructional Technology',
       'Art Supplies', 'Classroom Basics', 'Lab Equipment',
       'Food, Clothing & Hygiene', 'Musical Instruments'], dtype=object)

What are the unique categories and subcategories a project may be tagged as? From docs:

> **Project Subject Category Tree:** Every project can have either one or two subject categories. This is a comma-separated list of those subject categories.

> **Project Subject Subcategory Tree:** For every project subject category, there is also a project subject subcategory that contains greater specificity.

In [37]:
# Each cat_tree value is a ', '-separated list of categories. Split every
# distinct string value (non-string entries such as NaN are skipped) and
# gather the individual category names.
cats = [
    label
    for tree in projects_notext_dat['cat_tree'].unique()
    if type(tree) is str
    for label in tree.split(', ')
]

# Sorted array of the distinct category names.
u = np.unique(cats)
u

array(['Applied Learning', 'Care & Hunger', 'Health & Sports',
       'History & Civics', 'Literacy & Language', 'Math & Science',
       'Music & The Arts', 'Special Needs', 'Warmth'],
      dtype='<U19')

In [38]:
# Same extraction as for the categories, applied to subcat_tree: split each
# distinct string value on ', ' (skipping non-string entries such as NaN).
cats = [
    label
    for tree in projects_notext_dat['subcat_tree'].unique()
    if type(tree) is str
    for label in tree.split(', ')
]

# Sorted array of the distinct subcategory names.
np.unique(cats)

array(['Applied Sciences', 'Care & Hunger', 'Character Education',
       'Civics & Government', 'College & Career Prep', 'Community Service',
       'ESL', 'Early Development', 'Economics', 'Environmental Science',
       'Extracurricular', 'Financial Literacy', 'Foreign Languages',
       'Gym & Fitness', 'Health & Life Science', 'Health & Wellness',
       'History & Geography', 'Literacy', 'Literature & Writing',
       'Mathematics', 'Music', 'Nutrition Education', 'Other',
       'Parent Involvement', 'Performing Arts', 'Social Sciences',
       'Special Needs', 'Team Sports', 'Visual Arts', 'Warmth'],
      dtype='<U21')