### Connect To Datasource

In [2]:
#connect to s3 bucket to import data
import urllib.parse
import urllib.request
from pyspark.sql.types import (StructField, StringType, IntegerType, StructType)

ACCESS_KEY = ""
SECRET_KEY = ""
ENCODED_SECRET_KEY = urllib.parse.quote(SECRET_KEY, "")
AWS_BUCKET_NAME = "donorchoose"
MOUNT_NAME = "kps3"

dbutils.fs.refreshMounts()
# comment out the following line after you have successfully mounted your bucket
#dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)

In [3]:
data_root = '/tmp/'
submission_path = '/mnt/{}/submission.txt'.format(MOUNT_NAME)

In [4]:
#display datasets from s3 bucket
display(dbutils.fs.ls("/mnt/%s" % MOUNT_NAME))

path,name,size
dbfs:/mnt/kps3/Donations.csv,Donations.csv,611355459
dbfs:/mnt/kps3/Donors.csv,Donors.csv,123981264
dbfs:/mnt/kps3/Projects.csv,Projects.csv,2571524598
dbfs:/mnt/kps3/Resources.csv,Resources.csv,819349026
dbfs:/mnt/kps3/Schools.csv,Schools.csv,9652203
dbfs:/mnt/kps3/Teachers.csv,Teachers.csv,19570934
dbfs:/mnt/kps3/_metadata,_metadata,0
dbfs:/mnt/kps3/e-29EF86DNUE13OHJK10BRUOE2Y/,e-29EF86DNUE13OHJK10BRUOE2Y/,0
dbfs:/mnt/kps3/e-AWAK3LGUC2SJCO56C5HWKE0NM/,e-AWAK3LGUC2SJCO56C5HWKE0NM/,0
dbfs:/mnt/kps3/j-3FMR59IUQ2U84/,j-3FMR59IUQ2U84/,0


### Data Wrangling

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
spark = SparkSession.builder.appName('ReadData').getOrCreate()
sc = spark.sparkContext
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
sqlContext = SQLContext(sc)
from plotly.offline import plot
import plotly.graph_objs as go
import requests
requests.packages.urllib3.disable_warnings()
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, translate
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, mean, desc, round

##### Pre-Pre Processing

Initially there were problems with importing the projects.csv dataset. Special characters caused linebreak issues with the Project Essay, Project Short Description, Project Title and Project Need Statement. The issue caused the data to shift to the wrong columns and rows. In order to fix this issue, the dataset was uploaded in Jupyter notebook, and read in using Pandas. Once the file was read, I did the following cleaning:
  - Drop NA
  - Dropped columns: Project Essay, Project Short Description and Project Need Statement
  - Removed special characters from Project Title Column
Once these changes were made, I exported the dataframe to csv and re-saved it as newprojects.csv, which is read in below.

In [8]:
#set partitions 
sqlContext.setConf('spark.sql.shuffle.partitions', '6')

In [9]:
#create schema for projects .csv

projects_schema = StructType([
    StructField("Project ID", StringType(), True),
    StructField("School ID", StringType(), True),
    StructField("Teacher ID", StringType(), True),
    StructField("Teacher Project Posted Sequence", IntegerType(), True),
    StructField("Project Type", StringType(), True),
    StructField("Project Title", StringType(), True),
    StructField("Project Essay", StringType(), True),
    StructField("Project Short Description", StringType(), True),
    StructField("Project Need Statement", StringType(), True),
    StructField("Project Subject Category Tree", StringType(), True),
    StructField("Project Subject Subcategory Tree", StringType(), True),
    StructField("Project Grade Level Category", StringType(), True),
    StructField("Project Resource Category", StringType(), True),
    StructField("Project Cost", FloatType(), True),
    StructField("Project Posted Date", StringType(), True),
    StructField("Project Expiration Date", StringType(), True),
    StructField("Project Current Status", StringType(), True),
    StructField("Project Fully Funded Date", StringType(), True),
])

In [10]:
#import datasets from s3 bucket
donations_df = spark.read.csv("dbfs:/mnt/kps3/Donations.csv", header =True, inferSchema = True)
donors_df = spark.read.csv("dbfs:/mnt/kps3/Donors.csv", header =True,  inferSchema = True)
projects_df = spark.read.csv("dbfs:/mnt/kps3/Projects.csv", header =True, multiLine = True, schema = projects_schema, escape='"')
resources_df = spark.read.csv("dbfs:/mnt/kps3/Resources.csv", header =True,  inferSchema = True)
Teachers = spark.read.csv("dbfs:/mnt/kps3/Teachers.csv", header =True,  inferSchema = True)
schools_df = spark.read.csv("dbfs:/mnt/kps3/Schools.csv", header =True, inferSchema = True)

In [11]:
#join dataframes together for future modeling and feature engineering
donor_donations = donations_df.join(donors_df, on = ['Donor ID'], how = 'inner')
projects_schools = projects_df.join(schools_df, on = ['School ID'], how = 'inner')
donor_projects =  donor_donations.join(projects_schools, on = ['Project ID'], how = 'left')
projects_teachers = donor_projects.join(Teachers, on = ['Teacher ID'], how = 'inner')
df = projects_teachers.join(resources_df, on = ['Project ID'], how = 'inner')

In [12]:
#inspect combined dataset column types
df.printSchema()

In [13]:
#null values in city name due to washington DC - school state and school county are both district of columbia
df = df.fillna({'School City':'DC'})

In [14]:
#rename all columns and create new df
df_renamed = df.withColumnRenamed('Project ID', 'project_id').withColumnRenamed('Teacher ID', 'teacher_id').withColumnRenamed('Donor ID', 'donor_id').withColumnRenamed('Donation ID', 'donation_id').withColumnRenamed('Donation Included Optional Donation', 'optional_donation').withColumnRenamed('Donation Amount', 'donation_amount'). withColumnRenamed('Donor Cart Sequence', 'donor_cart_sequence').withColumnRenamed('Donation Received Date','donation_received_date').withColumnRenamed('Donor City', 'donor_city').withColumnRenamed('Donor State', 'donor_state').withColumnRenamed('Donor Is Teacher', 'donor_is_teacher').withColumnRenamed('Donor Zip', 'donor_zip').withColumnRenamed('School ID', 'school_id').withColumnRenamed('Teacher Project Posted Sequence', 'teacher_project_posted_seq').withColumnRenamed('Project ID', 'project_id').withColumnRenamed('Project Type', 'project_type').withColumnRenamed('Project Title', 'project_title').withColumnRenamed('Project Subject Category Tree', 'project_cat').withColumnRenamed('Project Subject Subcategory Tree', 'project_cat2').withColumnRenamed('Project Grade Level Category', 'project_grade_level_cat').withColumnRenamed('Project Resource Category', 'project_resource_cat').withColumnRenamed('Project Cost', 'project_cost').withColumnRenamed('Project Posted Date', 'project_posted_date').withColumnRenamed('Project Expiration Date', 'project_exp_date').withColumnRenamed('Project Current Status', 'project_curr_stat').withColumnRenamed('Project Fully Funded Date', 'project_fully_funded_date').withColumnRenamed('School Name', 'school_name').withColumnRenamed('School Metro Type', 'school_metro_type').withColumnRenamed('School Percentage Free Lunch', 'school_percent_free_lunch').withColumnRenamed('School State', 'school_state').withColumnRenamed('School Zip', 'school_zip').withColumnRenamed('School City', 'school_city').withColumnRenamed('School County', 'school_county').withColumnRenamed('School District', 'school_district').withColumnRenamed('Teacher Prefix', 'teacher_prefix').withColumnRenamed('Teacher First Project Posted Date', 'teacher_first_proj').withColumnRenamed('Resource Item Name', 'resource_item_name').withColumnRenamed('Resource Quantity', 'resource_quantity').withColumnRenamed('Resource Unit Price', 'resource_unit_price').withColumnRenamed('Resource Vendor Name', 'resource_vendor_name')

In [15]:
#cast timestamps
df_renamed = df_renamed.withColumn("donation_received_date",to_date(unix_timestamp(col("donation_received_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_posted_date",to_date(unix_timestamp(col("project_posted_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_exp_date",to_date(unix_timestamp(col("project_exp_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_fully_funded_date",to_date(unix_timestamp(col("project_fully_funded_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("teacher_first_proj",to_date(unix_timestamp(col("teacher_first_proj"), "yyyy-MM-dd").cast("timestamp")))

In [16]:
df_renamed.printSchema()

In [17]:
#create new date columns for future data exploration & modeling
df_renamed = df_renamed.withColumn('teacher_date_exp_dif',datediff(df_renamed.project_exp_date.cast('date'),df_renamed.teacher_first_proj.cast('date')))
df_renamed= df_renamed.withColumn('project_days',datediff(df_renamed.project_exp_date.cast('date'),df_renamed.project_posted_date.cast('date')))

df_renamed = df_renamed.withColumn('post_date_day', date_format(col('project_posted_date'), "E"))
df_renamed = df_renamed.withColumn('post_date_month', month(df_renamed['project_posted_date']))
df_renamed = df_renamed.withColumn('post_date_year', year(df_renamed['project_posted_date']))

df_renamed = df_renamed.withColumn('donation_day', date_format(col('donation_received_date'), "E"))
df_renamed = df_renamed.withColumn('donation_month', month(df_renamed['donation_received_date']))
df_renamed = df_renamed.withColumn('donation_year', year(df_renamed['donation_received_date']))

df_renamed = df_renamed.withColumn('t_proj_post_day', date_format(col('teacher_first_proj'), "E"))
df_renamed = df_renamed.withColumn('t_proj_post_month', month(df_renamed['teacher_first_proj']))
df_renamed = df_renamed.withColumn('t_proj_post_year', year(df_renamed['teacher_first_proj']))


In [18]:
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
#cast column types for ints and floats that weren't inferred at import

df_renamed = df_renamed.withColumn('school_percent_free_lunch',df_renamed.school_percent_free_lunch.cast('float'))
df_remamed = df_renamed.withColumn('donor_cart_sequence',df_renamed.donor_cart_sequence.cast('int'))
df_renamed = df_renamed.withColumn('donation_amount',df_renamed.donation_amount.cast('float'))
df_renamed = df_renamed.withColumn('teacher_project_posted_seq',df_renamed.teacher_project_posted_seq.cast('int'))
df_renamed = df_renamed.withColumn('project_cost',df_renamed.project_cost.cast('float'))
df_renamed = df_renamed.withColumn('resource_quantity',df_renamed.resource_quantity.cast('float'))
df_renamed = df_renamed.withColumn('resource_unit_price',df_renamed.resource_unit_price.cast('float'))
df_renamed = df_renamed.withColumn('resource_unit_price',df_renamed.resource_unit_price.cast('float'))

In [19]:
#split main categories and sub categories as they allow for two entries per column
split_col = split(df_renamed['project_cat'], ',')
df_renamed = df_renamed.withColumn('main_cat1', split_col.getItem(0))
df_renamed = df_renamed.withColumn('main_cat2', split_col.getItem(1))
split_col = split(df_renamed['project_cat2'], ',')
df_renamed = df_renamed.withColumn('subcat1', split_col.getItem(0))
df_renamed = df_renamed.withColumn('subcat2', split_col.getItem(1))

In [20]:
display(df_renamed)

project_id,teacher_id,donor_id,donation_id,optional_donation,donation_amount,donor_cart_sequence,donation_received_date,donor_city,donor_state,donor_is_teacher,donor_zip,school_id,teacher_project_posted_seq,project_type,project_title,Project Essay,Project Short Description,Project Need Statement,project_cat,project_cat2,project_grade_level_cat,project_resource_cat,project_cost,project_posted_date,project_exp_date,project_curr_stat,project_fully_funded_date,school_name,school_metro_type,school_percent_free_lunch,school_state,school_zip,school_city,school_county,school_district,teacher_prefix,teacher_first_proj,resource_item_name,resource_quantity,resource_unit_price,resource_vendor_name,teacher_date_exp_dif,project_days,post_date_day,post_date_month,post_date_year,donation_day,donation_month,donation_year,t_proj_post_day,t_proj_post_month,t_proj_post_year,main_cat1,main_cat2,subcat1,subcat2
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,aa620x - addition fluency puzzles - complete set,1.0,55.0,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,jj518 - classroom magnetic letters kit,1.0,42.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ee590x - roll & read phonics games - complete set,1.0,69.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,gg294 - fill-in-the-blank phonics stamps - set 2,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ra313 - unifix® cubes,2.0,24.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,hh445x - reading match-ups - set 2,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,dd468 - visualize math write & wipe boards,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,aa620x - addition fluency puzzles - complete set,1.0,55.0,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,jj518 - classroom magnetic letters kit,1.0,42.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers materials are vital to meeting my students at a variety of academic and language levels. I'm so excited to get to know my next group of kids and reach our goal of EVERY student leaving first grade performing on or above grade level - in the GREEN! This may seem ambitious, but I believe every single student can achieve it with the right support and resources. Thank you for your support! With all of the great developments in education, one thing I've learned over the past five years is how important basic hands-on learning is - and how much kids love it! This year, I want to give my students more opportunities to learn using games and puzzles instead of just paper or computers. Not only do games, puzzles, and activities such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. to keep kids engaged, they provide a great opportunity for them to learn teamwork and social skills. While I am pulling small groups in reading and math, students work together to solve problems, build puzzles, try new activities, and learn a new skill. It's important to teach kids that we can learn while we have fun and build great attitudes towards school at an early age.","This year, I'm welcoming 22 new first graders to my classroom! My school is Title I and over 70% of our students are English Language Learners. I love teaching ELL students, and new centers...","My students need new centers materials such as Roll & Read Phonics Games, Addition Fluency Puzzles, Unifix® Cubes, Classroom Magnetic Letters Kit, Reading Match-Ups - Set 2, etc. for math and reading to help us reach our learning goals for 2016-17 school year!","Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ee590x - roll & read phonics games - complete set,1.0,69.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011,Literacy & Language,Math & Science,Literacy & Language,Math & Science


### Exploratory Data Analysis

In [22]:
#total number of rows in joined df
df_renamed.count()

In [23]:
#total number of donors
df_renamed.select("donor_id").distinct().count()

In [24]:
#total number of donations
df_renamed.select("donation_id").distinct().count()

In [25]:
#create grouped dataframes by project id and project resources
#group data by project id - each row for the same project id
donations_grouped = df_renamed.groupBy('project_id').agg({'donation_id':'count','donation_amount':'avg', 'donation_amount':'sum'})
donations_grouped = donations_grouped.withColumnRenamed('project_id','project_id_2')

resources_grouped = df_renamed.groupBy('project_id').agg({'resource_item_name':'count','resource_unit_price':'sum','resource_quantity':'sum'})
resources_grouped = resources_grouped.withColumnRenamed('project_id','project_id_2')

In [26]:
display(donations_grouped)

project_id_2,sum(donation_amount),count(donation_id)
0000c0bdc0f15bd239cfffa884791a10,2944.7600021362305,105
0000d299ce46c8375f29f7bb792b9eae,80.0,3
0000fe73a95dae43c4bd72a142760efb,172.43999862670898,8
000177bef7ed7b7d1d0f5741d0b5fab8,540.6199951171875,4
0001a55b63eb85dfa06dac45fd0883b8,350.0,7
0002555bbe359440d6ceb34b699d3932,475.0,9
0002aa294cef2e329afa06d1198d0c6c,10174.11999130249,114
000475c3717556a33ecb54772dac9db5,938.6200001239775,36
0004c4500d7e39380bfa9ba4905bc4f8,270.1999969482422,4
000562cc943f4bb8aa6db6cd9baeefcf,669.399995803833,25


In [27]:
display(resources_grouped)

project_id_2,count(resource_item_name),sum(resource_unit_price),sum(resource_quantity)
0000c0bdc0f15bd239cfffa884791a10,105,4244.099979400635,120.0
0000d299ce46c8375f29f7bb792b9eae,3,321.17999267578125,12.0
0000fe73a95dae43c4bd72a142760efb,8,187.12000274658203,16.0
000177bef7ed7b7d1d0f5741d0b5fab8,4,106.07999992370604,16.0
0001a55b63eb85dfa06dac45fd0883b8,7,588.7700004577637,8.0
0002555bbe359440d6ceb34b699d3932,9,1531.7100219726562,27.0
0002aa294cef2e329afa06d1198d0c6c,114,2143.439992904663,138.0
000475c3717556a33ecb54772dac9db5,36,3456.3599395751958,72.0
0004c4500d7e39380bfa9ba4905bc4f8,4,1554.199951171875,4.0
000562cc943f4bb8aa6db6cd9baeefcf,25,622.1500015258789,25.0


In [28]:
#donation breakdowns by school
school_grouped = df_renamed.groupBy('school_id').agg({'project_id':'count', 'donation_amount':'sum'})

In [29]:
display(school_grouped)

school_id,sum(donation_amount),count(project_id)
c173f2ec4ca98447af348a893c46cd27,1257.1399955749512,17
df6ecb69520adfd062a1ed832e8c14ce,7597.740036010742,150
de38a5d97a7e2ed86927ec24ff374e6a,268.87000274658203,6
973260aeefac130efc735913f7904dea,209659.77981567383,3797
fa3c3c18a1fe56b9547292914627d133,334017.1596645117,11976
2dd836814d68bdd1c16fbd88ac810f31,442749.7720236778,5329
fb4aa984cccb6ceebdba39e32b37f0a5,4375.4500160217285,128
4ca13c43c97d5068c66638d238f8faa6,350076.9593696594,5957
def820c2ac31ce22a85e4efaba272e8a,192663.5090227127,1314
a486017eb747b89817061c7c3b3a04e6,28535.18987441063,755


In [30]:
#most frequently donated amounts
donation_top = df_renamed.groupBy("donation_amount").count()
donation_top = donation_top.orderBy(desc('count'))
display(donation_top.take(10))

donation_amount,count
25.0,5840673
50.0,5231285
10.0,2765390
100.0,2123106
1.0,1995368
20.0,1840746
5.0,965861
15.0,544775
2.0,496496
30.0,477843


##### Geographic Breakdown of Donations and Donors

In [32]:
#top donation TOTALS based on donor city
df_city2 = df_renamed.dropna(subset = ['donor_city'])
df_city2 = df_city2.groupBy("donor_city").sum("donation_amount")
df_city2 = df_city2.orderBy(desc('sum(donation_amount)'))
display(df_city2.take(10))


donor_city,sum(donation_amount)
New York,59866644.00822714
Chicago,37387078.27272779
San Francisco,31772739.75896436
Brooklyn,27372316.829140782
Seattle,21970238.294944763
Los Angeles,21463608.42353976
Houston,18310738.472555563
Portland,15001804.73082909
Washington,14122499.878153728
Indianapolis,11713870.480494888


In [33]:
#top donation TOTALS based on donor state
df_state = df_renamed.groupBy("donor_state").sum("donation_amount")
df_state = df_state.orderBy(desc('sum(donation_amount)'))
display(df_state.take(10))

donor_state,sum(donation_amount)
California,312282151.8997923
New York,173580507.67449126
Texas,136589328.36088562
Illinois,108624501.74141663
Florida,91595262.10115324
Massachusetts,73980828.91797316
North Carolina,69269240.0851383
Washington,58951649.07191147
Pennsylvania,55632143.38070137
other,52125842.33033627


In [34]:
#total unique donations by city
city_unique = df_renamed.dropna(subset = ['donor_city'])
city_unique = city_unique.groupBy('donor_city').count()
city_unique= city_unique.orderBy(desc('count'))
display(city_unique.take(10))

donor_city,count
Chicago,634464
New York,571586
Brooklyn,467900
Los Angeles,329851
San Francisco,320834
Seattle,254305
Houston,238563
Portland,227726
Indianapolis,197882
Philadelphia,195029


In [35]:
#total unique donations by state
state_unique = df_renamed.groupBy('donor_state', 'donation_id').count()
state_unique = state_unique.orderBy(desc('count'))
display(state_unique.take(10))

donor_state,count
California,4403128
New York,2407959
Texas,1787389
Illinois,1575997
Florida,1352446
other,1146211
North Carolina,1129428
Pennsylvania,934841
Massachusetts,866748
Georgia,856183


##### Time Series Analysis

In [37]:
#number of donations year over year
donations_year = df_renamed.dropDuplicates(subset = ['donation_id'])
donations_year = donations_year.groupBy('donation_year').agg({'donation_id':'count'})
donations_year = donations_year.orderBy(desc('donation_year'))
display(donations_year)

donation_year,count(donation_id)
2018,425209
2017,1166667
2016,937531
2015,772162
2014,736819
2013,569500
2012,149


In [38]:
#number of projects year over year
project_year = df_renamed.dropDuplicates(subset = ['project_id'])
project_year = project_year.groupBy('post_date_year').agg({'project_id':'count'})
project_year = project_year.orderBy(desc('post_date_year'))
display(project_year)

post_date_year,count(project_id)
2018,86577
2017,224485
2016,187354
2015,143279
2014,128719
2013,102752


In [39]:
#number of unique donors year over year
donors_year = df_renamed.dropDuplicates(subset = ['donor_id'])
donors_year = donors_year.groupBy('donation_year').agg({'donor_id':'count'})
donors_year = donors_year.orderBy(desc('donation_year'))
display(donors_year)

donation_year,count(donor_id)
2018,171248
2017,473139
2016,446936
2015,348995
2014,299678
2013,263076
2012,116


In [40]:
#project posting day distribution
proj_day = df_renamed.dropDuplicates(subset = ['project_id'])
proj_day = proj_day.groupBy('post_date_day', 'post_date_year').agg({'project_id':'count'})
proj_day = proj_day.orderBy(desc('count(project_id)'))
display(proj_day)

post_date_day,post_date_year,count(project_id)
Mon,2017,44512
Sun,2016,33874
Fri,2017,33311
Sat,2017,32994
Tue,2017,32645
Sun,2015,31010
Mon,2016,30277
Sat,2016,29979
Sun,2017,29855
Wed,2017,27278


In [41]:
#teacher project posting day distribution
tproj_day = df_renamed.dropDuplicates(subset = ['project_id'])
tproj_day = tproj_day.groupBy('t_proj_post_day', 't_proj_post_year').agg({'project_id':'count'})
tproj_day = tproj_day.orderBy(desc('count(project_id)'))
display(tproj_day)

t_proj_post_day,t_proj_post_year,count(project_id)
Sat,2014,28253
Sun,2014,27948
Sun,2015,27486
Sun,2013,26817
Sun,2016,25147
Sat,2016,22657
Mon,2016,22397
Mon,2017,21579
Sat,2015,20068
Sat,2013,19924


In [42]:
#total project cost by year
proj_cost = df_renamed.groupBy('donation_year').agg({'donation_amount':'sum'})
proj_cost = proj_cost.orderBy(desc('donation_year'))
display(proj_cost)

donation_year,sum(donation_amount)
2018,162463582.31924185
2017,495831431.7237168
2016,411513322.4515216
2015,331728922.53462386
2014,278460904.34735346
2013,240760646.9071443
2012,73665.63999938965


##### Donors Exploration

In [44]:
#donor is teacher
teachers_donors = df_renamed.dropDuplicates(subset = ['donor_id'])
teacher_donors = teachers_donors.groupby('donor_id',"donor_is_teacher").count().groupby('donor_is_teacher').count()
display(teacher_donors)

donor_is_teacher,count
No,1794203
Yes,208985


In [45]:
#total number of times donations were made teacher vs. non teacher
eda1b = df_renamed.groupby('donor_id',"donor_is_teacher").count().withColumnRenamed("count", "num_times")
eda1c = eda1b.groupby('donor_is_teacher').agg(count('donor_id'), sum('num_times'))\
.withColumnRenamed("count(donor_id)", "count_all_donor").withColumnRenamed("sum(num_times)", "total_donation_times")
display(eda1c)

donor_is_teacher,count_all_donor,total_donation_times
No,1794203,21350928
Yes,208985,8069411


In [46]:
#distribution of reoccuring donations among teachers and non-teachers
eda2 = df_renamed.groupby('donor_id',"donor_is_teacher").count().filter('count > 1')
eda2 = eda2.withColumnRenamed("count", "num_times")
eda2a = eda2.groupby('donor_is_teacher').count().withColumnRenamed("count", "count_multi_times_donor")
eda2b = eda2.groupby('donor_is_teacher').agg(count('donor_id'), sum('num_times'))\
.withColumnRenamed("count(donor_id)", "count_multi_times_donor").withColumnRenamed("sum(num_times)", "total_donation_times")
display(eda2b)

donor_is_teacher,count_multi_times_donor,total_donation_times
No,1311047,20867772
Yes,181694,8042120


##### School & Project Exploration

In [48]:
#average project costs by school metro type
metro_cost = df_renamed.dropDuplicates(subset = ['school_id'])
metro_cost = metro_cost.groupBy('school_metro_type').agg({'project_cost':'avg'})
display(metro_cost)

school_metro_type,avg(project_cost)
rural,735.9886322716821
urban,749.9243088062658
unknown,742.2641204054916
suburban,755.120209185933
town,718.3695344899791


In [49]:
#count of school metro types
metro_type = df_renamed.dropDuplicates(subset = ['project_id'])
metro_type = metro_type.groupby('school_metro_type', 'project_curr_stat').count().sort('count',ascending =False)
display(metro_type)

school_metro_type,project_curr_stat,count
urban,Fully Funded,349862
suburban,Fully Funded,209577
rural,Fully Funded,70080
unknown,Fully Funded,59697
urban,Expired,55993
suburban,Expired,44190
town,Fully Funded,31631
rural,Expired,15137
unknown,Expired,12017
town,Expired,7404


In [50]:
#average project funding time by the school type and project status
funding = df_renamed.dropDuplicates(subset = ['project_id'])
funding = funding.groupby('school_metro_type', 'project_curr_stat').agg({'project_days':'avg'})
display(metro_type)

school_metro_type,project_curr_stat,count
urban,Fully Funded,349862
suburban,Fully Funded,209577
rural,Fully Funded,70080
unknown,Fully Funded,59697
urban,Expired,55993
suburban,Expired,44190
town,Fully Funded,31631
rural,Expired,15137
unknown,Expired,12017
town,Expired,7404


In [51]:
#average project costs by type of project
project_cost = df_renamed.dropDuplicates(subset = ['project_id'])
project_cost = project_cost.groupBy('project_type', 'project_curr_stat').agg({'project_cost':'avg'})
display(project_cost)

project_type,project_curr_stat,avg(project_cost)
Professional Development,Fully Funded,988.7079043440132
Professional Development,Live,1383.2885461936437
Teacher-Led,Expired,1065.421054688449
Student-Led,Live,1203.621097512441
Teacher-Led,Fully Funded,600.4205765279733
Teacher-Led,Live,861.2574573674705
Student-Led,Fully Funded,1034.4092057334126
Student-Led,Expired,2541.9599253336587
Professional Development,Expired,1104.2368683339098


In [52]:
#top resource vendors
resources = df_renamed.dropDuplicates(subset = ['project_id'])
resources = df_renamed.groupby('resource_vendor_name').count().sort('count',ascending =False)
display(resources.take(10))

resource_vendor_name,count
Amazon Business,12822178
AKJ Education,4500778
Lakeshore Learning Materials,3983550
Best Buy Education,1068311
School Specialty,960120
Quill.com,766305
Kaplan Early Learning Company,739847
Blick Art Materials,617471
,576411
Woodwind and Brasswind,479605


In [53]:
#total schools per city
schools_city = df_renamed.dropDuplicates(subset = ['school_id'])
schools_city = schools_city.groupBy("school_city").count()
schools_city= schools_city.orderBy(desc('count'))
display(schools_city.take(10))

school_city,count
New York City,1982
Chicago,718
Houston,690
Los Angeles,619
Philadelphia,368
San Antonio,353
Dallas,337
Phoenix,298
Baltimore,286
Columbus,273


In [54]:
#total schools per state
schools_state = df_renamed.groupBy("school_state").count()
schools_state= schools_state.orderBy(desc('count'))
display(schools_state.take(10))

school_state,count
California,4304237
New York,2346367
Texas,1995018
Illinois,1640931
Florida,1474102
North Carolina,1281494
Michigan,913630
Pennsylvania,904057
Georgia,883011
South Carolina,788193


In [55]:
status_avg = df_renamed.groupby('project_id',"project_cost", 'project_curr_stat').sum()
status_avgn = status_avg.select('project_id', 'project_curr_stat', 'project_cost')
display(status_avgn)

project_id,project_curr_stat,project_cost
b0958c005f4a2677eef6bb9641a984f2,Fully Funded,288.65
b0965c91baec1c42d33eb6c735659f25,Fully Funded,330.78
b0976fc47913cb25fa3b5bdd7adac153,Fully Funded,278.94
b097dc76e3853b655c07c94b64a1454c,Fully Funded,357.78
b098d738674642cb5c1feed74bf4fd07,Fully Funded,723.62
b099267323240c41f370cd0edade0c1e,Fully Funded,262.95
b099417dadc98c5e754d2b42538d4169,Expired,989.39
b09a0b5dfd0f4e26dd1089ac3f58e625,Fully Funded,207.19
b09a157a435e6ede14b43153dacfa694,Fully Funded,871.18
b09a30391e30b0e6a2c2f1dccab09d45,Live,355.53


In [56]:
test_table = df_renamed.registerTempTable('test_table')
test = spark.sql(
'''select project_curr_stat, main_cat1, count(*) as count from test_table
group by main_cat1, project_curr_stat
''')
display(test)

project_curr_stat,main_cat1,count
Fully Funded,Applied Learning,3174999
Expired,Literacy & Language,1162501
Fully Funded,Health & Sports,1175462
Live,Math & Science,49957
Live,Special Needs,11571
Fully Funded,Literacy & Language,13798758
Expired,Health & Sports,103125
Expired,Warmth,6559
Expired,Special Needs,85269
Fully Funded,Warmth,326022


In [57]:
test_table = df_renamed.registerTempTable('test_table')
test = spark.sql(
'''select project_curr_stat, main_cat2, count(*) as count from test_table
group by main_cat2, project_curr_stat
''')
display(test)

project_curr_stat,main_cat2,count
Fully Funded,Special Needs,2324057
Expired,Literacy & Language,191386
Expired,Music & The Arts,136453
Live,Literacy & Language,26687
Expired,Applied Learning,44786
Live,Special Needs,29074
Live,,165776
Fully Funded,Health & Sports,307082
Fully Funded,History & Civics,603396
Expired,Health & Sports,22823


In [58]:
#top main categories vendors
maincats = df_renamed.groupby('main_cat1').count().sort('count',ascending =False)
display(maincats.take(10))

main_cat1,count
Literacy & Language,15097325
Math & Science,5281522
Applied Learning,3486772
Music & The Arts,1546474
History & Civics,1372551
Health & Sports,1290419
Special Needs,1007901
Warmth,336642
,733
