### Connect To Datasource

In [2]:
#connect to s3 bucket to import data
import urllib.parse
import urllib.request
from pyspark.sql.types import (StructField, StringType, IntegerType, StructType)

ACCESS_KEY = ""
SECRET_KEY = ""
ENCODED_SECRET_KEY = urllib.parse.quote(SECRET_KEY, "")
AWS_BUCKET_NAME = "donorchoose"
MOUNT_NAME = "kps3"

dbutils.fs.refreshMounts()
# comment out the following line after you have successfully mounted your bucket
#dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)

In [3]:
data_root = '/tmp/'
submission_path = '/mnt/{}/submission.txt'.format(MOUNT_NAME)

In [4]:
#display datasets from s3 bucket
display(dbutils.fs.ls("/mnt/%s" % MOUNT_NAME))

path,name,size
dbfs:/mnt/kps3/Donations.csv,Donations.csv,611355459
dbfs:/mnt/kps3/Donors.csv,Donors.csv,123981264
dbfs:/mnt/kps3/Projects.csv,Projects.csv,2571524598
dbfs:/mnt/kps3/Resources.csv,Resources.csv,819349026
dbfs:/mnt/kps3/Schools.csv,Schools.csv,9652203
dbfs:/mnt/kps3/Teachers.csv,Teachers.csv,19570934
dbfs:/mnt/kps3/newprojects.csv,newprojects.csv,293982042


### Data Wrangling

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
spark = SparkSession.builder.appName('ReadData').getOrCreate()
sc = spark.sparkContext
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
sqlContext = SQLContext(sc)
from plotly.offline import plot
import plotly.graph_objs as go
import requests
requests.packages.urllib3.disable_warnings()
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, translate
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, mean, desc, round

##### Pre-Pre Processing

Initially there were problems with importing the projects.csv dataset. Special characters caused linebreak issues with the Project Essay, Project Short Description, Project Title and Project Need Statement. The issue caused the data to shift to the wrong columns and rows. In order to fix this issue, the dataset was uploaded in Jupyter notebook, and read in using Pandas. Once the file was read, I did the following cleaning:
  - Drop NA
  - Dropped columns: Project Essay, Project Short Description and Project Need Statement
  - Removed special characters from Project Title Column
Once these changes were made, I exported the dataframe to csv and re-saved it as newprojects.csv, which is read in below.

In [8]:
#set partitions 
sqlContext.setConf('spark.sql.shuffle.partitions', '6')

In [9]:
#import datasets from s3 bucket
donations_df = spark.read.csv("dbfs:/mnt/kps3/Donations.csv", header =True, inferSchema = True)
donors_df = spark.read.csv("dbfs:/mnt/kps3/Donors.csv", header =True,  inferSchema = True)
projects_df = spark.read.csv("dbfs:/mnt/kps3/newprojects.csv", header =True, inferSchema = True)
resources_df = spark.read.csv("dbfs:/mnt/kps3/Resources.csv", header =True,  inferSchema = True)
Teachers = spark.read.csv("dbfs:/mnt/kps3/Teachers.csv", header =True,  inferSchema = True)
schools_df = spark.read.csv("dbfs:/mnt/kps3/Schools.csv", header =True, inferSchema = True)

In [10]:
#checking cleaned projects df - removed project essay/project need statement/project short description
display(projects_df.groupBy('Project Current Status').count())

Project Current Status,count
Live,41851
Expired,241402
Fully Funded,826764


In [11]:
#join dataframes together for future modeling and feature 
donor_donations = donations_df.join(donors_df, on = ['Donor ID'], how = 'inner')
projects_schools = projects_df.join(schools_df, on = ['School ID'], how = 'inner')
donor_projects =  donor_donations.join(projects_schools, on = ['Project ID'], how = 'left')
projects_teachers = donor_projects.join(Teachers, on = ['Teacher ID'], how = 'inner')
df = projects_teachers.join(resources_df, on = ['Project ID'], how = 'inner')

In [12]:
#inspect combined dataset column types
df.printSchema()

In [13]:
#null values in city name due to washington DC - school state and school county are both district of columbia
df = df.fillna({'School City':'DC'})

In [14]:
#rename all columns and create new df
df_renamed = df.withColumnRenamed('Project ID', 'project_id').withColumnRenamed('Teacher ID', 'teacher_id').withColumnRenamed('Donor ID', 'donor_id').withColumnRenamed('Donation ID', 'donation_id').withColumnRenamed('Donation Included Optional Donation', 'optional_donation').withColumnRenamed('Donation Amount', 'donation_amount'). withColumnRenamed('Donor Cart Sequence', 'donor_cart_sequence').withColumnRenamed('Donation Received Date','donation_received_date').withColumnRenamed('Donor City', 'donor_city').withColumnRenamed('Donor State', 'donor_state').withColumnRenamed('Donor Is Teacher', 'donor_is_teacher').withColumnRenamed('Donor Zip', 'donor_zip').withColumnRenamed('School ID', 'school_id').withColumnRenamed('Teacher Project Posted Sequence', 'teacher_project_posted_seq').withColumnRenamed('Project ID', 'project_id').withColumnRenamed('Project Type', 'project_type').withColumnRenamed('Project Title', 'project_title').withColumnRenamed('Project Subject Category Tree', 'project_cat').withColumnRenamed('Project Subject Subcategory Tree', 'project_cat2').withColumnRenamed('Project Grade Level Category', 'project_grade_level_cat').withColumnRenamed('Project Resource Category', 'project_resource_cat').withColumnRenamed('Project Cost', 'project_cost').withColumnRenamed('Project Posted Date', 'project_posted_date').withColumnRenamed('Project Expiration Date', 'project_exp_date').withColumnRenamed('Project Current Status', 'project_curr_stat').withColumnRenamed('Project Fully Funded Date', 'project_fully_funded_date').withColumnRenamed('School Name', 'school_name').withColumnRenamed('School Metro Type', 'school_metro_type').withColumnRenamed('School Percentage Free Lunch', 'school_percent_free_lunch').withColumnRenamed('School State', 'school_state').withColumnRenamed('School Zip', 'school_zip').withColumnRenamed('School City', 'school_city').withColumnRenamed('School County', 'school_county').withColumnRenamed('School District', 'school_district').withColumnRenamed('Teacher Prefix', 'teacher_prefix').withColumnRenamed('Teacher First Project Posted Date', 'teacher_first_proj').withColumnRenamed('Resource Item Name', 'resource_item_name').withColumnRenamed('Resource Quantity', 'resource_quantity').withColumnRenamed('Resource Unit Price', 'resource_unit_price').withColumnRenamed('Resource Vendor Name', 'resource_vendor_name')

In [15]:
#cast timestamps
df_renamed = df_renamed.withColumn("donation_received_date",to_date(unix_timestamp(col("donation_received_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_posted_date",to_date(unix_timestamp(col("project_posted_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_exp_date",to_date(unix_timestamp(col("project_exp_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("project_fully_funded_date",to_date(unix_timestamp(col("project_fully_funded_date"), "yyyy-MM-dd").cast("timestamp")))
df_renamed = df_renamed.withColumn("teacher_first_proj",to_date(unix_timestamp(col("teacher_first_proj"), "yyyy-MM-dd").cast("timestamp")))

In [16]:
df_renamed.printSchema()

In [17]:
#create new date columns for future data exploration & modeling
df_renamed = df_renamed.withColumn('teacher_date_exp_dif',datediff(df_renamed.project_exp_date.cast('date'),df_renamed.teacher_first_proj.cast('date')))
df_renamed= df_renamed.withColumn('project_days',datediff(df_renamed.project_exp_date.cast('date'),df_renamed.project_posted_date.cast('date')))

df_renamed = df_renamed.withColumn('post_date_day', date_format(col('project_posted_date'), "E"))
df_renamed = df_renamed.withColumn('post_date_month', month(df_renamed['project_posted_date']))
df_renamed = df_renamed.withColumn('post_date_year', year(df_renamed['project_posted_date']))

df_renamed = df_renamed.withColumn('donation_day', date_format(col('donation_received_date'), "E"))
df_renamed = df_renamed.withColumn('donation_month', month(df_renamed['donation_received_date']))
df_renamed = df_renamed.withColumn('donation_year', year(df_renamed['donation_received_date']))

df_renamed = df_renamed.withColumn('t_proj_post_day', date_format(col('teacher_first_proj'), "E"))
df_renamed = df_renamed.withColumn('t_proj_post_month', month(df_renamed['teacher_first_proj']))
df_renamed = df_renamed.withColumn('t_proj_post_year', year(df_renamed['teacher_first_proj']))


In [18]:
#cast column types for ints and floats that weren't inferred at import
df_renamed = df_renamed.withColumn('school_percent_free_lunch',df_renamed.school_percent_free_lunch.cast('float'))
df_remamed = df_renamed.withColumn('donor_cart_sequence',df_renamed.donor_cart_sequence.cast('int'))
df_renamed = df_renamed.withColumn('donation_amount',df_renamed.donation_amount.cast('float'))
df_renamed = df_renamed.withColumn('teacher_project_posted_seq',df_renamed.teacher_project_posted_seq.cast('int'))
df_renamed = df_renamed.withColumn('project_cost',df_renamed.project_cost.cast('float'))
df_renamed = df_renamed.withColumn('resource_quantity',df_renamed.resource_quantity.cast('float'))
df_renamed = df_renamed.withColumn('resource_unit_price',df_renamed.resource_unit_price.cast('float'))
df_renamed = df_renamed.withColumn('resource_unit_price',df_renamed.resource_unit_price.cast('float'))

In [19]:
display(df_renamed)

project_id,teacher_id,donor_id,donation_id,optional_donation,donation_amount,donor_cart_sequence,donation_received_date,donor_city,donor_state,donor_is_teacher,donor_zip,school_id,teacher_project_posted_seq,project_type,project_title,project_cat,project_cat2,project_grade_level_cat,project_resource_cat,project_cost,project_posted_date,project_exp_date,project_curr_stat,project_fully_funded_date,school_name,school_metro_type,school_percent_free_lunch,school_state,school_zip,school_city,school_county,school_district,teacher_prefix,teacher_first_proj,resource_item_name,resource_quantity,resource_unit_price,resource_vendor_name,teacher_date_exp_dif,project_days,post_date_day,post_date_month,post_date_year,donation_day,donation_month,donation_year,t_proj_post_day,t_proj_post_month,t_proj_post_year
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,aa620x - addition fluency puzzles - complete set,1.0,55.0,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,jj518 - classroom magnetic letters kit,1.0,42.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ee590x - roll & read phonics games - complete set,1.0,69.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,gg294 - fill-in-the-blank phonics stamps - set 2,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ra313 - unifix® cubes,2.0,24.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,hh445x - reading match-ups - set 2,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,17f3f75fe1d4c9058f9a36720b312fbc,74cad43aa81e1c9b570161c0c27230e9,Yes,25.0,1,2016-07-27,Arlington,Virginia,No,222.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,dd468 - visualize math write & wipe boards,1.0,29.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Wed,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,aa620x - addition fluency puzzles - complete set,1.0,55.0,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,jj518 - classroom magnetic letters kit,1.0,42.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011
0000c0bdc0f15bd239cfffa884791a10,a28e2bf8d9adc6da2c0fbb9a3206226d,e1a9ef3d18fad06f5ab2f7c1aab38e2e,e33a13180f8e17db9d3f41200a8ca9c5,Yes,50.0,1,2016-07-19,Monroe,North Carolina,No,281.0,87d45b15ae6f5e40b72bf9e692e295c7,3,Teacher-Led,Code Green!,"Literacy & Language, Math & Science","Literacy, Mathematics",Grades PreK-2,Supplies,424.74,2016-07-18,2016-11-17,Fully Funded,2016-07-27,Brightwood Education Center,urban,95.0,District of Columbia,20011.0,DC,District Of Columbia,Dc Public Schools,Ms.,2011-09-03,ee590x - roll & read phonics games - complete set,1.0,69.99,Lakeshore Learning Materials,1902,122,Mon,7,2016,Tue,7,2016,Sat,9,2011


In [20]:
#remove live projects
df_renamed = df_renamed.filter(df_renamed['project_curr_stat'] != 'Live')
display(df_renamed.groupBy('project_curr_stat').count())

project_curr_stat,count
Expired,2300073
Fully Funded,26839026


### Exploratory Data Analysis

In [22]:
#total number of rows in joined df
df_renamed.count()

In [23]:
#total number of donors
df_renamed.select("donor_id").distinct().count()

In [24]:
#total number of donations
display(df_renamed.select("donation_id").distinct().count())

In [25]:
#create grouped dataframes by project id and project resources
#group data by project id - each row for the same project id
donations_grouped = df_renamed.groupBy('project_id').agg({'donation_id':'count','donation_amount':'sum', 'donation_amount':'avg'})
donations_grouped = donations_grouped.withColumnRenamed('project_id','project_id_2')

resources_grouped = df_renamed.groupBy('project_id').agg({'resource_item_name':'count','resource_unit_price':'sum','resource_quantity':'sum'})
resources_grouped = resources_grouped.withColumnRenamed('project_id','project_id_2')

In [26]:
display(donations_grouped)

project_id_2,avg(donation_amount),count(donation_id)
0000c0bdc0f15bd239cfffa884791a10,28.045333353678384,105
0000d299ce46c8375f29f7bb792b9eae,26.666666666666668,3
0000fe73a95dae43c4bd72a142760efb,21.554999828338623,8
000177bef7ed7b7d1d0f5741d0b5fab8,135.15499877929688,4
0001a55b63eb85dfa06dac45fd0883b8,50.0,7
0002555bbe359440d6ceb34b699d3932,52.77777777777778,9
0002aa294cef2e329afa06d1198d0c6c,89.24666659037273,114
000475c3717556a33ecb54772dac9db5,26.072777781221603,36
0004c4500d7e39380bfa9ba4905bc4f8,67.54999923706055,4
000562cc943f4bb8aa6db6cd9baeefcf,26.77599983215332,25


In [27]:
display(resources_grouped)

project_id_2,count(resource_item_name),sum(resource_unit_price),sum(resource_quantity)
0000c0bdc0f15bd239cfffa884791a10,105,4244.099979400635,120.0
0000d299ce46c8375f29f7bb792b9eae,3,321.17999267578125,12.0
0000fe73a95dae43c4bd72a142760efb,8,187.12000274658203,16.0
000177bef7ed7b7d1d0f5741d0b5fab8,4,106.07999992370604,16.0
0001a55b63eb85dfa06dac45fd0883b8,7,588.7700004577637,8.0
0002555bbe359440d6ceb34b699d3932,9,1531.7100219726562,27.0
0002aa294cef2e329afa06d1198d0c6c,114,2143.439992904663,138.0
000475c3717556a33ecb54772dac9db5,36,3456.3599395751958,72.0
0004c4500d7e39380bfa9ba4905bc4f8,4,1554.199951171875,4.0
000562cc943f4bb8aa6db6cd9baeefcf,25,622.1500015258789,25.0


##### Geographic Breakdown of Donations and Donors

In [29]:
#top donation TOTALS based on city
df_city2 = df_renamed.groupBy("donor_city").sum("donation_amount")
df_city2 = df_city2.orderBy(desc('sum(donation_amount)'))
display(df_city2.take(10))


donor_city,sum(donation_amount)
,129806027.48622832
New York,59742369.68825033
Chicago,37260709.77268964
San Francisco,31696289.398964703
Brooklyn,27273641.69914639
Seattle,21894121.294950485
Los Angeles,21379783.42353976
Houston,18206061.312549997
Portland,14910649.040826648
Washington,14059552.318152357


In [30]:
#top donation TOTALS based on state
df_state = df_renamed.groupBy("donor_state").sum("donation_amount")
df_state = df_state.orderBy(desc('sum(donation_amount)'))
display(df_state.take(10))

donor_state,sum(donation_amount)
California,310853953.0099331
New York,172916225.40442964
Texas,135862307.19034004
Illinois,108132780.2013887
Florida,91075259.97098576
Massachusetts,73713757.79797494
North Carolina,68879204.01516818
Washington,58716593.65198769
Pennsylvania,55309167.01073057
Georgia,51871181.25917381


In [31]:
#total unique donations by city
city_unique = df_renamed.groupBy("donor_city").count()
city_unique= city_unique.orderBy(desc('count'))
display(city_unique.take(10))

donor_city,count
,2126438
Chicago,629522
New York,569645
Brooklyn,464841
Los Angeles,328291
San Francisco,319527
Seattle,252905
Houston,235663
Portland,226163
Indianapolis,196375


In [32]:
#total unique donations by state
state_unique = df_renamed.groupBy("donor_state").count()
state_unique = state_unique.orderBy(desc('count'))
display(state_unique.take(10))

donor_state,count
California,4370749
New York,2389305
Texas,1768349
Illinois,1561602
Florida,1338188
North Carolina,1119617
other,1116735
Pennsylvania,925952
Massachusetts,861110
Georgia,849226


In [33]:
#top donation amounts
donation_top = df_renamed.groupBy("donation_amount").count()
donation_top = df_state.orderBy(desc('count'))
display(donation_top.take(10))

donation_amount,count
25.0,5840673
50.0,5231285
10.0,2765390
100.0,2123106
1.0,1995368
20.0,1840746
5.0,965861
15.0,544775
2.0,496496
30.0,477843


In [34]:
#free lunch distribution by state

In [35]:
#free lunch distribution by city

##### Time Series Analysis

In [37]:
#number of donations year over year
donations_year = df_renamed.groupBy('donation_year').agg({'donation_id':'count'})
donations_year = donations_year.orderBy(desc('donation_year'))
display(donations_year)

donation_year,count(donation_id)
2018,2319961
2017,7915683
2016,5928106
2015,4871935
2014,4703653
2013,3399577
2012,184


In [38]:
#number of projects year over year
project_year = df_renamed.groupBy('post_date_year').agg({'project_id':'count'})
project_year = project_year.orderBy(desc('post_date_year'))
display(project_year)

post_date_year,count(project_id)
2018,1954446
2017,7888532
2016,6101937
2015,4800416
2014,4752562
2013,3641206


In [39]:
#project distribution by month

In [40]:
#donation distribution by year


In [41]:
#donation distribution by month


In [42]:
#donor distribution by year


In [43]:
#donor distribution by month

In [44]:
#project posting day distribution

In [45]:
#teacher project posted date distribution by month

In [46]:
#project status by year
display(df_renamed)

In [47]:
#project cost by year

##### Donors Exploration

In [49]:
#donor is teacher
teacher_donors = df_renamed.groupby('donor_id',"donor_is_teacher").count().groupby('donor_is_teacher').count()
display(teacher_donors)

donor_is_teacher,count
No,704564
Yes,84051


In [50]:
#distribution of donors who make optional donation

##### School & Project Exploration

In [52]:
#average project costs by school metro type
metro_cost = df_renamed.groupBy('school_metro_type').agg({'project_cost':'avg'})
display(metro_cost)

school_metro_type,avg(project_cost)
rural,787.3810939517026
urban,913.18033603447
unknown,810.4613475833775
suburban,860.6705810843996
town,716.0162003595541


In [53]:
#count of school metro types
metro_type = df_renamed.groupby('school_metro_type', 'project_curr_stat').count().sort('count',ascending =False)
display(metro_type)

school_metro_type,project_curr_stat,count
urban,Fully Funded,13587893
suburban,Fully Funded,7537465
rural,Fully Funded,2437575
unknown,Fully Funded,2253198
town,Fully Funded,1022895
urban,Expired,1004286
suburban,Expired,754791
rural,Expired,220090
unknown,Expired,213627
town,Expired,107279


In [54]:
#average project funding time by the school type and project status
metro_type = df_renamed.groupby('school_metro_type', 'project_curr_stat').agg({'project_days':'avg'})
display(metro_type)

school_metro_type,project_curr_stat,avg(project_days)
town,Expired,117.05480103282096
town,Fully Funded,117.17297963134044
rural,Expired,116.83524467263392
unknown,Expired,116.45434373507852
rural,Fully Funded,117.16742172035732
unknown,Fully Funded,116.64047367341884
urban,Fully Funded,116.8153088929976
suburban,Expired,116.80181401209076
suburban,Fully Funded,116.9149938341339
urban,Expired,116.58250438620074


In [55]:
#average project costs by type of project
project_cost = df_renamed.groupBy('project_type', 'project_curr_stat').agg({'project_cost':'avg'})
display(project_cost)

project_type,project_curr_stat,avg(project_cost)
Professional Development,Fully Funded,940.862728770463
Teacher-Led,Expired,1307.6008383941485
Teacher-Led,Fully Funded,827.9389551736145
Student-Led,Fully Funded,1376.7188499365448
Student-Led,Expired,3025.18440395818
Professional Development,Expired,993.061207739227


In [56]:
#top resource vendors
resources = df_renamed.groupby('resource_vendor_name').count().sort('count',ascending =False)
display(resources.take(10))

resource_vendor_name,count
Amazon Business,3098667
AKJ Education,1697391
Lakeshore Learning Materials,1517968
Best Buy Education,432334
School Specialty,356295
Quill.com,286474
Kaplan Early Learning Company,267710
Blick Art Materials,245850
Nasco,167169
Woodwind and Brasswind,162057


In [57]:
#total schools per city
schools_city = df_renamed.groupBy("school_city").count()
schools_city= schools_city.orderBy(desc('count'))
display(schools_city.take(10))

school_city,count
New York City,1878097
Chicago,958242
Los Angeles,519275
Philadelphia,506572
Houston,393487
Indianapolis,348121
Oakland,286415
San Francisco,268914
DC,245239
Seattle,237960


In [58]:
#total schools per state
schools_state = df_renamed.groupBy("school_state").count()
schools_state= schools_state.orderBy(desc('count'))
display(schools_state.take(10))

school_state,count
California,4269921
New York,2325170
Texas,1973932
Illinois,1627676
Florida,1459165
North Carolina,1270767
Michigan,904033
Pennsylvania,894587
Georgia,874677
South Carolina,777746


In [59]:
display(df_renamed)

In [60]:
#school state top avg donations


In [61]:
#school state top total donations


In [62]:
#school state top donation counts

In [63]:
#school city top avg donations

In [64]:
#school city top avg donations

In [65]:
#school city top avg donations

In [66]:
#top donation totals by school

In [67]:
#top avg donation by school

In [68]:
#top unique donations by school

In [69]:
#school states by project status

In [70]:
#school cities by project status

In [71]:
#resource quantity by year

In [72]:
#resource avg cost by year