In [3]:
## reference: https://colab.research.google.com/drive/1QhSnbh-WJVGZjQJF8u974msOL_vAgMeS#scrollTo=PlAGuj5kuZm9
## https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


dir='/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file='train.csv'
resource_file='resources.csv'
SEED=1000
positive_sample_size=30000

all_train_df = pd.read_csv(dir + train_file)
resource_df = pd.read_csv(dir + resource_file)
label = LabelEncoder()
low_memory=False


def sampleData():
    train_label_1_df = all_train_df.loc[all_train_df['project_is_approved']==1].sample(n=positive_sample_size,
                                                                           random_state=SEED)
    train_label_0_df = all_train_df[all_train_df['project_is_approved']==0]

    train_df = pd.concat([train_label_1_df,train_label_0_df])
    train_df = shuffle(train_df)

    train_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)

    return train_df

train_data = sampleData() 

target = 'project_is_approved'
model_dir = tempfile.mkdtemp()


train_data.info()

Using TensorFlow backend.


1.9.0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 57734 entries, 38288 to 75850
Data columns (total 16 columns):
id                                              57734 non-null object
teacher_id                                      57734 non-null object
teacher_prefix                                  57734 non-null object
school_state                                    57734 non-null object
project_submitted_datetime                      57734 non-null object
project_grade_category                          57734 non-null object
project_subject_categories                      57734 non-null object
project_subject_subcategories                   57734 non-null object
project_title                                   57734 non-null object
project_essay_1                                 57734 non-null object
project_essay_2                                 57734 non-null object
project_essay_3                                 1922 non-null object
project_essay_4                           

In [4]:
train_data.head()

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
38288,p205571,6f6e3e52ab2efa55a518b832fa996407,Mrs.,NJ,2017-03-16 11:52:08,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Teaching Through Tablets!,Our small district services students in grades...,My students need tablets accessible to them in...,,,My students need classroom access to technolog...,1,1
94265,p210760,9726d214cc6e9efb02a0f12c54ffd77d,Ms.,CA,2017-03-27 13:12:37,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Does Not Compute! Help We Need Chromebooks!,My second graders are a special group of kids ...,Our district recently adopted a new literacy s...,,,My students need a set of chromebooks so they ...,3,1
144681,p053170,b99c7d69e5c7f2b18abb967b52444591,Mrs.,NJ,2017-01-24 13:43:42,Grades 3-5,"Literacy & Language, Special Needs","Literacy, Special Needs",We Have a Burning Desire to Learn,Our class is a third grade inclusion class tha...,The students in our class are eager to learn. ...,,,My students need the technology in order to di...,0,1
171729,p108665,2837a4b68722d082dc6817cdc7d527c3,Mrs.,AR,2016-08-02 16:13:02,Grades PreK-2,Literacy & Language,"ESL, Literacy",Big Book Bananas!,My students start out their day by having brea...,The donations for this project will aide in my...,,,My students need the big book activity kit to ...,10,1
174461,p007269,0c739ff7cea113f16c9fdbadb833d555,Ms.,NY,2017-03-23 11:30:05,Grades PreK-2,Special Needs,Special Needs,Sensory Needs for Sensory Delays,My students are special needs children ages 5-...,Many people diagnosed with Autism have a diffi...,,,My students need sensory items to help with se...,21,0


In [5]:
train_data[['teacher_prefix', 'project_is_approved']].groupby('teacher_prefix', as_index=False).mean()

Unnamed: 0,teacher_prefix,project_is_approved
0,Dr.,0.375
1,Mr.,0.504878
2,Mrs.,0.534025
3,Ms.,0.509251
4,Teacher,0.427046


In [6]:
train_data[['school_state','project_is_approved']].groupby('school_state', as_index=False).mean().sort_values(
    by='project_is_approved', ascending=False)[0:10]

Unnamed: 0,school_state,project_is_approved
8,DE,0.597484
6,CT,0.582649
41,SD,0.572327
28,ND,0.565789
40,SC,0.564371
35,OH,0.560694
21,ME,0.555133
49,WV,0.554217
47,WA,0.553339
19,MA,0.549643


In [7]:
train_data[['project_grade_category','project_is_approved']].groupby(
    'project_grade_category', as_index=False).mean().sort_values(by='project_is_approved', ascending=False)


Unnamed: 0,project_grade_category,project_is_approved
0,Grades 3-5,0.533968
3,Grades PreK-2,0.517677
1,Grades 6-8,0.511401
2,Grades 9-12,0.493197


In [8]:
train_data[['project_subject_categories','project_is_approved']].groupby(
    'project_subject_categories', as_index=False).mean().sort_values(
    by='project_is_approved', ascending=False)[0:10]

Unnamed: 0,project_subject_categories,project_is_approved
50,"Warmth, Care & Hunger",0.70568
15,"Health & Sports, Warmth, Care & Hunger",0.666667
31,"Literacy & Language, Warmth, Care & Hunger",0.636364
27,"Literacy & Language, History & Civics",0.610811
18,"History & Civics, Health & Sports",0.6
19,"History & Civics, Literacy & Language",0.591264
35,"Math & Science, History & Civics",0.574586
25,"Literacy & Language, Applied Learning",0.568047
24,Literacy & Language,0.562136
28,"Literacy & Language, Math & Science",0.560049


In [9]:
train_data[['project_subject_subcategories','project_is_approved']].groupby('project_subject_subcategories', as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)[0:50]


Unnamed: 0,project_subject_subcategories,project_is_approved
222,"Financial Literacy, Health & Wellness",1.0
151,"Early Development, Foreign Languages",1.0
206,"Extracurricular, History & Geography",1.0
260,"Gym & Fitness, Warmth, Care & Hunger",1.0
198,"Environmental Science, Team Sports",1.0
267,"Health & Life Science, Music",1.0
182,"Environmental Science, Financial Literacy",1.0
174,"Economics, Literature & Writing",1.0
171,"Economics, Health & Life Science",1.0
48,"Character Education, Nutrition Education",1.0


In [10]:
## much more effective that individual 
cross_teacher_state = train_data[['teacher_prefix','school_state','project_is_approved']].groupby(
    by=['teacher_prefix','school_state'], as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)

# print(cross_teacher_state)
cross_teacher_state[0:10]

Unnamed: 0,teacher_prefix,school_state,project_is_approved
0,Dr.,CA,1.0
3,Dr.,NJ,1.0
1,Dr.,IA,1.0
159,Ms.,WY,0.777778
35,Mr.,ND,0.75
208,Teacher,WV,0.714286
23,Mr.,KS,0.714286
19,Mr.,IA,0.695652
181,Teacher,ME,0.666667
28,Mr.,ME,0.652174


In [11]:
## much more effective that individual 
train_data[['school_state','project_grade_category','project_is_approved']].groupby(
    by=['school_state','project_grade_category'], as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)[0:10]

Unnamed: 0,school_state,project_grade_category,project_is_approved
186,VT,Grades 9-12,0.75
33,DE,Grades 6-8,0.75
85,ME,Grades 6-8,0.722222
164,SD,Grades 3-5,0.672414
118,NE,Grades 9-12,0.666667
122,NH,Grades 9-12,0.652174
201,WY,Grades 6-8,0.642857
35,DE,Grades PreK-2,0.625
27,CT,Grades PreK-2,0.620482
142,OH,Grades 9-12,0.616822


In [12]:
train_data.head()

train_data[['project_subject_categories','project_subject_subcategories','project_is_approved']].groupby(
    by=['project_subject_categories','project_subject_subcategories'], as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)[0:30]

Unnamed: 0,project_subject_categories,project_subject_subcategories,project_is_approved
115,"Applied Learning, Music & The Arts","Parent Involvement, Performing Arts",1.0
179,History & Civics,"Financial Literacy, Social Sciences",1.0
190,"History & Civics, Health & Sports","Civics & Government, Health & Wellness",1.0
194,"History & Civics, Literacy & Language","Civics & Government, Foreign Languages",1.0
189,"History & Civics, Applied Learning","History & Geography, Parent Involvement",1.0
331,"Math & Science, History & Civics","Environmental Science, Financial Literacy",1.0
198,"History & Civics, Literacy & Language","Economics, Literature & Writing",1.0
47,"Applied Learning, Health & Sports","Other, Team Sports",1.0
352,"Math & Science, Music & The Arts","Health & Life Science, Music",1.0
320,"Math & Science, Health & Sports","Environmental Science, Team Sports",1.0


In [13]:
train_data['num_posted_projects_bins'] = pd.cut(
    train_data['teacher_number_of_previously_posted_projects'],
    [-1,7,50,1000])


train_data[['num_posted_projects_bins','project_is_approved']].groupby(
    'num_posted_projects_bins', as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)

Unnamed: 0,num_posted_projects_bins,project_is_approved
2,"(50, 1000]",0.722997
1,"(7, 50]",0.583002
0,"(-1, 7]",0.490573


In [14]:
train_data[['project_subject_categories','num_posted_projects_bins','project_is_approved']].groupby(
    by=['project_subject_categories','num_posted_projects_bins'], as_index=False).mean().sort_values(
    'project_is_approved', ascending=False)[0:20]

Unnamed: 0,project_subject_categories,num_posted_projects_bins,project_is_approved
29,"Health & Sports, Applied Learning","(50, 1000]",1.0
143,"Special Needs, Health & Sports","(50, 1000]",1.0
125,"Music & The Arts, Applied Learning","(50, 1000]",1.0
55,"History & Civics, Health & Sports","(7, 50]",1.0
131,"Music & The Arts, History & Civics","(50, 1000]",1.0
47,"Health & Sports, Warmth, Care & Hunger","(50, 1000]",1.0
46,"Health & Sports, Warmth, Care & Hunger","(7, 50]",1.0
80,"Literacy & Language, Health & Sports","(50, 1000]",1.0
94,"Literacy & Language, Warmth, Care & Hunger","(7, 50]",1.0
95,"Literacy & Language, Warmth, Care & Hunger","(50, 1000]",1.0


In [15]:
train_data['num_posted_projects_bins'] = pd.cut(
    train_data['teacher_number_of_previously_posted_projects'],
    [-1,7,50,1000])

label = LabelEncoder()

train_data['num_posted_projects_buckets'] = label.fit_transform(train_data['num_posted_projects_bins'])

train_data['num_posted_projects_buckets'].head(10)

38288     0
94265     0
144681    0
171729    1
174461    1
169996    0
6997      0
109763    2
82953     1
166118    0
Name: num_posted_projects_buckets, dtype: int64

In [23]:
resource_file = dir + 'resources.csv'
r_df = pd.read_csv(resource_file)
print(r_df.columns)

total_quantity_df = r_df[['id','quantity']].groupby('id', as_index=False).sum()
total_price_df = r_df[['id','price']].groupby('id', as_index=False).sum()

train_data = pd.merge(train_data, total_quantity_df, how='inner', on='id')
train_data = pd.merge(train_data, total_price_df, how='inner', on='id')

train_data['quantity'].describe()

Index(['id', 'description', 'quantity', 'price'], dtype='object')


KeyError: 'quantity'

In [22]:
train_data['quantity_bins'] = pd.cut(train_data['quantity'], [0,4,10,22,1000])

train_data[['quantity_bins','project_is_approved']].groupby('quantity_bins').mean().sort_values(
    'project_is_approved', ascending=False)

Unnamed: 0_level_0,project_is_approved
quantity_bins,Unnamed: 1_level_1
"(0, 4]",0.646926
"(4, 10]",0.512734
"(22, 1000]",0.475307
"(10, 22]",0.431606
