In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')

In [3]:
regions = {
    "New England": ["CT", "ME", "MA", "NH", "RI", "VT"],
    "Mideast": ["DE", "DC", "MD", "NJ", "NY", "PA"],
    "Great Lakes": ["IL", "IN", "MI", "OH", "WI"],
    "Plains": ["IA", "KS", "MN", "MO", "MS", "NE", "ND", "SD"],
    "Southeast": ["AL", "AR", "FL", "GA", "KY", "LA", "MS", "NC", "SC", "VA", "TN", "WV"],
    "Southwest": ["AZ", "NM", "OK", "TX"],
    "Rocky Mountain": ["CO", "ID", "MT", "UT", "WY"],
    "Far West": ["AK", "CA", "HI", "NV", "OR", "WA"]
}


states_by_region = [{state: region  for state in states}for (region, states) in regions.iteritems()]

states = {}
for d in states_by_region:
    states.update(d)

In [4]:
df_submission = pd.read_csv("data/sampleSubmission.csv", index_col=0)
df_outcomes_all = pd.read_csv("data/outcomes.csv", index_col=0)
df_results = df_outcomes_all[['is_exciting']]

In [5]:
df_projects_all = pd.read_csv("data/projects.csv", index_col=0)

In [7]:
def filter_by_date(projects, year):
    is_new = projects.date_posted.apply(lambda date: int(date.split('-')[0]) >= year)
    return projects.loc[is_new,:]

df_projects = filter_by_date(df_projects_all, 2013)

In [8]:
df_projects_train = pd.merge(df_results, df_projects, right_index=True, left_index=True, how="inner")
df_projects_submission = df_projects_all[df_projects_all.index.isin(df_submission.index)]

In [10]:
columns = [u'school_state', u'school_metro', u'school_district',
       u'school_county', u'school_charter', u'school_magnet',
       u'school_year_round', u'school_nlns', u'school_kipp',
       u'school_charter_ready_promise', u'teacher_prefix',
       u'teacher_teach_for_america', u'teacher_ny_teaching_fellow',
       u'primary_focus_subject', u'primary_focus_area',
       u'secondary_focus_subject', u'secondary_focus_area', u'resource_type',
       u'poverty_level', u'grade_level', u'fulfillment_labor_materials',
       u'total_price_excluding_optional_support',
       u'total_price_including_optional_support', u'students_reached',
       u'eligible_double_your_impact_match', u'eligible_almost_home_match']

train_columns = [u'is_exciting', u'school_state', u'school_metro', u'school_district',
       u'school_county', u'school_charter', u'school_magnet',
       u'school_year_round', u'school_nlns', u'school_kipp',
       u'school_charter_ready_promise', u'teacher_prefix',
       u'teacher_teach_for_america', u'teacher_ny_teaching_fellow',
       u'primary_focus_subject', u'primary_focus_area',
       u'secondary_focus_subject', u'secondary_focus_area', u'resource_type',
       u'poverty_level', u'grade_level', u'fulfillment_labor_materials',
       u'total_price_excluding_optional_support',
       u'total_price_including_optional_support', u'students_reached',
       u'eligible_double_your_impact_match', u'eligible_almost_home_match']

In [11]:
df_projects_train = df_projects_train[train_columns]
df_projects_submission = df_projects_submission[columns]

In [12]:
def get_school_district_for_state(group):
    most_commot_district = group["school_district"].value_counts().index[0]
    return pd.Series([most_commot_district], 
                         index=['most_commot_district'])

def get_missing_district(project):
    return project['school_district'] if not pd.isnull(project['school_district']) else districts.ix[project['school_state'], project['school_county']].values[0]

districts = df_projects_train.groupby(['school_state', 'school_county']).apply(get_school_district_for_state)

def fix_projects_missing_data(projects):
    projects.students_reached.fillna(projects.students_reached.median(), inplace=True)
    projects.primary_focus_subject.fillna(projects.primary_focus_subject.value_counts().index[0], inplace=True)
    projects.primary_focus_area.fillna(projects.primary_focus_area.value_counts().index[0], inplace=True)
    projects.resource_type.fillna(projects.resource_type.value_counts().index[0], inplace=True)
    projects.grade_level.fillna(projects.grade_level.value_counts().index[0], inplace=True)
    projects.school_district = projects.apply(get_missing_district, axis=1).values
    return projects

In [13]:
df_projects_train = fix_projects_missing_data(df_projects_train)
df_projects_submission = fix_projects_missing_data(df_projects_submission)

In [225]:
df_projects_train.to_csv("train-data/projects.csv")
df_projects_submission.to_csv("train-data/submissions.csv")

In [47]:
df_essays = pd.read_csv("data/essays.csv", index_col=0)
df_essays = df_essays[df_essays.index.isin(df_projects.index)]
df_essays.fillna('', inplace=True)
df_essays['essay_len'] = df_essays.essay.apply(len)
df_essays['title_len'] = df_essays.title.apply(len)
df_essays['need_statement_len'] = df_essays.need_statement.apply(len)
df_essays['short_description_len'] = df_essays.short_description.apply(len)

df_essays_len = df_essays[['essay_len', 'title_len', 'need_statement_len', 'short_description_len']]
df_essays_len.to_csv("train-data/essays.csv")

In [48]:
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
en_stop = stopwords.words('english')

In [56]:
punctuation_regex = u'[^\u0400-\u0500a-zA-Z\d ]'

def remove_punctuation(series):
        return series.str.replace(punctuation_regex, '')
    
documents = df_essays.essay
documents = documents.fillna("")
documents = documents.apply(lambda text: text.replace("\r\\n\r\\n", ""))
documents = remove_punctuation(documents)
documents = documents.apply(str.lower)
tokenized = documents.apply(lambda text: nltk.word_tokenize(text.decode('utf-8')))

In [57]:
tokenized = tokenized.apply(lambda tokens: [porter_stemmer.stem(token) for token in tokens if token not in en_stop])

In [58]:
dictionary = corpora.Dictionary(tokenized.values)
corpus = [dictionary.doc2bow(text) for text in tokenized.values]
corpora.MmCorpus.serialize('corpus.mm', corpus)
corpus = corpora.MmCorpus('corpus.mm')

In [66]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=75)
corpus_lsi = lsi[corpus_tfidf]

In [92]:
get_main_topic = lambda document: sorted(document,key=lambda (index, relation): relation, reverse=True)[:2]
main_topics = [get_main_topic(document) for document in corpus_lsi]

In [131]:
main_topic = pd.Series([topic[0][0] if topic else None for topic in main_topics])
secondary_topic = pd.Series([topic[1][0] if topic else None for topic in main_topics])
topics_explained = pd.Series([topic[0][1] + topic[1][1] if topic else None for topic in main_topics])

Unnamed: 0_level_0,expences_per_student,school_region,main_topic,secondary_topic,topics_explained
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
316ed8fb3b81402ff6ac8f721bb31192,17.369062,Far West,0,15,0.388444
90de744e368a7e4883223ca49318ae30,13.475909,Southwest,0,8,0.332286
32943bb1063267de6ed19fc0ceb4b9a7,25.346471,Far West,0,1,0.325069
bb18f409abda2f264d5acda8cab577a9,48.005833,Mideast,0,1,0.389604
24761b686e18e5eace634607acbcc19f,17.016667,Far West,0,1,0.466246


In [153]:
main_topic_prob = pd.Series([topic[0][1] if topic else None for topic in main_topics])
secondary_topic_prob = pd.Series([topic[1][1] if topic else None for topic in main_topics])

In [14]:
df_projects = fix_projects_missing_data(df_projects)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [17]:
df_additional_data = df_projects[['school_state', 'total_price_excluding_optional_support', 'students_reached']]

df_additional_data['expences_per_student'] =  df_additional_data.apply(lambda project: 1.0 * project['total_price_excluding_optional_support'] / project['students_reached'], axis=1)
df_additional_data['school_region'] = df_additional_data.school_state.apply(lambda state: states[state])
df_additional_data = df_additional_data [['expences_per_student', 'school_region']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
df_additional_data['main_topic'] = main_topic.values
df_additional_data['secondary_topic'] = secondary_topic.values
# df_additional_data['main_topic_prob'] = main_topic_prob.values
# df_additional_data['secondary_topic_prob'] = secondary_topic_prob.values
df_additional_data['topics_explained'] = topics_explained.values

df_additional_data.main_topic.fillna(df_additional_data.main_topic.value_counts().index[0], inplace=True)
df_additional_data.secondary_topic.fillna(df_additional_data[df_additional_data.main_topic == 0].secondary_topic.value_counts().index[0], inplace=True)
df_additional_data.topics_explained.fillna(0, inplace=True)
# df_additional_data.main_topic_prob.fillna(0, inplace=True)
# df_additional_data.secondary_topic_prob.fillna(0, inplace=True)

df_additional_data.to_csv("train-data/projects-additional.csv")

df_additional_data.head()

In [19]:
df_donations_all = pd.read_csv("data/donations.csv", index_col=0)

In [41]:
df_donations = df_donations_all[df_donations_all.projectid.isin(df_projects.index)]
df_donations = pd.merge(df_donations, df_projects[['teacher_acctid', 'schoolid', 'grade_level', 'primary_focus_subject']], left_on="projectid", right_index=True, how="inner")
df_donations = pd.merge(df_donations, df_outcomes_all[["is_exciting"]], left_on="projectid", right_index=True, how="inner")

In [52]:
df_donations.head()

Unnamed: 0_level_0,projectid,donor_acctid,donor_city,donor_state,donor_zip,is_teacher_acct,donation_timestamp,donation_to_project,donation_optional_support,donation_total,...,payment_included_web_purchased_gift_card,payment_was_promo_matched,via_giving_page,for_honoree,donation_message,teacher_acctid,schoolid,grade_level,primary_focus_subject,is_exciting
donationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44842bc73032bcc4b44f77dd9007a6eb,ffff2d9c769c8fb5335e949c615425eb,ca2858ac1c427f52e39ca066db0b50a1,,ID,83201.0,f,2013-05-09 17:08:40.145,58.65,10.35,69,...,f,t,t,f,Education is the most important thing in the w...,485784e52a4c85ea6783285d09d8aea4,f8e49225a821cc04e1176303e55d89ec,Grades 3-5,Mathematics,t
45b90c3d8cbf06a4c88410069c7fa9c7,ffff2d9c769c8fb5335e949c615425eb,38fcf7f544893c94a77ec58d4729ffb8,,NY,,f,2013-03-06 22:48:04.013,85.0,15.0,100,...,f,f,t,f,Matching donation by the DonorsChoose.org Boar...,485784e52a4c85ea6783285d09d8aea4,f8e49225a821cc04e1176303e55d89ec,Grades 3-5,Mathematics,t
9047579ce6a5c45abd801aa9e4c16bce,ffff2d9c769c8fb5335e949c615425eb,2f471330531031f76a0dd43c03096be5,,CO,80003.0,f,2013-03-06 22:47:57.516,85.0,15.0,100,...,f,t,t,f,I gave because we should all pay it forward.,485784e52a4c85ea6783285d09d8aea4,f8e49225a821cc04e1176303e55d89ec,Grades 3-5,Mathematics,t
9e0f234dfa2982619dacd0635019d53d,ffff2d9c769c8fb5335e949c615425eb,af1f03b856fc2979c01a6ad8b61f6fbe,Pocatello,ID,83201.0,f,2013-05-08 12:58:55.38,68.0,12.0,80,...,f,f,f,f,Idaho Power Company believes in education and ...,485784e52a4c85ea6783285d09d8aea4,f8e49225a821cc04e1176303e55d89ec,Grades 3-5,Mathematics,t
fdcd840e3dd57016590cb8a69aed0c09,ffff2d9c769c8fb5335e949c615425eb,af1f03b856fc2979c01a6ad8b61f6fbe,Pocatello,ID,83201.0,f,2013-05-08 13:18:21.233,12.75,2.25,15,...,f,f,f,f,We beleive in Education and felt this was a go...,485784e52a4c85ea6783285d09d8aea4,f8e49225a821cc04e1176303e55d89ec,Grades 3-5,Mathematics,t


In [53]:
def get_donations(group, prefix):
    total_donated = group['donation_total'].sum()
    donation_to_project = group['donation_to_project'].sum()
    donation_optional_support = group['donation_optional_support'].sum()
    donated_promo_matched_total = group[group['payment_was_promo_matched'] == "t"]['donation_total'].sum() 
    donated_for_honoree = group[group['for_honoree'] == "t"]['donation_total'].sum() 
    donated_for_exciting = group[group['is_exciting'] == "t"]['donation_total'].sum() 
    
    columns = ['total_donated', 'donation_to_project', 'donation_optional_support', 
               'donated_promo_matched_total', 'donated_for_honoree', 'donated_for_exciting_project']
    
    columns = [prefix + '_' + column for column in columns]
    
    return pd.Series([total_donated, donation_to_project, donation_optional_support,
                     donated_promo_matched_total, donated_for_honoree, donated_for_exciting], 
                     index=[columns])

In [54]:
get_donations_to_teacher = lambda(group): get_donations(group, 'teacher')
df_teacher_donations = df_donations.groupby('teacher_acctid').apply(get_donations_to_teacher)

In [55]:
get_donations_to_school = lambda(group): get_donations(group, 'school')
df_school_donations = df_donations.groupby('schoolid').apply(get_donations_to_school)

In [58]:
get_donations_for_school_grades = lambda(group): get_donations(group, 'school_grades')
df_donations_for_school_grades = df_donations.groupby(['schoolid', 'grade_level']).apply(get_donations_for_school_grades)

In [59]:
get_donations_for_school_primary_subject = lambda(group): get_donations(group, 'primary_subject')
df_donations_for_school_subject = df_donations.groupby(['schoolid', 'primary_focus_subject']).apply(get_donations_for_school_primary_subject)

In [60]:
df_teacher_donations.to_csv("train-data/donations_to_teacher.csv")
df_school_donations.to_csv("train-data/donations_to_school.csv")
df_donations_for_school_grades.to_csv("train-data/donations_for_school_grades.csv")
df_donations_for_school_subject.to_csv("train-data/donations_for_school_subject.csv")

In [46]:
x`.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_subjecttotal_donated,primary_subjectdonation_to_project,primary_subjectdonation_optional_support,primary_subjectdonated_promo_matched_total,primary_subjectdonated_for_honoree
schoolid,primary_focus_subject,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000efec25428001c9dd83f651c5d181,Foreign Languages,718.41,645.45,72.96,167,0
0000efec25428001c9dd83f651c5d181,Visual Arts,20.0,18.5,1.5,10,0
00014d8717dd762910c815aceb2e5521,Literature & Writing,344.54,292.86,51.68,0,0
00014d8717dd762910c815aceb2e5521,Special Needs,999.63,852.51,147.12,295,0
00079011fea3d16aa2d3a4025aae6db6,Environmental Science,2005.59,1704.75,300.84,1006,0


In [64]:
# df_additional_data = pd.read_csv("train-data/projects-additional.csv",index_col=0)
df_additional_data.head()

Unnamed: 0_level_0,expences_per_student,school_region,main_topic,secondary_topic,topics_explained
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
316ed8fb3b81402ff6ac8f721bb31192,17.369062,Far West,0,15,0.388444
90de744e368a7e4883223ca49318ae30,13.475909,Southwest,0,8,0.332286
32943bb1063267de6ed19fc0ceb4b9a7,25.346471,Far West,0,1,0.325069
bb18f409abda2f264d5acda8cab577a9,48.005833,Mideast,0,1,0.389604
24761b686e18e5eace634607acbcc19f,17.016667,Far West,0,1,0.466246


In [68]:
# df_donations_for_school_grades.to_csv("train-data/donations_for_school_grades.csv")
df_donations_for_school_grades.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_grades_total_donated,school_grades_donation_to_project,school_grades_donation_optional_support,school_grades_donated_promo_matched_total,school_grades_donated_for_honoree,school_grades_donated_for_exciting_project
schoolid,grade_level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000efec25428001c9dd83f651c5d181,Grades 9-12,738.41,663.95,74.46,177,0,0
00014d8717dd762910c815aceb2e5521,Grades 3-5,999.63,852.51,147.12,295,0,0
00014d8717dd762910c815aceb2e5521,Grades PreK-2,344.54,292.86,51.68,0,0,0
00079011fea3d16aa2d3a4025aae6db6,Grades 3-5,45.0,38.25,6.75,0,0,0
00079011fea3d16aa2d3a4025aae6db6,Grades 6-8,3876.34,3294.88,581.46,1006,0,0
