In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

standart_scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
min_max_scaler = preprocessing.MinMaxScaler()


regions = {
    "New England": ["CT", "ME", "MA", "NH", "RI", "VT"],
    "Mideast": ["DE", "DC", "MD", "NJ", "NY", "PA"],
    "Great Lakes": ["IL", "IN", "MI", "OH", "WI"],
    "Plains": ["IA", "KS", "MN", "MO", "MS", "NE", "ND", "SD"],
    "Southeast": ["AL", "AR", "FL", "GA", "KY", "LA", "MS", "NC", "SC", "VA", "TN", "WV"],
    "Southwest": ["AZ", "NM", "OK", "TX"],
    "Rocky Mountain": ["CO", "ID", "MT", "UT", "WY"],
    "Far West": ["AK", "CA", "HI", "NV", "OR", "WA"]
}


states_by_region = [{state: region  for state in states}for (region, states) in regions.iteritems()]

states = {}
for d in states_by_region:
    states.update(d)

In [3]:
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')

In [4]:
df_submission = pd.read_csv("data/sampleSubmission.csv", index_col=0)
df_outcomes = pd.read_csv("data/outcomes.csv", index_col=0)

# df_outcomes = df_outcomes_all[pd.notnull(df_outcomes_all.at_least_1_green_donation) 
#                                  & pd.notnull(df_outcomes_all.three_or_more_non_teacher_referred_donors)]

df_projects = pd.read_csv("data/projects.csv", index_col=0)
df_test_projects = df_projects[df_projects.index.isin(df_submission.index)]

In [5]:
df_projects_train = pd.merge(df_projects, df_outcomes, left_index=True,right_index=True,how="inner")
df_projects_train.shape

(619326, 45)

In [6]:
df_projects_train['is_new'] = df_projects_train.date_posted.apply(lambda date: int(date.split('-')[0]) > 2012)
df_projects_train = df_projects_train[df_projects_train.is_new].drop("is_new", axis=1)

In [7]:
def fix_missing_data_for_projects(df):
    
    def remove_students_reached_outliers(students):
        if(students < 20000 or pd.isnull(students)):
            return students

        #magic, found better value
        return 7500

    #remove outliers for students reached field
    df["new_students_reached"] = df["students_reached"].apply(remove_students_reached_outliers)
    df.new_students_reached.fillna(df.new_students_reached.median(), inplace=True) 
    df["students_reached"] = df["new_students_reached"]
    df.drop('new_students_reached', axis=1, inplace=True)

    #fill primary focus NAs
    df.primary_focus_subject.fillna(df.primary_focus_subject.value_counts().index[0], inplace=True)
    df.primary_focus_area.fillna(df.primary_focus_area.value_counts().index[0], inplace=True)

    df.resource_type.fillna(df.resource_type.value_counts().index[0], inplace=True)

    #fill fulfillment_labor_materials
    df.fulfillment_labor_materials.fillna(df.fulfillment_labor_materials.median(), inplace=True)

    #fill grade level NA's
    df.grade_level.fillna(df.grade_level.value_counts().index[0], inplace=True)

    df.teacher_prefix.fillna(df.teacher_prefix.value_counts().index[0], inplace=True)

    #optional support
    df["optional_support"] = df["total_price_including_optional_support"] - df["total_price_excluding_optional_support"]

    #school_region
    df["school_region"] =  df["school_state"].apply(lambda state: states[state.upper()])
    
    return df
    
df_projects_train = fix_missing_data_for_projects(df_projects_train)
df_test_projects = fix_missing_data_for_projects(df_test_projects)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

In [8]:
def get_teachers_data(group):
    projects_count = len(group)
    investments_involved = group
    exciting_projects = len(group[group["is_exciting"] == "t"])
    exciting_projects_rate = 1.0 * exciting_projects / projects_count
    fully_funded_count = len(group[group["fully_funded"] == "t"])
    fully_funded_rate = 1.0 * fully_funded_count / projects_count
    green_donation_count = len(group[group["at_least_1_green_donation"] == "t"])
    green_donation_rate = 1.0 * green_donation_count / projects_count
    great_chat_count = len(group[group["great_chat"] == "t"])
    great_chat_rate = 1.0 * great_chat_count / projects_count
    teacher_referred_count_mean = group["teacher_referred_count"].median()
    donation_from_thoughtful_donor_count = len(group["donation_from_thoughtful_donor"] == "t")
    donation_from_thoughtful_donor_rate = 1.0 * donation_from_thoughtful_donor_count / projects_count
    non_teacher_referred_count_mean = group["non_teacher_referred_count"].mean()
    
    return pd.Series([projects_count, exciting_projects, exciting_projects_rate, fully_funded_count, 
                      fully_funded_rate, green_donation_count, green_donation_rate, great_chat_count,
                     great_chat_rate, teacher_referred_count_mean, donation_from_thoughtful_donor_count,
                     donation_from_thoughtful_donor_rate, non_teacher_referred_count_mean], 
                     index=['teacher_projects_count', 'teacher_exciting_projects', 'teacher_exciting_projects_rate', 
                            'teacher_fully_funded_count', 'teacher_fully_funded_rate', 
                            'teacher_green_donation_count', 'teacher_green_donation_rate', 
                            'teacher_great_chat_count', 'teacher_great_chat_rate', 
                            'teacher_referred_count_mean', 'teacher_donation_from_thoughtful_donor_count',
                            'teacher_donation_from_thoughtful_donor_rate', 'teacher_non_teacher_referred_count_mean'])

teachers = df_projects_train.head().groupby('teacher_acctid').apply(get_teachers_data)

In [11]:
df_projects_train.

(131329, 47)

In [25]:
def get_schools_data(group):
    projects_count = len(group)
    exciting_projects = len(group[group["is_exciting"] == "t"])
    exciting_projects_rate = 1.0 * exciting_projects / projects_count
    fully_funded_count = len(group[group["fully_funded"] == "t"])
    fully_funded_rate = 1.0 * fully_funded_count / projects_count
    green_donation_count = len(group[group["at_least_1_green_donation"] == "t"])
    green_donation_rate = 1.0 * green_donation_count / projects_count
    great_chat_count = len(group[group["great_chat"] == "t"])
    great_chat_rate = 1.0 * great_chat_count / projects_count
    teacher_referred_count_mean = group["teacher_referred_count"].mean()
    donation_from_thoughtful_donor_count = len(group["donation_from_thoughtful_donor"] == "t")
    donation_from_thoughtful_donor_rate = 1.0 * donation_from_thoughtful_donor_count / projects_count
    non_teacher_referred_count_mean = group["non_teacher_referred_count"].mean()
    
    return pd.Series([projects_count, exciting_projects, exciting_projects_rate, fully_funded_count, 
                      fully_funded_rate, green_donation_count, green_donation_rate, great_chat_count,
                     great_chat_rate, teacher_referred_count_mean, donation_from_thoughtful_donor_count,
                     donation_from_thoughtful_donor_rate, non_teacher_referred_count_mean], 
                     index=['school_projects_count', 'school_exciting_projects', 'school_exciting_projects_rate', 'school_fully_funded_count', 
                            'school_fully_funded_rate', 'school_green_donation_count', 'school_green_donation_rate', 'school_great_chat_count'
                            ,'school_great_chat_rate', 'school_teacher_referred_count_mean', 'school_donation_from_thoughtful_donor_count',
                           'school_donation_from_thoughtful_donor_rate', 'school_non_teacher_referred_count_mean'])

schools = df_projects_train.groupby('schoolid').apply(get_schools_data)

In [26]:
teachers.to_csv("train-data/teachers.csv")
schools.to_csv("train-data/schools.csv")

In [27]:
teachers.isnull().sum()

teacher_projects_count                             0
teacher_exciting_projects                          0
teacher_exciting_projects_rate                     0
teacher_fully_funded_count                         0
teacher_fully_funded_rate                          0
teacher_green_donation_count                       0
teacher_green_donation_rate                        0
teacher_great_chat_count                           0
teacher_great_chat_rate                            0
teacher_referred_count_mean                     8146
teacher_donation_from_thoughtful_donor_count       0
teacher_donation_from_thoughtful_donor_rate        0
teacher_non_teacher_referred_count_mean         8146
dtype: int64