In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
donors_choose = pd.read_csv('train_clean3.csv')

In [3]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,Applied Learning,...,quantity_std,price_count,price_sum,price_min,price_max,price_mean,price_std,mean_price,hour,minute
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,26,1,0.0,...,0.0,2,299.98,149.99,149.99,149.99,0.0,49.996667,14,45
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,1,0,0.0,...,,1,20.0,20.0,20.0,20.0,,1.0,15,57
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,5,1,0.0,...,,1,469.99,469.99,469.99,469.99,,469.99,22,57
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,16,0,0.0,...,0.0,5,684.47,18.95,354.99,136.894,133.428098,136.894,15,42
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,42,1,0.0,...,,1,355.5,355.5,355.5,355.5,,177.75,9,9


In [4]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [5]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')
donors_choose['month'] = donors_choose['month'].astype('category')
donors_choose['dow'] = donors_choose['dow'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category','month', 'dow'],drop_first=True)

In [6]:
count_vec = TfidfVectorizer(max_df=0.99, min_df=0.01, norm='l2')
count_vec_df = count_vec.fit_transform(donors_choose['lemm_text'])
count_vec_df = pd.DataFrame(count_vec_df.todense(), index=donors_choose.index, columns=count_vec.get_feature_names())

In [7]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count',
 'essay1_count_nostop',
 'essay2_count_nostop',


In [8]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [9]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [10]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std'], 1)
y = donors_choose['project_is_approved']

In [11]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'hour',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'scho

In [12]:
pre_x = x.merge(count_vec_df, how='left', left_index=True, right_index=True)
full_x = pre_x.merge(scale_df, how='left', left_index=True, right_index=True)

full_x = full_x.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(full_x, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((130422, 1832), (32606, 1832), (130422,), (32606,))

In [14]:
compute_class_weight('balanced', np.unique(y_train), y_train)

array([3.24545862, 0.59105947])

In [15]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [16]:
loss = ['deviance', 'exponential']
learning_rate = np.linspace(0.0001, 5, 50)
param_grid = {'loss':loss, 'learning_rate':learning_rate}

In [17]:
gbm = GradientBoostingClassifier()

gbm_gs = RandomizedSearchCV(gbm, param_grid, cv=5, scoring='roc_auc', n_iter=10, n_jobs=2)

gbm_gs.fit(x_train, y_train, sample_weight=sample_weight)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'loss': ['deviance', 'exponential'], 'learning_rate': array([1.00000e-04, 1.02139e-01, 2.04178e-01, 3.06216e-01, 4.08255e-01,
       5.10294e-01, 6.12333e-01, 7.14371e-01, 8.16410e-01, 9.18449e-01,
       1.02049e+00, 1.12253e+00, 1.22457e+00, 1.32660e+00, 1.42864e+00,
       1.... 4.38777e+00, 4.48981e+00,
       4.59184e+00, 4.69388e+00, 4.79592e+00, 4.89796e+00, 5.000

In [18]:
pred = gbm_gs.predict(x_test)

In [19]:
gbm_gs.score(x_test, y_test)

0.7504896073185181

In [20]:
print(classification_report(y_test, pred,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.29      0.66      0.41      4909
   approved       0.92      0.72      0.81     27697

avg / total       0.83      0.71      0.75     32606



In [21]:
confusion_matrix(y_test, pred)

array([[ 3238,  1671],
       [ 7842, 19855]])

In [22]:
roc_auc_score(y_test, pred)

0.6882347249382381

In [33]:
test = pd.read_csv('test_clean3.csv')

In [34]:
test = test.rename({'proj_res_count_nostop':'proj_resouce_count_nostop'}, axis=1)

In [35]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')
test['month'] = test['month'].astype('category')
test['dow'] = test['dow'].astype('category')

test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category','month','dow'],drop_first=True)

In [36]:
list(test.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 '

In [37]:
count_vec_df_test = count_vec.transform(test['lemm_text'])
count_vec_df_test = pd.DataFrame(count_vec_df_test.todense(), index=test.index, columns=count_vec.get_feature_names())

In [38]:
scale_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [39]:
scale_features_test = scaler.transform(scale_test.values)
scale_test = pd.DataFrame(scale_features_test, index=scale_test.index, columns=scale_test.columns)

In [41]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'proj_res_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std'], 1)

In [42]:
pre_test_x = test_x.merge(count_vec_df_test, how='left', left_index=True, right_index=True)
full_test_x = pre_test_x.merge(scale_test, how='left', left_index=True, right_index=True)

In [43]:
pred = gbm_gs.predict_proba(full_test_x)

In [44]:
pred_prob_approved = [x[1] for x in pred]

In [45]:
ids = np.array(test['id'])

In [46]:
pred_dict = {'id':ids, 'project_is_approved': pred_prob_approved}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': [0.8046354300274137,
  0.6806138703710682,
  0.5561473438105767,
  0.5379750180704083,
  0.4607775835911973,
  0.7583530415357704,
  0.6826122755256973,
  0.6846811798775093,
  0.3658922264512799,
  0.9012456653799908,
  0.5271380988520835,
  0.5626686074715991,
  0.4867681045616985,
  0.5903548032227903,
  0.8495621871016252,
  0.8518297056680818,
  0.8580865545282645,
  0.1837189605464936,
  0.6014053225890962,
  0.47261464951819415,
  0.6941358910924057,
  0.9191227455750379,
  0.7029536496043769,
  0.6168855102184627,
  0.5073505451872321,
  0.470948347938747,
  0.7465004023094203,
  0.6343481426487582,
  0.586635461395226,
  0.9001547599915756,
  0.7962024025530515,
  0.4805744107041227,
  0.5930520943291799,
  0.7995936321317928,
  0.7629378512940292,
  0.47599899052168626,
  0.7218107877414597,
  0.6218114783754731,
  0.3519810747183,
  0.707123076

In [47]:
submission = pd.DataFrame(pred_dict)

In [48]:
submission.to_csv('gbm7.csv', index=False)