In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
donors_choose = pd.read_csv('train_clean3.csv')

In [3]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,Applied Learning,...,quantity_std,price_count,price_sum,price_min,price_max,price_mean,price_std,mean_price,hour,minute
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,26,1,0.0,...,0.0,2,299.98,149.99,149.99,149.99,0.0,49.996667,14,45
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,1,0,0.0,...,,1,20.0,20.0,20.0,20.0,,1.0,15,57
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,5,1,0.0,...,,1,469.99,469.99,469.99,469.99,,469.99,22,57
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,16,0,0.0,...,0.0,5,684.47,18.95,354.99,136.894,133.428098,136.894,15,42
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,42,1,0.0,...,,1,355.5,355.5,355.5,355.5,,177.75,9,9


In [4]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [5]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')
donors_choose['month'] = donors_choose['month'].astype('category')
donors_choose['dow'] = donors_choose['dow'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category','month', 'dow'],drop_first=True)

In [6]:
count_vec = TfidfVectorizer(max_df=0.97, min_df=0.05, norm='l2')
count_vec_df = count_vec.fit_transform(donors_choose['lemm_text'])
count_vec_df = pd.DataFrame(count_vec_df.todense(), index=donors_choose.index, columns=count_vec.get_feature_names())

In [7]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count',
 'essay1_count_nostop',
 'essay2_count_nostop',


In [8]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [9]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [10]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)
y = donors_choose['project_is_approved']

In [11]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [12]:
pre_x = x.merge(count_vec_df, how='left', left_index=True, right_index=True)
full_x = pre_x.merge(scale_df, how='left', left_index=True, right_index=True)

full_x_array = full_x.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(full_x_array, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((130422, 594), (32606, 594), (130422,), (32606,))

In [14]:
list(full_x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [15]:
compute_class_weight('balanced', np.unique(y_train), y_train)

array([3.24545862, 0.59105947])

In [16]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [17]:
max_depth = [2,3,4]
learning_rate = np.linspace(0.0001, 2, 50)
reg_lambda = np.linspace(0.0001, 2, 50)
param_grid = {'max_depth':max_depth, 'learning_rate':learning_rate, 'reg_lambda':reg_lambda}

In [18]:
model = xgb.XGBClassifier(n_jobs=2)

model_gs = RandomizedSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_iter=10, verbose=2)

model_gs.fit(x_train, y_train, sample_weight=sample_weight)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429 
[CV]  reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429, total= 1.2min
[CV] reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV]  reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429, total= 1.2min
[CV] reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429 
[CV]  reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429, total= 1.2min
[CV] reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429 
[CV]  reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429, total= 1.2min
[CV] reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429 
[CV]  reg_lambda=0.40824285714285713, max_depth=2, learning_rate=1.5918571428571429, total= 1.2min
[CV] reg_lambda=1.3877857142857142, max_depth=2, learning_rate=1.836742857142857 
[CV]  reg_lambda=1.3877857142857142, max_depth=2, learning_rate=1.836742857142857, total= 1.3min
[CV] reg_lambda=1.3877857142857142, max_depth=2, learning_rate=1.836742857142857 
[CV]  reg_lambda=1.3877857142857142, max_depth=2, learning_rate=1.836742857142857, total= 1

[CV]  reg_lambda=1.2245285714285714, max_depth=4, learning_rate=0.8163857142857143, total= 2.3min
[CV] reg_lambda=1.2245285714285714, max_depth=4, learning_rate=0.8163857142857143 
[CV]  reg_lambda=1.2245285714285714, max_depth=4, learning_rate=0.8163857142857143, total= 2.4min
[CV] reg_lambda=1.2245285714285714, max_depth=4, learning_rate=0.8163857142857143 
[CV]  reg_lambda=1.2245285714285714, max_depth=4, learning_rate=0.8163857142857143, total= 2.3min


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 84.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=2, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [2, 3, 4], 'learning_rate': array([1.00000e-04, 4.09143e-02, 8.17286e-02, 1.22543e-01, 1.63357e-01,
       2.04171e-01, 2.44986e-01, 2.85800e-01, 3.26614e-01, 3.67429e-01,
       4.08243e-01, 4.49057e-01, 4.89871e-01, 5.30686e-01, 5.71500e-01,
       6.12314e-01, 6.... 1.75511e+00, 1.79593e+00,
       1.83674e+00, 1.87756e+00, 1.91837e+00, 1.95919e+00, 2.00000e+00])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
      

In [19]:
pred = model_gs.predict(x_test)

  if diff:


In [20]:
model_gs.score(x_test, y_test)

0.7255478234024976

In [21]:
print(classification_report(y_test, pred,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.26      0.67      0.38      4909
   approved       0.92      0.67      0.77     27697

avg / total       0.82      0.67      0.71     32606



In [22]:
confusion_matrix(y_test, pred)

array([[ 3292,  1617],
       [ 9249, 18448]])

In [23]:
roc_auc_score(y_test, pred)

0.6683349639909508

In [24]:
test = pd.read_csv('test_clean3.csv')

In [25]:
test = test.rename({'proj_res_count_nostop':'proj_resouce_count_nostop'}, axis=1)

In [26]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')
test['month'] = test['month'].astype('category')
test['dow'] = test['dow'].astype('category')

test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category','month','dow'],drop_first=True)

In [27]:
list(test.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 '

In [28]:
count_vec_df_test = count_vec.transform(test['lemm_text'])
count_vec_df_test = pd.DataFrame(count_vec_df_test.todense(), index=test.index, columns=count_vec.get_feature_names())

In [29]:
scale_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [30]:
scale_features_test = scaler.transform(scale_test.values)
scale_test = pd.DataFrame(scale_features_test, index=scale_test.index, columns=scale_test.columns)

In [31]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'proj_res_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)

In [32]:
pre_test_x = test_x.merge(count_vec_df_test, how='left', left_index=True, right_index=True)
full_test_x = pre_test_x.merge(scale_test, how='left', left_index=True, right_index=True)

In [33]:
full_test_x = full_test_x[full_x.columns].as_matrix().astype(np.float)

In [34]:
full_test_x.shape

(78035, 594)

In [35]:
pred = model_gs.predict_proba(full_test_x)

In [36]:
pred_prob_approved = [x[1] for x in pred]

In [37]:
ids = np.array(test['id'])

In [38]:
pred_dict = {'id':ids, 'project_is_approved': pred_prob_approved}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': [0.6839797,
  0.56060505,
  0.6603396,
  0.5683067,
  0.47738868,
  0.63064116,
  0.6829172,
  0.6373851,
  0.4006821,
  0.8055185,
  0.49346715,
  0.58780354,
  0.39523467,
  0.5006304,
  0.7529122,
  0.8401776,
  0.7743162,
  0.32950243,
  0.5391019,
  0.33664384,
  0.70119154,
  0.7939787,
  0.57931966,
  0.46954,
  0.38812926,
  0.54807496,
  0.6453051,
  0.70168746,
  0.7258294,
  0.6193686,
  0.61112237,
  0.50393474,
  0.49144652,
  0.722907,
  0.73562276,
  0.60649174,
  0.61951077,
  0.6475868,
  0.57701814,
  0.56163985,
  0.31524086,
  0.530942,
  0.27149183,
  0.6350876,
  0.4358384,
  0.48640892,
  0.57662857,
  0.4556707,
  0.70799917,
  0.5085403,
  0.23501626,
  0.47212577,
  0.31927398,
  0.8115723,
  0.56294936,
  0.31248814,
  0.73326385,
  0.48052034,
  0.7370087,
  0.4655617,
  0.5022606,
  0.42198578,
  0.6901412,
  0.36483878,
  0.6

In [39]:
submission = pd.DataFrame(pred_dict)

In [40]:
submission.to_csv('xgboost1.csv', index=False)