In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
donors_choose = pd.read_csv('train_clean3.csv')

In [3]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,Applied Learning,...,quantity_std,price_count,price_sum,price_min,price_max,price_mean,price_std,mean_price,hour,minute
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,26,1,0.0,...,0.0,2,299.98,149.99,149.99,149.99,0.0,49.996667,14,45
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,1,0,0.0,...,,1,20.0,20.0,20.0,20.0,,1.0,15,57
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,5,1,0.0,...,,1,469.99,469.99,469.99,469.99,,469.99,22,57
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,16,0,0.0,...,0.0,5,684.47,18.95,354.99,136.894,133.428098,136.894,15,42
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,42,1,0.0,...,,1,355.5,355.5,355.5,355.5,,177.75,9,9


In [4]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [5]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category'],drop_first=True)

In [6]:
count_vec = TfidfVectorizer(max_df=0.99, min_df=0.01, norm='l2')
count_vec_df = count_vec.fit_transform(donors_choose['lemm_text'])
count_vec_df = pd.DataFrame(count_vec_df.todense(), index=donors_choose.index, columns=count_vec.get_feature_names())

In [7]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count',
 'essay1_count_nostop',
 'essay2_count_nostop',


In [8]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [9]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [10]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)
y = donors_choose['project_is_approved']

In [11]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'month',
 'dow',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL

In [12]:
pre_x = x.merge(count_vec_df, how='left', left_index=True, right_index=True)
full_x = pre_x.merge(scale_df, how='left', left_index=True, right_index=True)

full_x_array = full_x.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(full_x_array, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((130422, 1816), (32606, 1816), (130422,), (32606,))

In [14]:
list(full_x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'month_x',
 'dow',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_

In [15]:
compute_class_weight('balanced', np.unique(y_train), y_train)

array([3.24545862, 0.59105947])

In [16]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [19]:
max_depth = [2,3,4,5,6,7,8,9,10]
learning_rate = np.linspace(0.0001, 2, 100)
param_grid = {'max_depth':max_depth, 'learning_rate':learning_rate}

In [21]:
model = xgb.XGBClassifier()

model_gs = RandomizedSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_iter=10, verbose=2, n_jobs=-1)

model_gs.fit(x_train, y_train, sample_weight=sample_weight)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] max_depth=6, learning_rate=1.8989949494949496 ...................
[CV] max_depth=6, learning_rate=1.8989949494949496 ...................
[CV] max_depth=6, learning_rate=1.8989949494949496 ...................
[CV] max_depth=6, learning_rate=1.8989949494949496 ...................
[CV] .... max_depth=6, learning_rate=1.8989949494949496, total=25.3min
[CV] max_depth=6, learning_rate=1.8989949494949496 ...................
[CV] .... max_depth=6, learning_rate=1.8989949494949496, total=25.3min
[CV] max_depth=4, learning_rate=1.2929646464646465 ...................
[CV] .... max_depth=6, learning_rate=1.8989949494949496, total=32.5min
[CV] max_depth=4, learning_rate=1.2929646464646465 ...................
[CV] .... max_depth=6, learning_rate=1.8989949494949496, total=33.1min
[CV] max_depth=4, learning_rate=1.2929646464646465 ...................
[CV] .... max_depth=4, learning_rate=1.2929646464646465, total=22.7min
[CV] max_depth=4

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 200.4min


[CV] max_depth=6, learning_rate=1.393969696969697 ....................
[CV] .... max_depth=2, learning_rate=1.8383919191919191, total=11.0min
[CV] max_depth=6, learning_rate=1.393969696969697 ....................
[CV] .... max_depth=9, learning_rate=1.2727636363636363, total=51.7min
[CV] max_depth=6, learning_rate=1.393969696969697 ....................
[CV] ..... max_depth=6, learning_rate=1.393969696969697, total=33.5min
[CV] max_depth=6, learning_rate=1.393969696969697 ....................
[CV] ..... max_depth=6, learning_rate=1.393969696969697, total=33.5min
[CV] max_depth=2, learning_rate=1.5555777777777777 ...................
[CV] ..... max_depth=6, learning_rate=1.393969696969697, total=34.4min
[CV] max_depth=2, learning_rate=1.5555777777777777 ...................
[CV] .... max_depth=2, learning_rate=1.5555777777777777, total=11.2min
[CV] max_depth=2, learning_rate=1.5555777777777777 ...................
[CV] ..... max_depth=6, learning_rate=1.393969696969697, total=35.7min
[CV] m

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 342.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'learning_rate': array([1.0000e-04, 2.0301e-02, ..., 1.9798e+00, 2.0000e+00])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [22]:
pred = model_gs.predict(x_test)

  if diff:


In [23]:
model_gs.score(x_test, y_test)

0.7362921295681928

In [24]:
print(classification_report(y_test, pred,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.28      0.66      0.39      4909
   approved       0.92      0.70      0.79     27697

avg / total       0.82      0.69      0.73     32606



In [25]:
confusion_matrix(y_test, pred)

array([[ 3243,  1666],
       [ 8373, 19324]])

In [26]:
roc_auc_score(y_test, pred)

0.679158117901786

In [27]:
test = pd.read_csv('test_clean3.csv')

In [28]:
test = test.rename({'proj_res_count_nostop':'proj_resouce_count_nostop'}, axis=1)

In [29]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')


test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category'],drop_first=True)

In [30]:
list(test.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 '

In [31]:
count_vec_df_test = count_vec.transform(test['lemm_text'])
count_vec_df_test = pd.DataFrame(count_vec_df_test.todense(), index=test.index, columns=count_vec.get_feature_names())

In [32]:
scale_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [33]:
scale_features_test = scaler.transform(scale_test.values)
scale_test = pd.DataFrame(scale_features_test, index=scale_test.index, columns=scale_test.columns)

In [34]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'proj_res_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)

In [35]:
pre_test_x = test_x.merge(count_vec_df_test, how='left', left_index=True, right_index=True)
full_test_x = pre_test_x.merge(scale_test, how='left', left_index=True, right_index=True)

In [36]:
full_test_x = full_test_x[full_x.columns].as_matrix().astype(np.float)

In [37]:
full_test_x.shape

(78035, 1816)

In [38]:
pred = model_gs.predict_proba(full_test_x)

In [39]:
pred_prob_approved = [x[1] for x in pred]

In [40]:
ids = np.array(test['id'])

In [41]:
pred_dict = {'id':ids, 'project_is_approved': pred_prob_approved}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': [0.77313304,
  0.44574988,
  0.65006286,
  0.8848644,
  0.27281234,
  0.90777254,
  0.95108914,
  0.77922165,
  0.29077974,
  0.9328222,
  0.48341033,
  0.54576087,
  0.4281348,
  0.48483998,
  0.92703706,
  0.83099186,
  0.9210652,
  0.1475335,
  0.47157463,
  0.2915581,
  0.659469,
  0.9366192,
  0.70964557,
  0.56021106,
  0.37428874,
  0.52484065,
  0.883275,
  0.76752234,
  0.66190135,
  0.8377974,
  0.7270613,
  0.68422306,
  0.58109474,
  0.75630265,
  0.82108116,
  0.7654137,
  0.7393805,
  0.55628157,
  0.85737723,
  0.7817702,
  0.10296429,
  0.7176922,
  0.18218145,
  0.59513724,
  0.37063694,
  0.46940464,
  0.2856409,
  0.49468768,
  0.7072062,
  0.44712278,
  0.1840747,
  0.7161427,
  0.65908784,
  0.9764025,
  0.5843147,
  0.20754002,
  0.8625,
  0.4189056,
  0.90240586,
  0.247336,
  0.76446635,
  0.28941235,
  0.45668402,
  0.30912438,
  

In [42]:
submission = pd.DataFrame(pred_dict)

In [43]:
submission.to_csv('xgboost3.csv', index=False)