In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

In [3]:
donors_choose = pd.read_csv('train_clean3.csv')

In [4]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,Applied Learning,...,quantity_std,price_count,price_sum,price_min,price_max,price_mean,price_std,mean_price,hour,minute
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,26,1,0.0,...,0.0,2,299.98,149.99,149.99,149.99,0.0,49.996667,14,45
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,1,0,0.0,...,,1,20.0,20.0,20.0,20.0,,1.0,15,57
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,5,1,0.0,...,,1,469.99,469.99,469.99,469.99,,469.99,22,57
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,16,0,0.0,...,0.0,5,684.47,18.95,354.99,136.894,133.428098,136.894,15,42
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,42,1,0.0,...,,1,355.5,355.5,355.5,355.5,,177.75,9,9


In [5]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [6]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')
donors_choose['month'] = donors_choose['month'].astype('category')
donors_choose['dow'] = donors_choose['dow'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category','month', 'dow'],drop_first=True)

In [7]:
count_vec = TfidfVectorizer(max_df=0.96, min_df=0.05, norm='l2')
count_vec_df = count_vec.fit_transform(donors_choose['lemm_text'])
count_vec_df = pd.DataFrame(count_vec_df.todense(), index=donors_choose.index, columns=count_vec.get_feature_names())

In [8]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count',
 'essay1_count_nostop',
 'essay2_count_nostop',


In [9]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [10]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [17]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)
y = donors_choose['project_is_approved']

In [18]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [19]:
pre_x = x.merge(count_vec_df, how='left', left_index=True, right_index=True)
full_x = pre_x.merge(scale_df, how='left', left_index=True, right_index=True)

full_x_array = full_x.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(full_x_array, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((130422, 594), (32606, 594), (130422,), (32606,))

In [21]:
list(full_x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [22]:
compute_class_weight('balanced', np.unique(y_train), y_train)

array([3.24545862, 0.59105947])

In [23]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [24]:
max_depth = [3,4,5,6]
learning_rate = np.linspace(0.0001, 3, 50)
param_grid = {'max_depth':max_depth, 'learning_rate':learning_rate}

In [25]:
model = xgb.XGBClassifier(n_jobs=-1)

model_gs = RandomizedSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_iter=10, verbose=2)

model_gs.fit(x_train, y_train, sample_weight=sample_weight)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] max_depth=5, learning_rate=1.5918836734693875 ...................
[CV] .... max_depth=5, learning_rate=1.5918836734693875, total= 6.0min
[CV] max_depth=5, learning_rate=1.5918836734693875 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.0min remaining:    0.0s


[CV] .... max_depth=5, learning_rate=1.5918836734693875, total= 6.1min
[CV] max_depth=5, learning_rate=1.5918836734693875 ...................
[CV] .... max_depth=5, learning_rate=1.5918836734693875, total= 6.2min
[CV] max_depth=5, learning_rate=1.5918836734693875 ...................
[CV] .... max_depth=5, learning_rate=1.5918836734693875, total= 5.8min
[CV] max_depth=5, learning_rate=1.5918836734693875 ...................
[CV] .... max_depth=5, learning_rate=1.5918836734693875, total= 5.8min
[CV] max_depth=5, learning_rate=2.0204408163265306 ...................
[CV] .... max_depth=5, learning_rate=2.0204408163265306, total= 2.3min
[CV] max_depth=5, learning_rate=2.0204408163265306 ...................
[CV] .... max_depth=5, learning_rate=2.0204408163265306, total= 2.0min
[CV] max_depth=5, learning_rate=2.0204408163265306 ...................
[CV] .... max_depth=5, learning_rate=2.0204408163265306, total= 2.1min
[CV] max_depth=5, learning_rate=2.0204408163265306 ...................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 182.3min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [3, 4, 5, 6], 'learning_rate': array([1.00000e-04, 6.13224e-02, 1.22545e-01, 1.83767e-01, 2.44990e-01,
       3.06212e-01, 3.67435e-01, 4.28657e-01, 4.89880e-01, 5.51102e-01,
       6.12324e-01, 6.73547e-01, 7.34769e-01, 7.95992e-01, 8.57214e-01,
       9.18437e-01,... 2.63267e+00, 2.69389e+00,
       2.75511e+00, 2.81633e+00, 2.87756e+00, 2.93878e+00, 3.00000e+00])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
     

In [26]:
pred = model_gs.predict(x_test)

  if diff:


In [27]:
model_gs.score(x_test, y_test)

0.7389343031290954

In [28]:
print(classification_report(y_test, pred,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.28      0.67      0.39      4909
   approved       0.92      0.69      0.79     27697

avg / total       0.82      0.69      0.73     32606



In [29]:
confusion_matrix(y_test, pred)

array([[ 3266,  1643],
       [ 8453, 19244]])

In [30]:
roc_auc_score(y_test, pred)

0.6800565541437033

In [31]:
test = pd.read_csv('test_clean3.csv')

In [32]:
test = test.rename({'proj_res_count_nostop':'proj_resouce_count_nostop'}, axis=1)

In [33]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')
test['month'] = test['month'].astype('category')
test['dow'] = test['dow'].astype('category')

test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category','month','dow'],drop_first=True)

In [34]:
list(test.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 '

In [35]:
count_vec_df_test = count_vec.transform(test['lemm_text'])
count_vec_df_test = pd.DataFrame(count_vec_df_test.todense(), index=test.index, columns=count_vec.get_feature_names())

In [36]:
scale_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [37]:
scale_features_test = scaler.transform(scale_test.values)
scale_test = pd.DataFrame(scale_features_test, index=scale_test.index, columns=scale_test.columns)

In [38]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','full_text','lemm_text','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'essay3_count',
 'essay4_count',
 'proj_res_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'essay3_count_nostop',
 'essay4_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','minute','price_std','quantity_std','hour'], 1)

In [49]:
pre_test_x = test_x.merge(count_vec_df_test, how='left', left_index=True, right_index=True)
full_test_x = pre_test_x.merge(scale_test, how='left', left_index=True, right_index=True)

In [51]:
full_test_x = full_test_x[full_x.columns].as_matrix().astype(np.float)

In [52]:
full_test_x.shape

(78035, 594)

In [53]:
pred = model_gs.predict_proba(full_test_x)

In [54]:
pred_prob_approved = [x[1] for x in pred]

In [55]:
ids = np.array(test['id'])

In [56]:
pred_dict = {'id':ids, 'project_is_approved': pred_prob_approved}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': [0.726335,
  0.65027463,
  0.5923178,
  0.6089584,
  0.47600594,
  0.70158917,
  0.66081333,
  0.71240777,
  0.4308978,
  0.9073456,
  0.5282836,
  0.65599495,
  0.29876378,
  0.47296995,
  0.707054,
  0.85986567,
  0.8174756,
  0.3101503,
  0.528037,
  0.28239688,
  0.7123717,
  0.8742967,
  0.6107387,
  0.45423388,
  0.42631575,
  0.5371591,
  0.67174536,
  0.73620003,
  0.79624534,
  0.6741571,
  0.6510378,
  0.5083904,
  0.5462803,
  0.720865,
  0.81869096,
  0.6359385,
  0.62463385,
  0.60562015,
  0.579221,
  0.55347264,
  0.33509365,
  0.4850194,
  0.34292555,
  0.63539344,
  0.46168226,
  0.52210045,
  0.5553521,
  0.52194136,
  0.6860828,
  0.5284089,
  0.32430354,
  0.5565326,
  0.3366008,
  0.8398182,
  0.55182344,
  0.26914203,
  0.7985068,
  0.4960216,
  0.8243648,
  0.38091862,
  0.6108618,
  0.41780964,
  0.6536929,
  0.43449107,
  0.716701

In [57]:
submission = pd.DataFrame(pred_dict)

In [58]:
submission.to_csv('xgboost1.csv', index=False)