In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import FeatureUnion

In [2]:
donors_choose = pd.read_csv('train_clean4.csv')

In [3]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_title,project_resource_summary,teacher_number_of_previously_posted_projects,...,project_resource_summary_count,desc_count,essay1_count_nostop,essay2_count_nostop,proj_resouce_count_nostop,desc_count_nostop,month,year,dow,hour
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,super sight word centers,student need ipod nanos create differentiate e...,26,...,20,22.0,80,72,13,22,11,2016,4,14
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,keep calm dance,student need match shirt wear dance performanc...,1,...,12,11.0,54,54,8,11,4,2017,2,15
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,lets doodle learn,student need doodler sem school mean student l...,5,...,33,5.0,80,47,18,5,1,2017,6,22
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,kid inspired equipment increase activities gai...,student need ball activity equipment meet need...,16,...,36,22.0,95,91,19,20,8,2016,4,15
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,need clean water culinary arts class,student need water filtration system culinary ...,42,...,12,10.0,36,49,8,8,8,2016,5,9


In [4]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [5]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')
donors_choose['month'] = donors_choose['month'].astype('category')
donors_choose['dow'] = donors_choose['dow'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category','month', 'dow'],drop_first=True)

In [6]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'project_title',
 'project_resource_summary',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'description',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent I

In [7]:
donors_choose.isnull().any()

id                                              False
teacher_id                                      False
project_submitted_datetime                      False
project_title                                    True
project_resource_summary                        False
teacher_number_of_previously_posted_projects    False
project_is_approved                             False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
quantity_std                                     True
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
price_std                                        True
mean_price                  

In [8]:
donors_choose['project_title'] = donors_choose['project_title'].fillna('')

In [9]:
project_title = CountVectorizer(ngram_range=(1, 2),max_df=0.99,min_df=0.01)
project_title_df = project_title.fit_transform(donors_choose['project_title'])
project_title_df = pd.DataFrame(project_title_df.todense(), index=donors_choose.index, columns=project_title.get_feature_names())
project_title_df = project_title_df.add_suffix('_title')

In [10]:
project_title_df.head()

Unnamed: 0,active_title,art_title,book_title,books_title,building_title,chromebooks_title,class_title,classroom_title,first_title,flexible_title,...,stem_title,students_title,success_title,supplies_title,technology_title,time_title,us_title,wiggle_title,work_title,world_title
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
project_resource = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_resource_df = project_resource.fit_transform(donors_choose['project_resource_summary'])
project_resource_df = pd.DataFrame(project_resource_df.todense(), index=donors_choose.index, columns=project_resource.get_feature_names())
project_resource_df = project_resource_df.add_suffix('_resource')

In [12]:
project_resource_df.head(15)

Unnamed: 0,able_resource,access_resource,activity_resource,allow_resource,ball_resource,book_resource,center_resource,chair_resource,chromebooks_resource,class_resource,...,set_resource,skill_resource,stool_resource,supply_resource,technology_resource,time_resource,use_resource,wobble_resource,work_resource,write_resource
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.327386,0.0,0.0,0.0
3,0.0,0.0,0.788018,0.0,0.550752,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.500106,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.645211,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.690678,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.723162,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.372217,0.0,0.0,0.0,0.34174,0.0,0.0,...,0.0,0.0,0.389723,0.0,0.0,0.0,0.0,0.386581,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555569,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.612242,0.0,0.0,0.0,0.0


In [13]:
donors_choose['student_description'] = donors_choose['description'].fillna('')

In [14]:
project_essay1 = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_essay1_df = project_essay1.fit_transform(donors_choose['student_description'])
project_essay1_df = pd.DataFrame(project_essay1_df.todense(), index=donors_choose.index, columns=project_essay1.get_feature_names())
project_essay1_df = project_essay1_df.add_suffix('_student')

In [15]:
project_essay1_df.head()

Unnamed: 0,activity_student,air_student,apple_student,apple ipad_student,assort_student,assort color_student,bag_student,balance_student,balance ball_student,ball_student,...,toy_student,washable_student,white_student,wifi_student,wifi gb_student,wipe_student,wobble_student,wobble chair_student,write_student,yellow_student
0,0.0,0.0,0.679246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.599007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
project_essay2 = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_essay2_df = project_essay2.fit_transform(donors_choose['project_description'])
project_essay2_df = pd.DataFrame(project_essay2_df.todense(), index=donors_choose.index, columns=project_essay2.get_feature_names())
project_essay2_df = project_essay2_df.add_suffix('_project')

In [17]:
project_essay2_df.head()

Unnamed: 0,ability_project,able_project,able use_project,academic_project,access_project,active_project,activity_project,add_project,addition_project,age_project,...,whole_project,wiggle_project,wobble_project,wonderful_project,word_project,work_project,world_project,write_project,year_project,young_project
0,0.0,0.0,0.0,0.0,0.0,0.0,0.148518,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.780083,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.171424,0.0,0.0,0.0,...,0.251734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.107161,0.0,0.0,0.0,0.0,0.0,0.201057,0.0,0.0,...,0.0,0.0,0.0,0.233955,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.181692,0.397258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
donors_choose['description'] = donors_choose['description'].fillna('')

In [19]:
descr = CountVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
descr_df = descr.fit_transform(donors_choose['description'])
descr_df = pd.DataFrame(descr_df.todense(), index=donors_choose.index, columns=descr.get_feature_names())
descr_df = descr_df.add_suffix('_descr')

In [20]:
descr_df.head()

Unnamed: 0,activity_descr,air_descr,apple_descr,apple ipad_descr,assort_descr,assort color_descr,bag_descr,balance_descr,balance ball_descr,ball_descr,...,toy_descr,washable_descr,white_descr,wifi_descr,wifi gb_descr,wipe_descr,wobble_descr,wobble chair_descr,write_descr,yellow_descr
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [22]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [23]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','price_std','quantity_std','hour','project_title',
 'project_resource_summary',
 'description','year','student_description',
 'project_description'], 1)
y = donors_choose['project_is_approved']

In [24]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [25]:
pre_x1 = x.merge(project_title_df, how='left', left_index=True, right_index=True)
pre_x2 = pre_x1.merge(project_resource_df, how='left', left_index=True, right_index=True)
pre_x3 = pre_x2.merge(project_essay1_df, how='left', left_index=True, right_index=True)
pre_x4 = pre_x3.merge(project_essay2_df, how='left', left_index=True, right_index=True)
pre_x5 = pre_x4.merge(descr_df, how='left', left_index=True, right_index=True)
full_x = pre_x5.merge(scale_df, how='left', left_index=True, right_index=True)

full_x_array = full_x.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(full_x, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((145664, 849), (36416, 849), (145664,), (36416,))

In [27]:
list(full_x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [28]:
compute_class_weight('balanced', np.unique(y_train), y_train)

array([3.28441939, 0.58978533])

In [29]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [30]:
params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 15,
        'num_leaves': 30,
        'learning_rate': 0.1,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 1,
        'lambda_l2': 1,
        'n_estimators': 200,
        
}  
gridParams = {
    'learning_rate': [0.005,0.01,0.1],
    'num_leaves': [15,20,25,30],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'max_depth': [15,20,25]
    }

In [31]:
model = lgb.LGBMClassifier(boosting_type= params['boosting_type'], 
          objective = params['objective'], 
          n_jobs = 4, # Updated from 'nthread'
          max_depth = params['max_depth'],
          class_weight =  {0:3.24545862, 1:0.59105947},
          metric = params['metric'],
          num_leaves = params['num_leaves'],
          learning_rate = params['learning_rate'],
          feature_fraction = params['feature_fraction'],
          bagging_fraction = params['bagging_fraction'],
          bagging_freq = params['bagging_freq'],
          verbose = params['verbose'],
          reg_lambda = params['lambda_l2'],
          n_estimators = params['n_estimators'])

In [32]:
grid = RandomizedSearchCV(model, gridParams, n_iter=10, verbose=1, cv=4, n_jobs=-1)
grid.fit(x_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 17.4min finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=LGBMClassifier(bagging_fraction=0.85, bagging_freq=5, boosting_type='gbdt',
        class_weight={0: 3.24545862, 1: 0.59105947}, colsample_bytree=1.0,
        feature_fraction=0.85, learning_rate=0.1, max_depth=15,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
       ...,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=1, verbose=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'learning_rate': [0.005, 0.01, 0.1], 'num_leaves': [15, 20, 25, 30], 'boosting_type': ['gbdt'], 'objective': ['binary'], 'max_depth': [15, 20, 25]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [36]:
test_preds_lgbm = grid.predict(x_test)

  if diff:


In [37]:
test_preds_lgbm

array([1., 1., 1., ..., 1., 1., 1.])

In [38]:
roc_auc_score(y_test, test_preds_lgbm)

0.7097538609576338

In [42]:
max_depth = [5,10,15,20]
learning_rate = np.linspace(0.0001, 3, 50)
param_grid = {'max_depth':max_depth, 'learning_rate':learning_rate}

In [43]:
model = xgb.XGBClassifier(n_jobs=-1)

model_gs = RandomizedSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', n_iter=10, verbose=2)

model_gs.fit(x_train, y_train, sample_weight=sample_weight)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] max_depth=15, learning_rate=0.06132244897959183 .................
[CV] max_depth=15, learning_rate=0.06132244897959183 .................
[CV] max_depth=15, learning_rate=0.06132244897959183 .................
[CV] max_depth=15, learning_rate=0.06132244897959183 .................
[CV] .. max_depth=15, learning_rate=0.06132244897959183, total=40.7min
[CV] max_depth=15, learning_rate=0.06132244897959183 .................
[CV] .. max_depth=15, learning_rate=0.06132244897959183, total=40.7min
[CV] max_depth=10, learning_rate=2.0204408163265306 ..................
[CV] .. max_depth=15, learning_rate=0.06132244897959183, total=40.8min
[CV] max_depth=10, learning_rate=2.0204408163265306 ..................
[CV] .. max_depth=15, learning_rate=0.06132244897959183, total=40.8min
[CV] max_depth=10, learning_rate=2.0204408163265306 ..................
[CV] ... max_depth=10, learning_rate=2.0204408163265306, total=11.5min
[CV] max_depth=1

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 169.9min


[CV] max_depth=5, learning_rate=2.204108163265306 ....................
[CV] ..... max_depth=5, learning_rate=2.204108163265306, total= 4.1min
[CV] max_depth=5, learning_rate=2.204108163265306 ....................
[CV] ..... max_depth=5, learning_rate=2.204108163265306, total= 3.5min
[CV] max_depth=5, learning_rate=2.204108163265306 ....................
[CV] ..... max_depth=5, learning_rate=2.204108163265306, total= 3.8min
[CV] max_depth=5, learning_rate=2.204108163265306 ....................
[CV] .. max_depth=10, learning_rate=0.12254489795918366, total=23.3min
[CV] max_depth=15, learning_rate=2.7551102040816327 ..................
[CV] ..... max_depth=5, learning_rate=2.204108163265306, total= 3.6min
[CV] max_depth=15, learning_rate=2.7551102040816327 ..................
[CV] ..... max_depth=5, learning_rate=2.204108163265306, total= 3.8min
[CV] max_depth=15, learning_rate=2.7551102040816327 ..................
[CV] ... max_depth=15, learning_rate=2.7551102040816327, total= 6.4min
[CV] m

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 198.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'max_depth': [5, 10, 15, 20], 'learning_rate': array([1.00000e-04, 6.13224e-02, 1.22545e-01, 1.83767e-01, 2.44990e-01,
       3.06212e-01, 3.67435e-01, 4.28657e-01, 4.89880e-01, 5.51102e-01,
       6.12324e-01, 6.73547e-01, 7.34769e-01, 7.95992e-01, 8.57214e-01,
       9.18437e-... 2.63267e+00, 2.69389e+00,
       2.75511e+00, 2.81633e+00, 2.87756e+00, 2.93878e+00, 3.00000e+00])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
    

In [44]:
pred_xgb = model_gs.predict(x_test)

  if diff:


In [45]:
model_gs.score(x_test, y_test)

0.7636688405147845

In [46]:
print(classification_report(y_test, pred_xgb,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.36      0.58      0.44      5559
   approved       0.91      0.82      0.86     30857

avg / total       0.83      0.78      0.80     36416



In [47]:
confusion_matrix(y_test, pred_xgb)

array([[ 3214,  2345],
       [ 5707, 25150]])

In [48]:
roc_auc_score(y_test, pred_xgb)

0.6966058047607724

In [84]:
test = pd.read_csv('test_clean4.csv')

In [85]:
test.drop('Unnamed: 0', 1, inplace=True)

In [86]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')
test['month'] = test['month'].astype('category')
test['dow'] = test['dow'].astype('category')

test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category','month','dow'],drop_first=True)

In [87]:
list(test.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'project_title',
 'project_resource_summary',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'description',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performin

In [88]:
test['project_title'] = test['project_title'].fillna('')

In [89]:
project_title_test_df = project_title.transform(test['project_title'])
project_title_test_df = pd.DataFrame(project_title_test_df.todense(), index=test.index, columns=project_title.get_feature_names())
project_title_test_df = project_title_test_df.add_suffix('_title')

In [90]:
project_title_test_df.head()

Unnamed: 0,active_title,art_title,book_title,books_title,building_title,chromebooks_title,class_title,classroom_title,first_title,flexible_title,...,stem_title,students_title,success_title,supplies_title,technology_title,time_title,us_title,wiggle_title,work_title,world_title
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
project_resource_test_df = project_resource.transform(test['project_resource_summary'])
project_resource_test_df = pd.DataFrame(project_resource_test_df.todense(), index=test.index, columns=project_resource.get_feature_names())
project_resource_test_df = project_resource_test_df.add_suffix('_resource')

In [92]:
project_resource_df.head()

Unnamed: 0,able_resource,access_resource,activity_resource,allow_resource,ball_resource,book_resource,center_resource,chair_resource,chromebooks_resource,class_resource,...,set_resource,skill_resource,stool_resource,supply_resource,technology_resource,time_resource,use_resource,wobble_resource,work_resource,write_resource
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.327386,0.0,0.0,0.0
3,0.0,0.0,0.788018,0.0,0.550752,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
test['student_description'] = test['description'].fillna('')

In [94]:
project_essay1_test_df = project_essay1.transform(test['student_description'])
project_essay1_test_df = pd.DataFrame(project_essay1_test_df.todense(), index=test.index, columns=project_essay1.get_feature_names())
project_essay1_test_df = project_essay1_test_df.add_suffix('_student')

In [95]:
project_essay1_test_df.head()

Unnamed: 0,activity_student,air_student,apple_student,apple ipad_student,assort_student,assort color_student,bag_student,balance_student,balance ball_student,ball_student,...,toy_student,washable_student,white_student,wifi_student,wifi gb_student,wipe_student,wobble_student,wobble chair_student,write_student,yellow_student
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252609
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
project_essay2_test_df = project_essay2.transform(test['project_description'])
project_essay2_test_df = pd.DataFrame(project_essay2_test_df.todense(), index=test.index, columns=project_essay2.get_feature_names())
project_essay2_test_df = project_essay2_test_df.add_suffix('_project')

In [97]:
project_essay2_test_df.head()

Unnamed: 0,ability_project,able_project,able use_project,academic_project,access_project,active_project,activity_project,add_project,addition_project,age_project,...,whole_project,wiggle_project,wobble_project,wonderful_project,word_project,work_project,world_project,write_project,year_project,young_project
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.05722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.226193,0.0,0.084072,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.08761,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.124226,0.067512,0.0,0.0,0.0,0.0
4,0.0,0.086227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17536


In [98]:
test['description'] = test['description'].fillna('')

In [99]:
descr_test_df = descr.transform(test['description'])
descr_test_df = pd.DataFrame(descr_test_df.todense(), index=test.index, columns=descr.get_feature_names())
descr_test_df = descr_test_df.add_suffix('_descr')

In [100]:
descr_df.head()

Unnamed: 0,activity_descr,air_descr,apple_descr,apple ipad_descr,assort_descr,assort color_descr,bag_descr,balance_descr,balance ball_descr,ball_descr,...,toy_descr,washable_descr,white_descr,wifi_descr,wifi gb_descr,wipe_descr,wobble_descr,wobble chair_descr,write_descr,yellow_descr
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
scale_df_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [102]:
scale_features_test = scaler.transform(scale_df_test.values)
scale_df_test = pd.DataFrame(scale_features_test, index=scale_df_test.index, columns=scale_df_test.columns)

In [103]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','price_std','quantity_std','hour','project_title',
 'project_resource_summary',
 'description','year','student_description',
 'project_description'], 1)

In [104]:
pre_x1_test = test_x.merge(project_title_test_df, how='left', left_index=True, right_index=True)
pre_x2_test = pre_x1_test.merge(project_resource_test_df, how='left', left_index=True, right_index=True)
pre_x3_test = pre_x2_test.merge(project_essay1_test_df, how='left', left_index=True, right_index=True)
pre_x4_test = pre_x3_test.merge(project_essay2_test_df, how='left', left_index=True, right_index=True)
pre_x5_test = pre_x4_test.merge(descr_test_df, how='left', left_index=True, right_index=True)
full_x_test = pre_x5_test.merge(scale_df_test, how='left', left_index=True, right_index=True)

In [203]:
test_pred_1 = grid.predict_proba(full_x_test)
test_pred_2 = model_gs.predict_proba(full_x_test)
pred_prob_approved1 = [x[1] * 0.36 for x in test_pred_1]
pred_prob_approved2 = [x[1] * 0.64 for x in test_pred_2]

In [204]:
test_pred = np.add(pred_prob_approved2,pred_prob_approved1)

In [205]:
test_pred

array([0.71205312, 0.684191  , 0.83359388, ..., 0.69826105, 0.89287415,
       0.34516541])

In [206]:
ids = np.array(test['id'])

In [207]:
len(ids)

78035

In [208]:
pred_dict = {'id':ids, 'project_is_approved': test_pred}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': array([0.71205312, 0.684191  , 0.83359388, ..., 0.69826105, 0.89287415,
        0.34516541])}

In [209]:
submission = pd.DataFrame(pred_dict)

In [178]:
submission.to_csv('lgbm_xgb1.csv', index=False)

In [186]:
submission.to_csv('lgbm_xgb2.csv', index=False)

In [210]:
submission.to_csv('lgbm_xgb4.csv', index=False)