In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import FeatureUnion

In [2]:
donors_choose = pd.read_csv('train_clean4.csv')

In [3]:
donors_choose.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_title,project_resource_summary,teacher_number_of_previously_posted_projects,...,project_resource_summary_count,desc_count,essay1_count_nostop,essay2_count_nostop,proj_resouce_count_nostop,desc_count_nostop,month,year,dow,hour
0,0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,super sight word centers,student need ipod nanos create differentiate e...,26,...,20,22.0,80,72,13,22,11,2016,4,14
1,1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,keep calm dance,student need match shirt wear dance performanc...,1,...,12,11.0,54,54,8,11,4,2017,2,15
2,2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,lets doodle learn,student need doodler sem school mean student l...,5,...,33,5.0,80,47,18,5,1,2017,6,22
3,3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,kid inspired equipment increase activities gai...,student need ball activity equipment meet need...,16,...,36,22.0,95,91,19,20,8,2016,4,15
4,4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,need clean water culinary arts class,student need water filtration system culinary ...,42,...,12,10.0,36,49,8,8,8,2016,5,9


In [4]:
donors_choose.drop('Unnamed: 0', 1, inplace=True)

In [5]:
donors_choose['teacher_prefix'] = donors_choose['teacher_prefix'].astype('category')
donors_choose['school_state'] = donors_choose['school_state'].astype('category')
donors_choose['project_grade_category'] = donors_choose['project_grade_category'].astype('category')
donors_choose['month'] = donors_choose['month'].astype('category')
donors_choose['dow'] = donors_choose['dow'].astype('category')

donors_choose = pd.get_dummies(donors_choose,columns = ['teacher_prefix','school_state','project_grade_category','month', 'dow'],drop_first=True)

In [6]:
list(donors_choose.columns)

['id',
 'teacher_id',
 'project_submitted_datetime',
 'project_title',
 'project_resource_summary',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'description',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent I

In [22]:
donors_choose.isnull().any()

id                                              False
teacher_id                                      False
project_submitted_datetime                      False
project_title                                   False
project_resource_summary                        False
teacher_number_of_previously_posted_projects    False
project_is_approved                             False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
quantity_std                                     True
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
price_std                                        True
mean_price                  

In [9]:
donors_choose['project_title'] = donors_choose['project_title'].fillna('')

In [12]:
project_title = CountVectorizer(ngram_range=(1, 2),max_df=0.99,min_df=0.01)
project_title_df = project_title.fit_transform(donors_choose['project_title'])
project_title_df = pd.DataFrame(project_title_df.todense(), index=donors_choose.index, columns=project_title.get_feature_names())
project_title_df = project_title_df.add_suffix('_title')

In [13]:
project_title_df.head()

Unnamed: 0,active_title,art_title,book_title,books_title,building_title,chromebooks_title,class_title,classroom_title,first_title,flexible_title,...,stem_title,students_title,success_title,supplies_title,technology_title,time_title,us_title,wiggle_title,work_title,world_title
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
project_resource = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_resource_df = project_resource.fit_transform(donors_choose['project_resource_summary'])
project_resource_df = pd.DataFrame(project_resource_df.todense(), index=donors_choose.index, columns=project_resource.get_feature_names())
project_resource_df = project_resource_df.add_suffix('_resource')

In [18]:
project_resource_df.head(15)

Unnamed: 0,able_resource,access_resource,activity_resource,allow_resource,ball_resource,book_resource,center_resource,chair_resource,chromebooks_resource,class_resource,...,set_resource,skill_resource,stool_resource,supply_resource,technology_resource,time_resource,use_resource,wobble_resource,work_resource,write_resource
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.327386,0.0,0.0,0.0
3,0.0,0.0,0.788018,0.0,0.550752,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.500106,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.645211,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.690678,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.723162,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.372217,0.0,0.0,0.0,0.34174,0.0,0.0,...,0.0,0.0,0.389723,0.0,0.0,0.0,0.0,0.386581,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555569,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.612242,0.0,0.0,0.0,0.0


In [23]:
donors_choose['student_description'] = donors_choose['description'].fillna('')

In [24]:
project_essay1 = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_essay1_df = project_essay1.fit_transform(donors_choose['student_description'])
project_essay1_df = pd.DataFrame(project_essay1_df.todense(), index=donors_choose.index, columns=project_essay1.get_feature_names())
project_essay1_df = project_essay1_df.add_suffix('_student')

In [25]:
project_essay1_df.head()

Unnamed: 0,activity_student,air_student,apple_student,apple ipad_student,assort_student,assort color_student,bag_student,balance_student,balance ball_student,ball_student,...,toy_student,washable_student,white_student,wifi_student,wifi gb_student,wipe_student,wobble_student,wobble chair_student,write_student,yellow_student
0,0.0,0.0,0.679246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.599007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
project_essay2 = TfidfVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
project_essay2_df = project_essay2.fit_transform(donors_choose['project_description'])
project_essay2_df = pd.DataFrame(project_essay2_df.todense(), index=donors_choose.index, columns=project_essay2.get_feature_names())
project_essay2_df = project_essay2_df.add_suffix('_project')

In [27]:
project_essay2_df.head()

Unnamed: 0,ability_project,able_project,able use_project,academic_project,access_project,active_project,activity_project,add_project,addition_project,age_project,...,whole_project,wiggle_project,wobble_project,wonderful_project,word_project,work_project,world_project,write_project,year_project,young_project
0,0.0,0.0,0.0,0.0,0.0,0.0,0.148518,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.780083,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.171424,0.0,0.0,0.0,...,0.251734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.107161,0.0,0.0,0.0,0.0,0.0,0.201057,0.0,0.0,...,0.0,0.0,0.0,0.233955,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.181692,0.397258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
donors_choose['description'] = donors_choose['description'].fillna('')

In [31]:
descr = CountVectorizer(ngram_range=(1, 2),max_df=0.96,min_df=0.03)
descr_df = descr.fit_transform(donors_choose['description'])
descr_df = pd.DataFrame(descr_df.todense(), index=donors_choose.index, columns=descr.get_feature_names())
descr_df = descr_df.add_suffix('_descr')

In [32]:
descr_df.head()

Unnamed: 0,activity_descr,air_descr,apple_descr,apple ipad_descr,assort_descr,assort color_descr,bag_descr,balance_descr,balance ball_descr,ball_descr,...,toy_descr,washable_descr,white_descr,wifi_descr,wifi gb_descr,wipe_descr,wobble_descr,wobble chair_descr,write_descr,yellow_descr
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
scale_df = donors_choose[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [34]:
scaler = MinMaxScaler()
scale_features = scaler.fit_transform(scale_df.values)
scale_df= pd.DataFrame(scale_features, index=scale_df.index, columns=scale_df.columns)

In [45]:
x = donors_choose.drop(['teacher_id','project_submitted_datetime','project_is_approved','id','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','price_std','quantity_std','hour','project_title',
 'project_resource_summary',
 'description','year','student_description',
 'project_description'], 1)
y = donors_choose['project_is_approved']

In [46]:
list(x.columns)

['Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvement',
 'Performing Arts',
 'Social Sciences',
 'Team Sports',
 'Visual Arts',
 'teacher_prefix_Mr.',
 'teacher_prefix_Mrs.',
 'teacher_prefix_Ms.',
 'teacher_prefix_Teacher',
 'school_state_AL',
 'school_state_AR',
 'school_state_AZ',
 'school_state_CA',
 'school_state_CO',
 'school_state_CT',
 'school_state_DC',
 'school_state_DE',
 'school_state_FL',
 'school_state_

In [47]:
pre_x1 = x.merge(project_title_df, how='left', left_index=True, right_index=True)
pre_x2 = pre_x1.merge(project_resource_df, how='left', left_index=True, right_index=True)
pre_x3 = pre_x2.merge(project_essay1_df, how='left', left_index=True, right_index=True)
pre_x4 = pre_x3.merge(project_essay2_df, how='left', left_index=True, right_index=True)
pre_x5 = pre_x4.merge(descr_df, how='left', left_index=True, right_index=True)
full_x = pre_x5.merge(scale_df, how='left', left_index=True, right_index=True)

In [48]:
x_train, x_test, y_train, y_test = train_test_split(full_x, y, test_size = 0.2, random_state=17)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((145664, 849), (36416, 849), (145664,), (36416,))

In [None]:
list(full_x.columns)

In [None]:
compute_class_weight('balanced', np.unique(y_train), y_train)

In [None]:
sample_weight = compute_sample_weight({0:3.24545862, 1:0.59105947}, y_train)

In [49]:
params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 16,
        'num_leaves': 31,
        'learning_rate': 0.25,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 1,
        'num_threads': 4,
        'lambda_l2': 1,
        'min_gain_to_split': 0,
}  

In [51]:
model = lgb.train(
        params,
        lgb.Dataset(x_train, y_train),
        num_boost_round=200,
        early_stopping_rounds=25,
        valid_sets=[lgb.Dataset(x_test, y_test)])

[1]	valid_0's auc: 0.703455
Training until validation scores don't improve for 25 rounds.
[2]	valid_0's auc: 0.719172
[3]	valid_0's auc: 0.72245
[4]	valid_0's auc: 0.725314
[5]	valid_0's auc: 0.729111
[6]	valid_0's auc: 0.731024
[7]	valid_0's auc: 0.735074
[8]	valid_0's auc: 0.73889
[9]	valid_0's auc: 0.741676
[10]	valid_0's auc: 0.744182
[11]	valid_0's auc: 0.746231
[12]	valid_0's auc: 0.748437
[13]	valid_0's auc: 0.750219
[14]	valid_0's auc: 0.751722
[15]	valid_0's auc: 0.753887
[16]	valid_0's auc: 0.755201
[17]	valid_0's auc: 0.756136
[18]	valid_0's auc: 0.758378
[19]	valid_0's auc: 0.759851
[20]	valid_0's auc: 0.762222
[21]	valid_0's auc: 0.762715
[22]	valid_0's auc: 0.763133
[23]	valid_0's auc: 0.764063
[24]	valid_0's auc: 0.764933
[25]	valid_0's auc: 0.765372
[26]	valid_0's auc: 0.766013
[27]	valid_0's auc: 0.766758
[28]	valid_0's auc: 0.76719
[29]	valid_0's auc: 0.767677
[30]	valid_0's auc: 0.767807
[31]	valid_0's auc: 0.768276
[32]	valid_0's auc: 0.768454
[33]	valid_0's auc: 0.

In [56]:
test_preds = model.predict(x_test)

In [62]:
len(test_preds)

36416

In [73]:
y_pred = []
for i in range(0,36416):
    if test_preds[i] > 0.5:       # setting threshold to .5
        y_pred.append(1)
    else:  
        y_pred.append(0)

In [74]:
y_pred

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [75]:
accuracy_score(y_test, y_pred)

0.8530865553602812

In [76]:
print(classification_report(y_test, y_pred,
     target_names=['rejected','approved']))

             precision    recall  f1-score   support

   rejected       0.57      0.16      0.25      5559
   approved       0.87      0.98      0.92     30857

avg / total       0.82      0.85      0.82     36416



In [78]:
roc_auc_score(y_test, test_preds)

0.7701933172305258

In [79]:
test = pd.read_csv('test_clean4.csv')

In [80]:
test['teacher_prefix'] = test['teacher_prefix'].astype('category')
test['school_state'] = test['school_state'].astype('category')
test['project_grade_category'] = test['project_grade_category'].astype('category')
test['month'] = test['month'].astype('category')
test['dow'] = test['dow'].astype('category')

test = pd.get_dummies(test,columns = ['teacher_prefix','school_state','project_grade_category','month','dow'],drop_first=True)

In [81]:
list(test.columns)

['Unnamed: 0',
 'id',
 'teacher_id',
 'project_submitted_datetime',
 'project_title',
 'project_resource_summary',
 'teacher_number_of_previously_posted_projects',
 'quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'quantity_std',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'price_std',
 'mean_price',
 'description',
 'Applied Learning',
 'Health & Sports',
 'History & Civics',
 'Literacy & Language',
 'Math & Science',
 'Music & The Arts',
 'Special Needs_x',
 'Warmth Care & Hunger_x',
 'Applied Sciences',
 'Character Education',
 'Civics & Government',
 'College & Career Prep',
 'Community Service',
 'ESL',
 'Early Development',
 'Economics',
 'Environmental Science',
 'Extracurricular',
 'Financial Literacy',
 'Foreign Languages',
 'Gym & Fitness',
 'Health & Life Science',
 'Health & Wellness',
 'History & Geography',
 'Literacy',
 'Literature & Writing',
 'Mathematics',
 'Music',
 'Nutrition Education',
 'Other',
 'Parent Involvemen

In [82]:
test['project_title'] = test['project_title'].fillna('')

In [85]:
project_title_test_df = project_title.transform(test['project_title'])
project_title_test_df = pd.DataFrame(project_title_test_df.todense(), index=test.index, columns=project_title.get_feature_names())
project_title_test_df = project_title_test_df.add_suffix('_title')

In [86]:
project_title_test_df.head()

Unnamed: 0,active_title,art_title,book_title,books_title,building_title,chromebooks_title,class_title,classroom_title,first_title,flexible_title,...,stem_title,students_title,success_title,supplies_title,technology_title,time_title,us_title,wiggle_title,work_title,world_title
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
project_resource_test_df = project_resource.transform(test['project_resource_summary'])
project_resource_test_df = pd.DataFrame(project_resource_test_df.todense(), index=test.index, columns=project_resource.get_feature_names())
project_resource_test_df = project_resource_test_df.add_suffix('_resource')

In [88]:
project_resource_df.head()

Unnamed: 0,able_resource,access_resource,activity_resource,allow_resource,ball_resource,book_resource,center_resource,chair_resource,chromebooks_resource,class_resource,...,set_resource,skill_resource,stool_resource,supply_resource,technology_resource,time_resource,use_resource,wobble_resource,work_resource,write_resource
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.327386,0.0,0.0,0.0
3,0.0,0.0,0.788018,0.0,0.550752,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
test['student_description'] = test['description'].fillna('')

In [90]:
project_essay1_test_df = project_essay1.transform(test['student_description'])
project_essay1_test_df = pd.DataFrame(project_essay1_test_df.todense(), index=test.index, columns=project_essay1.get_feature_names())
project_essay1_test_df = project_essay1_test_df.add_suffix('_student')

In [91]:
project_essay1_test_df.head()

Unnamed: 0,activity_student,air_student,apple_student,apple ipad_student,assort_student,assort color_student,bag_student,balance_student,balance ball_student,ball_student,...,toy_student,washable_student,white_student,wifi_student,wifi gb_student,wipe_student,wobble_student,wobble chair_student,write_student,yellow_student
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252609
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
project_essay2_test_df = project_essay2.transform(test['project_description'])
project_essay2_test_df = pd.DataFrame(project_essay2_test_df.todense(), index=test.index, columns=project_essay2.get_feature_names())
project_essay2_test_df = project_essay2_test_df.add_suffix('_project')

In [93]:
project_essay2_test_df.head()

Unnamed: 0,ability_project,able_project,able use_project,academic_project,access_project,active_project,activity_project,add_project,addition_project,age_project,...,whole_project,wiggle_project,wobble_project,wonderful_project,word_project,work_project,world_project,write_project,year_project,young_project
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.05722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.226193,0.0,0.084072,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.08761,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.124226,0.067512,0.0,0.0,0.0,0.0
4,0.0,0.086227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17536


In [94]:
test['description'] = test['description'].fillna('')

In [96]:
descr_test_df = descr.transform(test['description'])
descr_test_df = pd.DataFrame(descr_test_df.todense(), index=test.index, columns=descr.get_feature_names())
descr_test_df = descr_test_df.add_suffix('_descr')

In [97]:
descr_df.head()

Unnamed: 0,activity_descr,air_descr,apple_descr,apple ipad_descr,assort_descr,assort color_descr,bag_descr,balance_descr,balance ball_descr,ball_descr,...,toy_descr,washable_descr,white_descr,wifi_descr,wifi gb_descr,wipe_descr,wobble_descr,wobble chair_descr,write_descr,yellow_descr
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
scale_df_test = test[['teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price']]

scale_df_test.isna().any()

teacher_number_of_previously_posted_projects    False
quantity_sum                                    False
quantity_min                                    False
quantity_max                                    False
quantity_mean                                   False
price_count                                     False
price_sum                                       False
price_min                                       False
price_max                                       False
price_mean                                      False
mean_price                                      False
dtype: bool

In [99]:
scale_features_test = scaler.transform(scale_df_test.values)
scale_df_test = pd.DataFrame(scale_features_test, index=scale_df_test.index, columns=scale_df_test.columns)

In [112]:
test_x = test.drop(['teacher_id','project_submitted_datetime','id','teacher_number_of_previously_posted_projects','quantity_sum',
 'quantity_min',
 'quantity_max',
 'quantity_mean',
 'price_count',
 'price_sum',
 'price_min',
 'price_max',
 'price_mean',
 'mean_price','essay1_count',
 'essay2_count',
 'project_resource_summary_count',
 'desc_count','essay1_count_nostop',
 'essay2_count_nostop',
 'proj_resouce_count_nostop',
 'desc_count_nostop','price_std','quantity_std','hour','project_title',
 'project_resource_summary',
 'description','year','student_description',
 'project_description'], 1)
y = donors_choose['project_is_approved']

In [113]:
pre_x1_test = test_x.merge(project_title_test_df, how='left', left_index=True, right_index=True)
pre_x2_test = pre_x1_test.merge(project_resource_test_df, how='left', left_index=True, right_index=True)
pre_x3_test = pre_x2_test.merge(project_essay1_test_df, how='left', left_index=True, right_index=True)
pre_x4_test = pre_x3_test.merge(project_essay2_test_df, how='left', left_index=True, right_index=True)
pre_x5_test = pre_x4_test.merge(descr_test_df, how='left', left_index=True, right_index=True)
full_x_test = pre_x5_test.merge(scale_df_test, how='left', left_index=True, right_index=True)

In [114]:
full_x_test.shape

(78035, 850)

In [115]:
test_pred = model.predict(full_x_test)

In [116]:
len(test_pred)

78035

In [117]:
ids = np.array(test['id'])

In [120]:
len(ids)

78035

In [121]:
pred_dict = {'id':ids, 'project_is_approved': test_pred}
pred_dict

{'id': array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
        'p087783'], dtype=object),
 'project_is_approved': array([0.72655242, 0.63305238, 0.96456985, ..., 0.71492192, 0.94689149,
        0.85680794])}

In [122]:
submission = pd.DataFrame(pred_dict)

In [123]:
submission.to_csv('lgbm1.csv', index=False)