In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import os

%matplotlib inline

In [None]:
# Column -> dtype mapping handed to pd.read_csv for both train.csv and test.csv.
# Every text/identifier column is read as str; the two numeric columns as int.
train_test_dtypes = {
    **dict.fromkeys(
        [
            'id', 'teacher_id', 'teacher_prefix', 'school_state',
            'project_submitted_datetime', 'project_grade_category',
            'project_subject_categories', 'project_subject_subcategories',
            'project_title', 'project_essay_1', 'project_essay_2',
            'project_essay_3', 'project_essay_4', 'project_resource_summary',
        ],
        str,
    ),
    **dict.fromkeys(
        ['teacher_number_of_previously_posted_projects', 'project_is_approved'],
        int,
    ),
}

In [None]:
# train.csv and test.csv share the same parser options; resources.csv is read
# with pandas' inferred dtypes.
_proposal_csv_options = dict(sep=',', dtype=train_test_dtypes, low_memory=True)

train_data_raw = pd.read_csv('../input/donorschoose-application-screening/train.csv', **_proposal_csv_options)
test_data_raw = pd.read_csv('../input/donorschoose-application-screening/test.csv', **_proposal_csv_options)
resource_data_raw = pd.read_csv('../input/donorschoose-application-screening/resources.csv', sep=',')

Change the project_submitted_datetime column to type datetime64, and extract year, month, and day as new features

In [None]:
# The same date handling is applied to the training frame first, then the test frame.
# year / month / day are kept as the raw string pieces of "YYYY-MM-DD HH:MM:SS";
# only afterwards is the original column parsed into datetime64.
for frame in (train_data_raw, test_data_raw):
    stamps = frame.project_submitted_datetime
    frame['year'] = stamps.apply(lambda stamp: stamp.split("-")[0])
    frame['month'] = stamps.apply(lambda stamp: stamp.split("-")[1])
    # Third dash-separated piece is "DD HH:MM:SS"; keep only the part before the space.
    frame['day'] = stamps.apply(lambda stamp: stamp.split("-")[2].split(" ")[0])
    frame['project_submitted_datetime'] = pd.to_datetime(stamps, format="%Y-%m-%d %H:%M:%S")

Create stats features based off of the quantity and price info in the resources dataset. Merge these into the training and test datasets

In [None]:
# New frame (the raw resources frame is left untouched) with the line total
# quantity * price for every resource row.
resource_data_total = resource_data_raw.assign(
    total=resource_data_raw['quantity'] * resource_data_raw['price']
)
resource_data_total.head()

In [None]:
# Per-proposal summary statistics of the resource line totals; the resulting
# columns are named after the aggregations themselves.
total_stat_names = ['count', 'sum', 'min', 'max', 'mean', 'median', 'std']
totals_by_id = resource_data_total[['id', 'total']].groupby('id').total
res = totals_by_id.agg(total_stat_names).reset_index()
print(res.head())

In [None]:
# Attach the per-proposal resource statistics (pandas' default merge type) on the proposal id.
train_data_raw = pd.merge(train_data_raw, res, on='id')
test_data_raw = pd.merge(test_data_raw, res, on='id')

Some item descriptions are NaN; replace them with empty strings.

In [None]:
# Empty string stands in for every missing value so the descriptions can be
# joined as text in the next step; .info() prints the resulting column summary.
resource_dropped = resource_data_raw.fillna(value='')
resource_dropped.info()

Create a pivot_table that joins all of the resources descriptions together by proposal id, then merge these combined descriptions into the training and test datasets.

In [None]:
# One row per proposal id: all of its resource descriptions joined with ';'.
descriptions_by_id = resource_dropped.groupby('id').description
pivot_table = descriptions_by_id.apply(';'.join).reset_index()

In [None]:
# Attach the combined resource description text to each proposal.
train_data_raw = pd.merge(train_data_raw, pivot_table, on='id')
test_data_raw = pd.merge(test_data_raw, pivot_table, on='id')

Some STDDEV values were NAN, replace these with 0.0

In [None]:
# Only the 'std' column is filled; every other column keeps its NaNs.
values = {'std': 0.0}
for frame in (train_data_raw, test_data_raw):
    frame.fillna(value=values, inplace=True)

For earlier proposals, there were 4 essay questions instead of 2. For these, combine 1&2 as essay 1, and 3&4 as essay 2. Then remove essay 3 & 4 columns from the dataframe

In [None]:
# Rows that still carry a third essay use the older four-question format.
essay_3_4_nonull_filter = train_data_raw.project_essay_3.notnull()

# Snapshot of those rows taken before any assignment, so both concatenations
# read the original essay texts.
four_essay_rows = train_data_raw[essay_3_4_nonull_filter]
merged_first = four_essay_rows.project_essay_1.str.cat(four_essay_rows.project_essay_2)
merged_second = four_essay_rows.project_essay_3.str.cat(four_essay_rows.project_essay_4)

train_data_raw.loc[essay_3_4_nonull_filter, 'project_essay_1'] = merged_first
train_data_raw.loc[essay_3_4_nonull_filter, 'project_essay_2'] = merged_second

train_data_raw.drop(['project_essay_3', 'project_essay_4'], axis=1, inplace=True)

Drop remaining 4 rows in training dataset which have Null values (in teacher_prefix)

In [None]:
# Selects the rows that contain at least one null value. NOTE(review): this is a
# bare expression that is not the last statement of the cell, so its result is
# neither stored nor rendered by the notebook.
train_data_raw[pd.isnull(train_data_raw).any(axis=1)]
# Drop, in place, every training row that has a null in any column.
train_data_raw.dropna(inplace=True)

In [None]:
# Rows that still carry a third essay use the older four-question format.
test_essay_3_4_nonull_filter = test_data_raw.project_essay_3.notnull()

# Snapshot of those rows taken before any assignment, so both concatenations
# read the original essay texts.
four_essay_test_rows = test_data_raw[test_essay_3_4_nonull_filter]
merged_first_test = four_essay_test_rows.project_essay_1.str.cat(four_essay_test_rows.project_essay_2)
merged_second_test = four_essay_test_rows.project_essay_3.str.cat(four_essay_test_rows.project_essay_4)

test_data_raw.loc[test_essay_3_4_nonull_filter, 'project_essay_1'] = merged_first_test
test_data_raw.loc[test_essay_3_4_nonull_filter, 'project_essay_2'] = merged_second_test

test_data_raw.drop(['project_essay_3', 'project_essay_4'], axis=1, inplace=True)

The test dataset has one NaN value for teacher_prefix. Replace that NaN with "Teacher".

In [None]:
# Fill only the teacher_prefix column. A blanket fillna("Teacher") would also
# write the string "Teacher" into any other column that happens to contain a
# NaN (numeric or text), silently corrupting it instead of surfacing the gap.
test_data_raw.fillna(value={'teacher_prefix': "Teacher"}, inplace=True)

Scale the numeric data 

In [None]:
from sklearn.preprocessing import StandardScaler

num_features = ['teacher_number_of_previously_posted_projects', 'count', 'sum', 'min', 'max', 'mean', 'median', 'std']
scalar = StandardScaler()
# Learn the mean / variance from the training data only ...
X_train_num = scalar.fit_transform(train_data_raw[num_features])
# ... and reuse those statistics for the test data. Re-fitting on the test set
# would standardise it with different parameters, so the same raw value would
# map to different model inputs at training and prediction time.
X_test_num = scalar.transform(test_data_raw[num_features])

Create new hash features from the categorical data

In [None]:
# Categorical input columns, and the names of the hashed columns derived from them.
cat_features = [
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'teacher_prefix',
    'school_state',
    'year',
    'month',
    'day',
]
cat_features_hash = ["{}_hash".format(name) for name in cat_features]

In [None]:
#create new categorical hash features
max_size=1501
def feature_hash(df, max_size=max_size, columns=None):
    """Add a ``<col>_hash`` integer column for every categorical column.

    Each value is mapped to a bucket id in ``1 .. max_size - 1``.

    The bucket is derived from a CRC32 of the value's UTF-8 text rather than
    the built-in ``hash()``: Python salts ``hash()`` for ``str`` per process,
    so the built-in would assign different buckets after every kernel restart
    and previously saved model weights would no longer match the inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns; it is modified in place.
    max_size : int
        One more than the largest bucket id that can be produced.
    columns : iterable of str, optional
        Columns to hash. Defaults to the notebook-level ``cat_features`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new ``*_hash`` columns added.
    """
    import zlib  # local import: keeps this cell runnable without touching the import cell

    if columns is None:
        columns = cat_features
    n_buckets = max_size - 1

    def _bucket(value):
        # +1 keeps 0 out of the produced ids.
        return zlib.crc32(str(value).encode('utf-8')) % n_buckets + 1

    for col in columns:
        df[col+"_hash"] = df[col].apply(_bucket)
    return df

# Add the *_hash columns to both frames, then pull them out as integer matrices.
train_data_raw = feature_hash(train_data_raw)
test_data_raw = feature_hash(test_data_raw)
# `np.int` was only an alias for the built-in int and has been removed from
# recent NumPy releases; the built-in gives the same dtype on every version.
X_train_cat = np.array(train_data_raw[cat_features_hash], dtype=int)
X_test_cat = np.array(test_data_raw[cat_features_hash], dtype=int)

In [None]:
# Build a single 'text' column: the text fields below, in this order, joined by one space.
text_features_final = ['project_title', 'project_essay_1', 'project_essay_2', 'project_resource_summary', 'description']

train_data_raw['text'] = train_data_raw[text_features_final].apply(" ".join, axis=1)
test_data_raw['text'] = test_data_raw[text_features_final].apply(" ".join, axis=1)

Tokenize the training text data

In [None]:
EMBEDDING_DIM = 300
max_features = 50000

# The vocabulary is fitted on the training and test texts together; only the
# training texts are converted to id sequences here.
train_texts = train_data_raw['text'].tolist()
test_texts = test_data_raw['text'].tolist()

t = Tokenizer(num_words=max_features)
t.fit_on_texts(train_texts + test_texts)
sequences = t.texts_to_sequences(train_texts)

word_index = t.word_index
print('Found %s unique tokens.' % len(word_index))

# NOTE(review): EMBEDDING_DIM is reused as the padded sequence length (maxlen).
X_train_word = pad_sequences(sequences, maxlen=EMBEDDING_DIM)
y_train = train_data_raw.project_is_approved

Split off 10% of the training data as a validation set

In [None]:
from sklearn.model_selection import train_test_split

# One unshuffled split over all three feature matrices plus the labels: the
# last 10% of rows become the validation set for every input.
(X_train_num_split, X_val_num,
 X_train_cat_split, X_val_cat,
 X_train_word_split, X_val_word,
 y_train_num_split, y_val_num) = train_test_split(
    X_train_num, X_train_cat, X_train_word, y_train, test_size=.1, shuffle=False)

# The labels are the same for every input; keep the per-input names used further down.
y_train_cat_split = y_train_word_split = y_train_num_split
y_val_cat = y_val_word = y_val_num

Oversample the minority data

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)

# Resample ONCE on the column-wise concatenation of the three feature matrices.
# Running SMOTE separately per matrix searches nearest neighbours in three
# different feature spaces, so row i of the numeric, word and categorical
# outputs would no longer describe the same synthetic sample.
n_num_cols = X_train_num_split.shape[1]
n_word_cols = X_train_word_split.shape[1]
X_train_joint = np.hstack([X_train_num_split, X_train_word_split, X_train_cat_split])

# `fit_sample` was renamed to `fit_resample` in newer imbalanced-learn releases;
# use whichever the installed version provides.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train_joint_smote, y_train_smote = _resample(X_train_joint, y_train_num_split)

# Split the joint matrix back into the three inputs, restoring each input's dtype.
X_train_num_smote = X_train_joint_smote[:, :n_num_cols].astype(X_train_num_split.dtype)
X_train_word_smote = X_train_joint_smote[:, n_num_cols:n_num_cols + n_word_cols].astype(X_train_word_split.dtype)
X_train_cat_smote = X_train_joint_smote[:, n_num_cols + n_word_cols:].astype(X_train_cat_split.dtype)

# One label vector now belongs to all three inputs; keep the names used further down.
y_train_num_smote = y_train_word_smote = y_train_cat_smote = y_train_smote

# NOTE(review): SMOTE interpolates between rows, so synthetic token ids and
# hash-bucket ids are numeric blends rather than real ids — consider a
# duplication-based oversampler for those inputs.

Use pre-trained embeddings

In [None]:
#GLOVE_DIR = '../input/glove-global-vectors-for-word-representation'
FASTTEXT_DIR = '../input/fatsttext-common-crawl/crawl-300d-2M'
embeddings_index = {}
#f = open(os.path.join(GLOVE_DIR, 'glove.6B.200d.txt'))
# Context manager closes the file even if parsing fails; the encoding is made
# explicit so the result does not depend on the platform default.
with open(os.path.join(FASTTEXT_DIR, 'crawl-300d-2M.vec'), encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only, so tokens that contain other whitespace
        # characters stay intact.
        values = line.rstrip().split(' ')
        # Skip anything that is not "<word> + EMBEDDING_DIM numbers", e.g. a
        # "<vocab size> <dim>" header line, which would otherwise be stored as
        # a one-element "vector".
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One row per tokenizer index plus an extra row 0; rows start as zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for token, row in word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = pretrained_vector

In [None]:
from keras.layers import Embedding, Input, Dense, Conv1D, MaxPooling1D, Flatten, GlobalMaxPool1D, Dropout, Convolution1D, Bidirectional, GRU, SpatialDropout1D, concatenate
from keras.models import Model, Sequential
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import roc_auc_score

In [None]:
def get_deep_num_cat_words_model():
    """Build and compile the three-branch Keras classifier.

    Branches:
      * categorical: hashed category ids -> 10-dim embedding -> spatial dropout
        -> flatten -> Dense(50) -> dropout -> Dense(100)
      * words: padded token ids -> frozen embedding initialised from
        ``embedding_matrix`` -> spatial dropout -> bidirectional GRU(50)
        -> Conv1D(100, kernel 3) -> global max pooling
      * numeric: Dense(100) -> dropout -> Dense(100) -> dropout

    The branch outputs are concatenated and passed through Dense(50), dropout
    and a single sigmoid unit.

    Reads the notebook-level names ``EMBEDDING_DIM``, ``cat_features_hash``,
    ``num_features``, ``max_size``, ``word_index`` and ``embedding_matrix``.

    Returns:
        A compiled ``Model`` whose inputs are ordered
        ``[categorical, numeric, words]``, using Adam (lr 0.0005, decay 1e-6),
        binary cross-entropy loss and the accuracy metric.
    """
    # NOTE(review): EMBEDDING_DIM is used here as the word-sequence length,
    # matching the maxlen passed to pad_sequences elsewhere in the notebook.
    input_words = Input((EMBEDDING_DIM, ))
    input_cat = Input((len(cat_features_hash), ))
    input_num = Input((len(num_features),))
    
    # Categorical branch: a single embedding table of max_size rows for all hashed columns.
    x_cat = Embedding(max_size, 10)(input_cat)
    x_cat = SpatialDropout1D(0.3)(x_cat)
    x_cat = Flatten()(x_cat)
    x_cat = Dense(50, activation='relu')(x_cat)
    x_cat = Dropout(0.25)(x_cat)
    x_cat = Dense(100, activation="relu")(x_cat)
    
    # Word branch: the pre-trained vectors are loaded as weights and kept frozen.
    x_words = Embedding(len(word_index) + 1, EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)(input_words)
    x_words = SpatialDropout1D(0.3)(x_words)
    x_words = Bidirectional(GRU(50, return_sequences=True))(x_words)
    x_words = Convolution1D(100, 3, activation="relu")(x_words)
    x_words = GlobalMaxPool1D()(x_words)
    
    # Numeric branch.
    x_num = Dense(100, activation='relu')(input_num)
    x_num = Dropout(0.3)(x_num)
    x_num = Dense(100, activation='relu')(x_num)
    x_num = Dropout(0.3)(x_num)
    
    # Merge the three branches and classify.
    x = concatenate([x_cat, x_num, x_words])
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.25)(x)
    predictions = Dense(1, activation="sigmoid")(x)
    # The input order here is the order callers must use in fit() / predict().
    model = Model(inputs=[input_cat, input_num, input_words], outputs=predictions)
    model.compile(optimizer=optimizers.Adam(0.0005, decay=1e-6),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    #model.summary()
    return model

Instantiate the model and use ModelCheckpoint to save the best weights.

In [None]:
deep_model = get_deep_num_cat_words_model()

# Weights (only) are written to this file whenever val_loss improves.
deep_model_weight_filepath = "deep_num_cat_words_weights.best.hdf5"
deep_model_checkpoint = ModelCheckpoint(
    deep_model_weight_filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# Stop training once val_loss has not improved for 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

deep_model_callbacks_list = [deep_model_checkpoint, early_stopping]

Train the model

In [None]:
# Keras carves the validation_split slice from the END of the arrays, before
# any shuffling, and the oversampler returns the original rows followed by the
# synthetic minority rows. Without shuffling first, the monitored validation
# set would consist of synthetic minority-class rows only, making val_loss
# (and therefore checkpointing / early stopping) unrepresentative.
shuffle_order = np.random.RandomState(42).permutation(len(y_train_cat_smote))

history = deep_model.fit(
    [
        X_train_cat_smote[shuffle_order],
        X_train_num_smote[shuffle_order],
        X_train_word_smote[shuffle_order],
    ],
    # np.asarray: positional indexing regardless of whether the labels are an
    # ndarray or a pandas Series.
    np.asarray(y_train_cat_smote)[shuffle_order],
    validation_split=0.1,
    epochs=4,
    batch_size=256,
    callbacks=deep_model_callbacks_list,
)

Load the best weights and make predictions on the validation set. Report the AUROC

In [None]:
# Restore the best checkpointed weights and score the held-out validation split.
deep_model.load_weights(deep_model_weight_filepath)
deep_model_pred_val = deep_model.predict([X_val_cat, X_val_num, X_val_word], batch_size=2000)

deep_model_val_AUROC = roc_auc_score(y_val_cat, deep_model_pred_val)
print("deep_model_val_AUROC AUROC: {}".format(deep_model_val_AUROC))

# Persist the score; the context manager closes the file even if the write fails.
AUROC_file = "deep_model_val_AUROC.txt"
with open(AUROC_file, "w") as f:
    f.write("{}".format(deep_model_val_AUROC))

Make predictions on the Kaggle public test set and save to file so that they can be submitted to the competition.

In [None]:
# Tokenise and pad the test texts with the same tokenizer and length as the training texts.
test_texts = test_data_raw['text'].tolist()
list_tokenized_test = t.texts_to_sequences(test_texts)
X_test_words = pad_sequences(list_tokenized_test, maxlen=EMBEDDING_DIM)

# Input order must match the model definition: categorical, numeric, words.
deep_model_pred_test = deep_model.predict([X_test_cat, X_test_num, X_test_words])
test_data_raw["project_is_approved"] = deep_model_pred_test

# The submission file holds only the proposal id and the predicted probability.
submission_columns = ['id', 'project_is_approved']
test_data_raw[submission_columns].to_csv("deep_model_fasttext300_fullSMOTE_submission.csv", index=False)