In [3]:
# reference https://www.kaggle.com/CVxTz/keras-baseline-feature-hashing-cnn

import os
import zlib

import numpy as np
import pandas as pd
# Folder holding the DonorsChoose CSV files. Set the DONORSCHOOSE_DIR
# environment variable to point somewhere else; the fallback is the original
# hard-coded location. The name `data_dir` is used instead of `dir` so the
# builtin `dir()` is not shadowed for the rest of the notebook.
data_dir = os.environ.get('DONORSCHOOSE_DIR', '/Users/xinwang/ai/dataset/kaggle/DonorsChoose/')

print(os.listdir(data_dir))

# os.path.join works whether or not `data_dir` ends with a path separator.
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
resources = pd.read_csv(os.path.join(data_dir, 'resources.csv'))

# Order the training rows by the submission timestamp column. The column is
# sorted as read from the CSV (presumably ISO-like strings, in which case the
# order is chronological; verify against the data).
train = train.sort_values(by='project_submitted_datetime')

print(train.columns.values)

['resources.csv', 'sample_submission.csv', 'test.csv', 'train.csv', 'train_1.csv', 'train_2.csv']


  interactivity=interactivity, compiler=compiler, result=result)


['id' 'teacher_id' 'teacher_prefix' 'school_state'
 'project_submitted_datetime' 'project_grade_category'
 'project_subject_categories' 'project_subject_subcategories'
 'project_title' 'project_essay_1' 'project_essay_2' 'project_essay_3'
 'project_essay_4' 'project_resource_summary'
 'teacher_number_of_previously_posted_projects' 'project_is_approved']


In [4]:
# Distinct teacher ids per split; going through a set removes duplicates, so
# the order of the resulting lists is arbitrary.
teachers_train = list(set(train['teacher_id'].values))
teachers_test = list(set(test['teacher_id'].values))

# Teacher ids that occur in both splits.
inter = set(teachers_train) & set(teachers_test)

overlap_counts = (len(teachers_train), len(teachers_test), len(inter))
print('num of teachers in train:%s, number of teachers in test:%s, overlap:%s' % overlap_counts)

num of teachers in train:104414, number of teachers in test:55508, overlap:27789


In [8]:
# Free-text columns; create_features() fills their missing values and joins
# them into a single `text` column.
char_cols = ['project_subject_categories','project_subject_subcategories','project_title',
             'project_essay_1','project_essay_2','project_essay_3','project_essay_4','project_resource_summary']

# Cost of each resource row: quantity times unit price.
resources['total_price'] = resources.quantity * resources.price

# Mean / sum / count of `total_price` per `id`.
# reset_index() turns the group key into an ordinary `id` column on a fresh
# default index. Copying the index into a column instead would leave `id` as
# both an index level and a column label, which makes `merge(on='id')`
# ambiguous (a warning in old pandas, a ValueError in later releases).
price_by_id = resources.groupby('id').total_price
mean_total_price = price_by_id.mean().reset_index()
sum_total_price = price_by_id.sum().reset_index()
count_total_price = price_by_id.count().reset_index()


def create_features(df, price_tables=None, text_cols=None):
    """Add price aggregates, date parts and a combined text column to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `id`, `project_submitted_datetime` and every column
        listed in `text_cols`.
    price_tables : sequence of pandas.DataFrame, optional
        Tables merged onto `df` one after another on the `id` column.
        Defaults to the module-level
        `(mean_total_price, sum_total_price, count_total_price)`.
        With that default, pandas' suffix handling names the three price
        columns `total_price_x`, `total_price_y` and `total_price`.
    text_cols : sequence of str, optional
        Non-empty list of text columns to fill and concatenate. Defaults to
        the module-level `char_cols`.

    Returns
    -------
    pandas.DataFrame
        A new frame (the merges copy the data) with the merged price columns
        plus `year`, `month` and `text`.

    Notes
    -----
    The merges use pandas' default inner join, so rows of `df` whose `id` is
    missing from any price table are dropped.
    """
    if price_tables is None:
        price_tables = (mean_total_price, sum_total_price, count_total_price)
    if text_cols is None:
        text_cols = char_cols
    text_cols = list(text_cols)

    for table in price_tables:
        df = pd.merge(df, table, on='id')

    # Split the timestamp once and reuse the pieces; the first two
    # '-'-separated fields are taken as year and month.
    date_parts = df['project_submitted_datetime'].str.split('-')
    df['year'] = date_parts.str[0]
    df['month'] = date_parts.str[1]

    for col in text_cols:
        df[col] = df[col].fillna('NA')

    # Join the text columns row by row with plain Python strings; this avoids
    # the per-row Series construction of DataFrame.apply(axis=1).
    df['text'] = [' '.join(parts) for parts in zip(*(df[col] for col in text_cols))]
    return df


# Run both splits through the same feature pipeline and rebind the names to
# the enriched frames.
train, test = [create_features(split) for split in (train, test)]


Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version


In [10]:
# Display the current column names of `train` as a NumPy array.
train.columns.values

array(['id', 'teacher_id', 'teacher_prefix', 'school_state',
       'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects',
       'project_is_approved', 'total_price_x', 'total_price_y',
       'total_price', 'year', 'month', 'text'], dtype=object)

In [11]:
# Categorical columns that get mapped to integer bucket ids.
cate_features = ['teacher_prefix', 'school_state','year', 'month','project_grade_category',
                 'project_subject_categories', 'project_subject_subcategories']
# Names of the derived bucket-id columns, in the same order as `cate_features`.
cate_features_hash = [col+'_hash' for col in cate_features]

# Number of hash buckets; bucket ids fall in [0, max_size).
max_size = 15000

def feature_hash(df, max_size=max_size):
    """Add a `<col>_hash` integer column for every column in `cate_features`.

    Each value is converted with `str()`, UTF-8 encoded and hashed with
    CRC32, then reduced modulo `max_size`. CRC32 is used instead of the
    builtin `hash()` because `hash()` of a string is salted per interpreter
    process, so the bucket ids would change on every kernel restart and could
    not be reproduced or reused with a saved model. Missing values (NaN) are
    hashed through their string form `'nan'`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in `cate_features`. It is modified in
        place.
    max_size : int
        Number of buckets.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the `_hash` columns added.
    """
    def bucket(value):
        return zlib.crc32(str(value).encode('utf-8')) % max_size

    for col in cate_features:
        df[col+'_hash'] = df[col].apply(bucket)

    return df

# Add the hashed categorical columns to both splits (feature_hash also
# modifies the frames in place), then list the resulting training columns.
train, test = [feature_hash(split) for split in (train, test)]

print(train.columns.values)

['id' 'teacher_id' 'teacher_prefix' 'school_state'
 'project_submitted_datetime' 'project_grade_category'
 'project_subject_categories' 'project_subject_subcategories'
 'project_title' 'project_essay_1' 'project_essay_2' 'project_essay_3'
 'project_essay_4' 'project_resource_summary'
 'teacher_number_of_previously_posted_projects' 'project_is_approved'
 'total_price_x' 'total_price_y' 'total_price' 'year' 'month' 'text'
 'teacher_prefix_hash' 'school_state_hash' 'year_hash' 'month_hash'
 'project_grade_category_hash' 'project_subject_categories_hash'
 'project_subject_subcategories_hash']


In [12]:
from sklearn.preprocessing import StandardScaler
from keras.preprocessing import text, sequence

# Vocabulary cap handed to the Tokenizer (`num_words`).
max_features = 50000
# Length every token sequence is padded or truncated to.
maxlen = 300

numeric_features = ['teacher_number_of_previously_posted_projects','total_price_x', 'total_price_y','total_price']

# Fit the scaler on the training split only and reuse its statistics for the
# test split. Calling fit_transform on the test split would re-estimate the
# mean and variance there, so the two splits would be scaled differently and
# the model would see test features on another scale than it was trained on.
scaler = StandardScaler()
x_train_numeric = scaler.fit_transform(train[numeric_features])
x_test_numeric = scaler.transform(test[numeric_features])
print('processed numeric features')

# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy; `dtype=int` gives the same dtype on every NumPy version.
x_train_cate = np.array(train[cate_features_hash], dtype=int)
x_test_cate = np.array(test[cate_features_hash], dtype=int)
print('processed categorical features')

# The vocabulary is built from the text of both splits.
tokenizer = text.Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(train['text'].tolist() + test['text'].tolist())

list_tokenized_train = tokenizer.texts_to_sequences(train['text'].tolist())
list_tokenized_test = tokenizer.texts_to_sequences(test['text'].tolist())

x_train_words = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
x_test_words = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
print('processed text features')

# Training labels (named x_train_target for historical reasons; it is the target).
x_train_target = train.project_is_approved

print('x_train_words.shape', x_train_words.shape)

Using TensorFlow backend.


x_train_words.shape (182080, 300)


In [14]:
from keras.layers import Input, Dense, Embedding, Flatten, concatenate, Dropout, Convolution1D, GlobalMaxPool1D
from keras.models import Model
from keras import optimizers


def get_model():
    """Build and compile the three-input binary classifier.

    Inputs, in order: the hashed categorical ids (one per entry of
    `cate_features_hash`), the numeric features (one per entry of
    `numeric_features`) and the padded word-id sequence of length `maxlen`.
    Each input goes through its own branch; the branch outputs are
    concatenated and passed through a dense layer to a single sigmoid unit.

    Returns
    -------
    keras.models.Model
        Model compiled with Adam, binary cross-entropy loss and the accuracy
        metric.
    """
    cate_in = Input(shape=(len(cate_features_hash),))
    numeric_in = Input(shape=(len(numeric_features),))
    words_in = Input(shape=(maxlen,))

    # Categorical branch: one embedding table shared by all hashed columns.
    cate_branch = Embedding(input_dim=max_size, output_dim=10)(cate_in)
    cate_branch = Flatten()(cate_branch)
    cate_branch = Dropout(0.2)(cate_branch)
    cate_branch = Dense(100, activation='relu')(cate_branch)

    # Numeric branch.
    numeric_branch = Dense(100, activation='relu')(numeric_in)
    numeric_branch = Dropout(0.2)(numeric_branch)

    # Text branch: word embedding, width-3 convolution, max over time.
    words_branch = Embedding(input_dim=max_features, output_dim=100)(words_in)
    words_branch = Convolution1D(100, 3, activation='relu')(words_branch)
    words_branch = GlobalMaxPool1D()(words_branch)
    words_branch = Dropout(0.2)(words_branch)

    # Merge the branches and classify.
    merged = concatenate([cate_branch, numeric_branch, words_branch])
    merged = Dense(100, activation='relu')(merged)
    merged = Dropout(0.2)(merged)

    predictions = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[cate_in, numeric_in, words_in], outputs=predictions)
    adam = optimizers.Adam(0.001, decay=1e-6)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model
    
model = get_model()

# Arrays are passed in the order of the model's inputs: categorical ids,
# numeric features, word sequences.
train_inputs = [x_train_cate, x_train_numeric, x_train_words]
test_inputs = [x_test_cate, x_test_numeric, x_test_words]

model.fit(train_inputs, x_train_target, validation_split=0.1, epochs=5, batch_size=1024)
predict_test = model.predict(test_inputs)

# Attach the predictions to the test frame and write the id/prediction pairs.
test['project_is_approved'] = predict_test
submission = test[['id','project_is_approved']]
submission.to_csv('baseline_keras_nn.csv', index=False)


Train on 163872 samples, validate on 18208 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
