In [102]:
## reference: https://colab.research.google.com/drive/1QhSnbh-WJVGZjQJF8u974msOL_vAgMeS#scrollTo=PlAGuj5kuZm9
## https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


dir='/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file='train_1.csv'
eval_file='train_2.csv'
eval_file='train_2.csv'
test_file='test.csv'
resource_file='resources.csv'
SEED=1000
positive_sample_size=30000

train_df = pd.read_csv(dir + train_file)
eval_df = pd.read_csv(dir + eval_file)
test_df = pd.read_csv(dir + test_file)
resource_df = pd.read_csv(dir + resource_file)
label = LabelEncoder()
low_memory=False


def sampleData():
    train_label_1_df = train_df[train_df['project_is_approved']==1].sample(n=positive_sample_size,
                                                                           random_state=SEED)
    train_label_0_df = train_df[train_df['project_is_approved']==0]

    train_data = pd.concat([train_label_1_df,train_label_0_df])
    train_data = shuffle(train_data)
    
    return train_data

train_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)
eval_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)
test_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)

train_data = train_df
eval_data = eval_df
test_data = test_df 


def transfer_posted_project_buckets(x):
    if x == 0:
        return 'L'
    elif x ==1:
        return 'M'
    else:
        return 'H'

all_data = (train_data, eval_data, test_data)
for dataset in all_data:
    dataset['posted_projects_bins'] = pd.cut(dataset['teacher_number_of_previously_posted_projects'],
                                                [-1, 7, 50, 1000])
    dataset['posted_projects_buckets'] = label.fit_transform(dataset['posted_projects_bins'])
    dataset['posted_projects_buckets'] = dataset['posted_projects_buckets'].apply(transfer_posted_project_buckets)
    dataset.info()

target = 'project_is_approved'
model_dir = tempfile.mkdtemp()
vocab_size=10000
embedding_size=50
sentence_size=150


def text_2_vec(feature_name, data_set):
    texts = data_set[feature_name].values
    
    tokenizer = Tokenizer(vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence_data = tokenizer.texts_to_sequences(texts)
    
    data = sequence.pad_sequences(sequence_data, 
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
    print("text_2_vec data shape:", data.shape)
    
    return data

title_train = text_2_vec('project_title', train_data)
title_eval = text_2_vec('project_title', eval_data)
title_test = text_2_vec('project_title', test_data)

essay_1_train = text_2_vec('project_essay_1', train_data)
essay_1_eval = text_2_vec('project_essay_1', eval_data)
essay_1_test = text_2_vec('project_essay_1', test_data)

essay_2_train = text_2_vec('project_essay_2', train_data)
essay_2_eval = text_2_vec('project_essay_2', eval_data)
essay_2_test = text_2_vec('project_essay_2', test_data)

resource_summary_train = text_2_vec('project_resource_summary', train_data)
resource_summary_eval = text_2_vec('project_resource_summary', eval_data)
resource_summary_test = text_2_vec('project_resource_summary', test_data)


1.9.0


  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29999 entries, 0 to 29998
Data columns (total 18 columns):
id                                              29999 non-null object
teacher_id                                      29999 non-null object
teacher_prefix                                  29999 non-null object
school_state                                    29999 non-null object
project_submitted_datetime                      29999 non-null object
project_grade_category                          29999 non-null object
project_subject_categories                      29999 non-null object
project_subject_subcategories                   29999 non-null object
project_title                                   29999 non-null object
project_essay_1                                 29999 non-null object
project_essay_2                                 29999 non-null object
project_essay_3                                 994 non-null object
project_essay_4                                 994 n

In [106]:
feature_columns = []

FEATURES = ['teacher_id','teacher_prefix','school_state','project_grade_category',
            'project_subject_categories','project_subject_subcategories',
            'posted_projects_buckets'
           ]

teacher_id = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_hash_bucket('teacher_id', 1000))
feature_columns.append(teacher_id)

teacher_prefix = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
    "teacher_prefix", [
        "Mrs.","Ms.","Mr.","Teacher","Dr."
    ]))
feature_columns.append(teacher_prefix)

school_state = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(
    "school_state",[
        "AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL",
        "IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE",
        "NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN",
        "TX","UT","VA","VT","WA","WI","WV","WY"
    ]))
feature_columns.append(school_state)

# TODO project_submitted_datetime

project_grade_category = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(
    "project_grade_category",["Grades 3-5","Grades 6-8","Grades 9-12","Grades PreK-2"]))
feature_columns.append(project_grade_category)

project_subject_categories = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(
    "project_subject_categories",[
       "Applied Learning","Health & Sports","History & Civics","Literacy & Language",
        "Math & Science","Music & The Arts","Special Needs","Warmth"]))
feature_columns.append(project_subject_categories)


project_subject_subcategories = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(
    "project_subject_subcategories",[
        "Applied Learning","Care & Hunger","Health & Sports","History & Civics",
        "Literacy & Language","Math & Science","Music & The Arts",
        "Warmth","Applied Sciences","Character Education","Civics & Government",
        "College & Career Prep","Community Service","ESL","Early Development",
        "Economics","Environmental Science","Extracurricular","Financial Literacy",
        "Foreign Languages","Gym & Fitness","Health & Life Science","Health & Wellness",
        "History & Geography","Literacy","Literature & Writing","Mathematics","Music",
        "Nutrition Education","Other","Parent Involvement","Performing Arts",
        "Social Sciences","Special Needs","Team Sports","Visual Arts"]))
feature_columns.append(project_subject_subcategories)

posted_projects_buckets = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list('posted_projects_buckets',
                                                              ['L','M','H']))
feature_columns.append(posted_projects_buckets)



###############################################################################
project_title = tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_identity('project_title', vocab_size),
                                                  dimension=embedding_size)
feature_columns.append(project_title)

project_essay_1 = tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_identity('project_essay_1', vocab_size),
                                                     dimension=embedding_size)
# feature_columns.append(project_essay_1)

project_essay_2 = tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_identity('project_essay_2', vocab_size),
                                                    dimension=embedding_size)
# feature_columns.append(project_essay_2)

project_resource_summary = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_identity('project_resource_summary',vocab_size),
    dimension=embedding_size)
feature_columns.append(project_resource_summary)
#################################################################################


print('feature_columns len', len(feature_columns))
print(feature_columns)

feature_columns len 9
[_IndicatorColumn(categorical_column=_HashedCategoricalColumn(key='teacher_id', hash_bucket_size=1000, dtype=tf.string)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='teacher_prefix', vocabulary_list=('Mrs.', 'Ms.', 'Mr.', 'Teacher', 'Dr.'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='school_state', vocabulary_list=('AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='project_grade_category', vocabulary_list=('Grades 3-5', 'Grades 6-8', 'Grades 9-12', 'Grades PreK-2'), dtype

In [107]:
train_data_features = train_data[FEATURES].values
eval_data_features = eval_data[FEATURES].values
test_data_features = test_data[FEATURES].values

classifier = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                           model_dir=os.path.join(model_dir, 'bow_sparse'))

def parser(essay_1, essay_2, resource_summary, title, data_set, y):
    features = {k:data_set[:, FEATURES.index(k)] for k in FEATURES}

#     features["project_essay_1"] = essay_1
#     features["project_essay_2"] = essay_2
    features["project_resource_summary"] = resource_summary
    features["project_title"] = title
    
    return features, y

def train_input_fn():
    y_train = train_data[target].values
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_train, essay_2_train,
                                                  resource_summary_train, title_train,
                                                  train_data_features,
                                                  y_train))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()
    
def eval_input_fn():
    y_eval = eval_data[target]
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_eval,essay_2_eval,
                                                  resource_summary_eval, title_eval,
                                                  eval_data_features,
                                                  y_eval))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def test_input_fn():
    y_test = test_data[target]
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_test,essay_2_test,
                                                  resource_summary_test, title_test,
                                                  test_data_features,
                                                  y_test))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()


all_classifiers = {}
def train_and_evaluate(classifier):
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps = 20000)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    
    predictions = np.array([p['logistic'][0] for p in classifier.predict(
        input_fn=eval_input_fn)])
    
    tf.reset_default_graph()
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, 
                              labels = eval_data[target].astype(bool),
                              num_thresholds = 21)
    
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'),
                                      sess.graph)
        writer.add_summary(sess.run(pr), global_step = 0)
        writer.close()


train_and_evaluate(classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpehpys581/bow_sparse', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x135373da0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpehpys581/bow_sparse/

INFO:tensorflow:loss = 28.137901, step = 27001 (0.664 sec)
INFO:tensorflow:global_step/sec: 200.952
INFO:tensorflow:loss = 36.18256, step = 27101 (0.498 sec)
INFO:tensorflow:global_step/sec: 186.474
INFO:tensorflow:loss = 39.35386, step = 27201 (0.536 sec)
INFO:tensorflow:global_step/sec: 220.946
INFO:tensorflow:loss = 38.632267, step = 27301 (0.453 sec)
INFO:tensorflow:global_step/sec: 218.944
INFO:tensorflow:loss = 45.550964, step = 27401 (0.457 sec)
INFO:tensorflow:global_step/sec: 226.817
INFO:tensorflow:loss = 44.471027, step = 27501 (0.441 sec)
INFO:tensorflow:global_step/sec: 222.461
INFO:tensorflow:loss = 44.091484, step = 27601 (0.450 sec)
INFO:tensorflow:global_step/sec: 215.947
INFO:tensorflow:loss = 38.94671, step = 27701 (0.463 sec)
INFO:tensorflow:global_step/sec: 225.217
INFO:tensorflow:loss = 41.572014, step = 27801 (0.444 sec)
INFO:tensorflow:global_step/sec: 173.819
INFO:tensorflow:loss = 41.510536, step = 27901 (0.577 sec)
INFO:tensorflow:global_step/sec: 199.967
INF

INFO:tensorflow:global_step/sec: 217.017
INFO:tensorflow:loss = 36.195637, step = 35301 (0.461 sec)
INFO:tensorflow:global_step/sec: 192.679
INFO:tensorflow:loss = 32.445057, step = 35401 (0.519 sec)
INFO:tensorflow:global_step/sec: 221.972
INFO:tensorflow:loss = 30.704779, step = 35501 (0.450 sec)
INFO:tensorflow:global_step/sec: 210.797
INFO:tensorflow:loss = 27.934298, step = 35601 (0.475 sec)
INFO:tensorflow:global_step/sec: 223.306
INFO:tensorflow:loss = 38.960335, step = 35701 (0.448 sec)
INFO:tensorflow:global_step/sec: 217.589
INFO:tensorflow:loss = 37.18516, step = 35801 (0.459 sec)
INFO:tensorflow:global_step/sec: 219.19
INFO:tensorflow:loss = 31.476276, step = 35901 (0.457 sec)
INFO:tensorflow:global_step/sec: 211.175
INFO:tensorflow:loss = 26.60847, step = 36001 (0.473 sec)
INFO:tensorflow:global_step/sec: 218.789
INFO:tensorflow:loss = 30.86615, step = 36101 (0.457 sec)
INFO:tensorflow:global_step/sec: 218.97
INFO:tensorflow:loss = 39.082882, step = 36201 (0.457 sec)
INFO:

In [207]:
def predict_input_fn():
    length = len(test_data_features[:, 1])
    print('length',length)
    y_test = np.empty([length,1])
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_test,essay_2_test,
                                                  resource_summary_test, test_data_features,
                                                  y_test))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()


predictions = np.array([p['logistic'][0] for p in classifier.predict(
    input_fn=predict_input_fn)])

print(predictions[:10])

predict_result = pd.DataFrame({
    "id":test_data['id'],
    "project_is_approved":predictions
})
predict_result.to_csv('prince_baseline_submission.csv', index=False)

length 78035
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpiubdbhgs/bow_sparse/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[0.9698241  0.860133   0.99670154 0.9966903  0.9979532  0.99721354
 0.6550113  0.994561   0.9991621  0.18014787]
