In [204]:
## reference: https://colab.research.google.com/drive/1QhSnbh-WJVGZjQJF8u974msOL_vAgMeS#scrollTo=PlAGuj5kuZm9
## https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


dir='/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file='train_1.csv'
eval_file='train_2.csv'
eval_file='train_2.csv'
test_file='test.csv'
resource_file='resources.csv'
SEED=1000
positive_sample_size=30000

train_df = pd.read_csv(dir + train_file)
eval_df = pd.read_csv(dir + eval_file)
test_df = pd.read_csv(dir + test_file)
resource_df = pd.read_csv(dir + resource_file)
label = LabelEncoder()
low_memory=False


def sampleData():
    train_label_1_df = train_df[train_df['project_is_approved']==1].sample(n=positive_sample_size,
                                                                           random_state=SEED)
    train_label_0_df = train_df[train_df['project_is_approved']==0]

    train_data = pd.concat([train_label_1_df,train_label_0_df])
    train_data = shuffle(train_data)
    
    return train_data

train_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)
eval_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)
test_df.fillna(value={"teacher_prefix":'Mr.'}, inplace=True)

train_data = train_df
eval_data = eval_df
test_data = test_df 

train_df.info()

1.9.0


  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29999 entries, 0 to 29998
Data columns (total 16 columns):
id                                              29999 non-null object
teacher_id                                      29999 non-null object
teacher_prefix                                  29999 non-null object
school_state                                    29999 non-null object
project_submitted_datetime                      29999 non-null object
project_grade_category                          29999 non-null object
project_subject_categories                      29999 non-null object
project_subject_subcategories                   29999 non-null object
project_title                                   29999 non-null object
project_essay_1                                 29999 non-null object
project_essay_2                                 29999 non-null object
project_essay_3                                 994 non-null object
project_essay_4                                 994 n

In [205]:
feature_columns = []

vocab_size=30000
embedding_size=50
sentence_size=200

FEATURES = ['teacher_id','teacher_prefix',
            'school_state','project_grade_category',
            'project_subject_categories','project_subject_subcategories',
            'project_title',
#           'teacher_number_of_previously_posted_projects'
           ]
teacher_id = tf.feature_column.categorical_column_with_hash_bucket('teacher_id', 1000)
feature_columns.append(teacher_id)

teacher_prefix = tf.feature_column.categorical_column_with_vocabulary_list(
    "teacher_prefix", [
        "Mrs.","Ms.","Mr.","Teacher","Dr."
    ])
feature_columns.append(teacher_prefix)

school_state = tf.feature_column.categorical_column_with_vocabulary_list(
    "school_state",[
        "AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL",
        "IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE",
        "NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN",
        "TX","UT","VA","VT","WA","WI","WV","WY"
    ])
feature_columns.append(school_state)

# todo project_submitted_datetime

project_grade_category = tf.feature_column.categorical_column_with_vocabulary_list(
    "project_grade_category",["Grades 3-5","Grades 6-8","Grades 9-12","Grades PreK-2"])
feature_columns.append(project_grade_category)

project_subject_categories = tf.feature_column.categorical_column_with_vocabulary_list(
    "project_subject_categories",[
       "Applied Learning","Health & Sports","History & Civics","Literacy & Language",
        "Math & Science","Music & The Arts","Special Needs","Warmth"])
feature_columns.append(project_subject_categories)


project_subject_subcategories = tf.feature_column.categorical_column_with_vocabulary_list(
    "project_subject_subcategories",[
        "Applied Learning","Care & Hunger","Health & Sports","History & Civics",
        "Literacy & Language","Math & Science","Music & The Arts",
        "Warmth","Applied Sciences","Character Education","Civics & Government",
        "College & Career Prep","Community Service","ESL","Early Development",
        "Economics","Environmental Science","Extracurricular","Financial Literacy",
        "Foreign Languages","Gym & Fitness","Health & Life Science","Health & Wellness",
        "History & Geography","Literacy","Literature & Writing","Mathematics","Music",
        "Nutrition Education","Other","Parent Involvement","Performing Arts",
        "Social Sciences","Special Needs","Team Sports","Visual Arts"])
feature_columns.append(project_subject_subcategories)

project_title = tf.feature_column.categorical_column_with_hash_bucket(
    'project_title', 5000)
feature_columns.append(project_title)

project_essay_1 = tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_identity('project_essay_1', vocab_size),
                                                     dimension=embedding_size)
feature_columns.append(project_essay_1)

project_essay_2 = tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_identity('project_essay_2', vocab_size),
                                                    dimension=embedding_size)
feature_columns.append(project_essay_2)

project_resource_summary = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_identity('project_resource_summary',vocab_size),
    dimension=embedding_size)
feature_columns.append(project_resource_summary)

teacher_number_of_previously_posted_projects = tf.feature_column.numeric_column('teacher_number_of_previously_posted_projects')
# feature_columns.append(teacher_number_of_previously_posted_projects)


target = 'project_is_approved'
model_dir = tempfile.mkdtemp()


def text_2_vec(feature_name, data_set):
    texts = data_set[feature_name].values
    
    tokenizer = Tokenizer(vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence_data = tokenizer.texts_to_sequences(texts)
    
    data = sequence.pad_sequences(sequence_data, 
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
    print("text_2_vec data shape:", data.shape)
    
    return data

essay_1_train = text_2_vec('project_essay_1', train_data)
essay_1_eval = text_2_vec('project_essay_1', eval_data)
essay_1_test = text_2_vec('project_essay_1', test_data)

essay_2_train = text_2_vec('project_essay_2', train_data)
essay_2_eval = text_2_vec('project_essay_2', eval_data)
essay_2_test = text_2_vec('project_essay_2', test_data)

resource_summary_train = text_2_vec('project_resource_summary', train_data)
resource_summary_eval = text_2_vec('project_resource_summary', eval_data)
resource_summary_test = text_2_vec('project_resource_summary', test_data)

text_2_vec data shape: (29999, 200)
text_2_vec data shape: (30000, 200)
text_2_vec data shape: (78035, 200)
text_2_vec data shape: (29999, 200)
text_2_vec data shape: (30000, 200)
text_2_vec data shape: (78035, 200)
text_2_vec data shape: (29999, 200)
text_2_vec data shape: (30000, 200)
text_2_vec data shape: (78035, 200)


In [206]:
train_data_features = train_data[FEATURES].values
eval_data_features = eval_data[FEATURES].values
test_data_features = test_data[FEATURES].values

def parser(essay_1, essay_2, resource_summary, data_set, y):
    features = {k:data_set[:, FEATURES.index(k)] for k in FEATURES}

    features["project_essay_1"] = essay_1
    features["project_essay_2"] = essay_2
    features["project_resource_summary"] = resource_summary
    
    return features, y

def train_input_fn():
    y_train = train_data[target]
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_train, essay_2_train,
                                                  resource_summary_train, train_data_features,
                                                  y_train))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()
    
def eval_input_fn():
    y_eval = eval_data[target]
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_eval,essay_2_eval,
                                                  resource_summary_eval, eval_data_features,
                                                  y_eval))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def test_input_fn():
    y_test = test_data[target]
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_test,essay_2_test,
                                                  resource_summary_test, test_data_features,
                                                  y_test))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()


all_classifiers = {}
def train_and_evaluate(classifier):
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps = 10000)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    
    predictions = np.array([p['logistic'][0] for p in classifier.predict(
        input_fn=eval_input_fn)])
    
    tf.reset_default_graph()
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, 
                              labels = eval_data[target].astype(bool),
                              num_thresholds = 21)
    
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'),
                                      sess.graph)
        writer.add_summary(sess.run(pr), global_step = 0)
        writer.close()


classifier = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                           model_dir=os.path.join(model_dir, 'bow_sparse'))

train_and_evaluate(classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpiubdbhgs/bow_sparse', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x132e5a2e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoi

INFO:tensorflow:global_step/sec: 158.094
INFO:tensorflow:loss = 21.553125, step = 7301 (0.633 sec)
INFO:tensorflow:global_step/sec: 157.653
INFO:tensorflow:loss = 16.124584, step = 7401 (0.634 sec)
INFO:tensorflow:global_step/sec: 157.35
INFO:tensorflow:loss = 23.005424, step = 7501 (0.635 sec)
INFO:tensorflow:global_step/sec: 157.608
INFO:tensorflow:loss = 16.973589, step = 7601 (0.635 sec)
INFO:tensorflow:global_step/sec: 155.93
INFO:tensorflow:loss = 17.339565, step = 7701 (0.642 sec)
INFO:tensorflow:global_step/sec: 163.339
INFO:tensorflow:loss = 22.24163, step = 7801 (0.612 sec)
INFO:tensorflow:global_step/sec: 160.915
INFO:tensorflow:loss = 19.25206, step = 7901 (0.621 sec)
INFO:tensorflow:global_step/sec: 155.987
INFO:tensorflow:loss = 18.71869, step = 8001 (0.641 sec)
INFO:tensorflow:global_step/sec: 156.433
INFO:tensorflow:loss = 22.595303, step = 8101 (0.639 sec)
INFO:tensorflow:global_step/sec: 155.706
INFO:tensorflow:loss = 18.179087, step = 8201 (0.643 sec)
INFO:tensorflow

In [207]:
def predict_input_fn():
    length = len(test_data_features[:, 1])
    print('length',length)
    y_test = np.empty([length,1])
    
    dataset = tf.data.Dataset.from_tensor_slices((essay_1_test,essay_2_test,
                                                  resource_summary_test, test_data_features,
                                                  y_test))
    dataset = dataset.batch(100)
    
    dataset = dataset.map(parser)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()


predictions = np.array([p['logistic'][0] for p in classifier.predict(
    input_fn=predict_input_fn)])

print(predictions[:10])

predict_result = pd.DataFrame({
    "id":test_data['id'],
    "project_is_approved":predictions
})
predict_result.to_csv('prince_baseline_submission.csv', index=False)

length 78035
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpiubdbhgs/bow_sparse/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[0.9698241  0.860133   0.99670154 0.9966903  0.9979532  0.99721354
 0.6550113  0.994561   0.9991621  0.18014787]
