In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
# Report the TensorFlow version this cell is running against.
print(tf.__version__)


# Machine-specific locations of the DonorsChoose CSV splits.
dir = '/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file, eval_file = dir + 'train_1.csv', dir + 'train_2.csv'
# NOTE: model_dir is defined here but is not referenced by the code visible
# in this cell.
model_dir = "/Users/xinwang/Downloads/models_temp/"

# Single encoder instance shared (and re-fitted) for every categorical
# column inside input_fn.
label = LabelEncoder()

# Column names applied to the CSV files; the header row of each file is
# skipped when reading and replaced by this list.
CSV_COLUMNS = [
    'id',
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects',
    'project_is_approved',
]
# Name of the binary label column.
target = 'project_is_approved'

# Text settings for `project_resource_summary`:
# (vocabulary size, padded sequence length, embedding size).
summary_vocab_size, summary_sentence_size, summary_embedding_size = 10000, 50, 20

# Text settings for the essay columns (not used by the code visible in
# this cell).
essay_vocab_size, essay_sentence_size, essay_embedding_size = 30000, 200, 50

# --- Vocabularies for the closed-set categorical features -----------------
_TEACHER_PREFIX_VOCAB = ["Mrs.", "Ms.", "Mr.", "Teacher", "Dr."]

_SCHOOL_STATE_VOCAB = [
    "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
    "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA",
    "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE",
    "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV",
    "WY",
]

_GRADE_CATEGORY_VOCAB = [
    "Grades 3-5", "Grades 6-8", "Grades 9-12", "Grades PreK-2",
]

_SUBJECT_CATEGORY_VOCAB = [
    "Applied Learning",
    "Health & Sports",
    "History & Civics",
    "Literacy & Language",
    "Math & Science",
    "Music & The Arts",
    "Special Needs",
    "Warmth",
]

_SUBJECT_SUBCATEGORY_VOCAB = [
    "Applied Learning",
    "Care & Hunger",
    "Health & Sports",
    "History & Civics",
    "Literacy & Language",
    "Math & Science",
    "Music & The Arts",
    "Warmth",
    "Applied Sciences",
    "Character Education",
    "Civics & Government",
    "College & Career Prep",
    "Community Service",
    "ESL",
    "Early Development",
    "Economics",
    "Environmental Science",
    "Extracurricular",
    "Financial Literacy",
    "Foreign Languages",
    "Gym & Fitness",
    "Health & Life Science",
    "Health & Wellness",
    "History & Geography",
    "Literacy",
    "Literature & Writing",
    "Mathematics",
    "Music",
    "Nutrition Education",
    "Other",
    "Parent Involvement",
    "Performing Arts",
    "Social Sciences",
    "Special Needs",
    "Team Sports",
    "Visual Arts",
]

# --- Feature columns --------------------------------------------------------
# Each categorical feature is declared twice: once as a vocabulary-backed
# categorical column and once as a plain numeric column fed by the
# "<name>_bins" integer codes that input_fn produces.

# teacher_id is hashed into 1000 buckets instead of being enumerated.
teacher_id = tf.feature_column.categorical_column_with_hash_bucket(
    'teacher_id', hash_bucket_size=1000)

teacher_prefix = tf.feature_column.categorical_column_with_vocabulary_list(
    key="teacher_prefix", vocabulary_list=_TEACHER_PREFIX_VOCAB)
teacher_prefix_bins = tf.feature_column.numeric_column(
    'teacher_prefix_bins')

school_state = tf.feature_column.categorical_column_with_vocabulary_list(
    key="school_state", vocabulary_list=_SCHOOL_STATE_VOCAB)
school_state_bins = tf.feature_column.numeric_column(
    'school_state_bins')

project_grade_category = tf.feature_column.categorical_column_with_vocabulary_list(
    key="project_grade_category", vocabulary_list=_GRADE_CATEGORY_VOCAB)
project_grade_category_bins = tf.feature_column.numeric_column(
    'project_grade_category_bins')

project_subject_categories = tf.feature_column.categorical_column_with_vocabulary_list(
    key="project_subject_categories", vocabulary_list=_SUBJECT_CATEGORY_VOCAB)
project_subject_categories_bins = tf.feature_column.numeric_column(
    'project_subject_categories_bins')

project_subject_subcategories = tf.feature_column.categorical_column_with_vocabulary_list(
    key="project_subject_subcategories",
    vocabulary_list=_SUBJECT_SUBCATEGORY_VOCAB)
project_subject_subcategories_bins = tf.feature_column.numeric_column(
    'project_subject_subcategories_bins')

# Raw count of the teacher's earlier projects, used as-is.
posted_projects = tf.feature_column.numeric_column(
    'teacher_number_of_previously_posted_projects')

# Token ids of the resource summary; ids must lie in [0, summary_vocab_size).
summary_vec = tf.feature_column.categorical_column_with_identity(
    'project_resource_summary_vec', num_buckets=summary_vocab_size)

####################Text#########################

# No un-crossed base columns are fed to the linear part of the model.
basic_columns = []

# Pairs of raw feature keys that are crossed for the linear part; every
# cross is hashed into 400 buckets.
_CROSSED_KEY_PAIRS = [
    ('teacher_prefix', 'school_state'),
    ('school_state', 'project_grade_category'),
    ('project_grade_category', 'project_subject_categories'),
    ('project_subject_categories', 'project_subject_subcategories'),
]
crossed_columns = [
    tf.feature_column.crossed_column(list(key_pair), hash_bucket_size=400)
    for key_pair in _CROSSED_KEY_PAIRS
]

# Inputs to the DNN part, in three groups: numeric columns, one-hot
# (indicator) wrappers around the categorical columns, and a 10-dimensional
# embedding of the resource-summary token ids.
_numeric_inputs = [
    teacher_prefix_bins,
    school_state_bins,
    project_grade_category_bins,
    project_subject_categories_bins,
    project_subject_subcategories_bins,
    posted_projects,
]
_indicator_inputs = [
    tf.feature_column.indicator_column(categorical)
    for categorical in (
        teacher_id,
        teacher_prefix,
        school_state,
        project_grade_category,
        project_subject_categories,
        project_subject_subcategories,
    )
]
_embedding_inputs = [
    tf.feature_column.embedding_column(summary_vec, dimension=10),
]
deep_columns = _numeric_inputs + _indicator_inputs + _embedding_inputs


def text_2_vec(feature_name, data_set, vocab_size, sentence_size,
               tokenizer=None, pad_value=0):
    """Turn a text column into a fixed-width matrix of integer token ids.

    Args:
        feature_name: Name of the text column in ``data_set``.
        data_set: pandas DataFrame holding the column.
        vocab_size: ``num_words`` passed to the Keras ``Tokenizer`` when a
            new tokenizer has to be created.
        sentence_size: Number of token ids kept per row; longer texts are
            truncated at the end, shorter ones are padded at the end.
        tokenizer: Optional, already fitted ``Tokenizer``. When omitted a new
            tokenizer is fitted on this column, which means ids produced by
            two separate calls (e.g. train vs. eval data) are NOT guaranteed
            to refer to the same words. Pass the training tokenizer here to
            keep them aligned.
        pad_value: Id used for padding. Defaults to 0, the index the Keras
            ``Tokenizer`` never assigns to a word.

    Returns:
        Integer array of shape ``(len(data_set), sentence_size)``.
    """
    # The original code referenced an undefined global ``pad_id``; padding is
    # now an explicit parameter. Missing values arrive from pandas as float
    # NaN, which the tokenizer cannot process as text, so they are mapped to
    # empty strings (which simply yield all-padding rows).
    texts = data_set[feature_name].fillna('').astype(str).values

    if tokenizer is None:
        tokenizer = Tokenizer(vocab_size)
        tokenizer.fit_on_texts(texts)
    sequence_data = tokenizer.texts_to_sequences(texts)

    data = sequence.pad_sequences(sequence_data,
                                  maxlen=sentence_size,
                                  truncating='post',
                                  padding='post',
                                  value=pad_value)
    print(feature_name + " text_2_vec data shape:", data.shape)

    return data

def build_input_features_dict(input_df):
    """Assemble the feature dictionary consumed by the estimator input_fn.

    Every listed DataFrame column is copied over as a NumPy array under its
    own name; the resource summary text is additionally converted to padded
    token ids and stored as ``project_resource_summary_vec``.

    Args:
        input_df: DataFrame that already contains the raw columns and their
            ``*_bins`` companions.

    Returns:
        dict mapping feature name to NumPy array.
    """
    # Categorical features come in pairs: the raw string column plus its
    # integer-coded "<name>_bins" twin.
    paired_features = (
        'teacher_prefix',
        'school_state',
        'project_grade_category',
        'project_subject_categories',
        'project_subject_subcategories',
    )

    column_names = ['teacher_id']
    for base_name in paired_features:
        column_names.extend([base_name, base_name + '_bins'])
    column_names.append('teacher_number_of_previously_posted_projects')

    features = {name: input_df[name].values for name in column_names}

    ############################  Text columns  ############################
    summary_ids = text_2_vec('project_resource_summary',
                             input_df,
                             summary_vocab_size,
                             summary_sentence_size)
    features['project_resource_summary_vec'] = summary_ids

    return features

def input_fn(file, num_epochs, shuffle):
    """Build an Estimator input function from one DonorsChoose CSV file.

    The file is read with the column names in ``CSV_COLUMNS`` (its own header
    row is skipped), rows without a ``teacher_prefix`` are dropped, and an
    integer-coded ``<column>_bins`` companion is added for each categorical
    column before the feature dictionary is assembled.

    Args:
        file: Path of the CSV file, opened through ``tf.gfile``.
        num_epochs: Number of passes over the data; ``None`` repeats forever.
        shuffle: Whether the returned input function shuffles the rows.

    Returns:
        The callable produced by ``tf.estimator.inputs.numpy_input_fn``
        (batches of 100 rows, 5 reader threads).
    """
    # Use a context manager so the file handle is closed as soon as pandas
    # has consumed it; the original left the handle open.
    with tf.gfile.Open(file) as csv_handle:
        input_df = pd.read_csv(
            csv_handle, names=CSV_COLUMNS,
            skipinitialspace=True, engine="python", skiprows=1)

    input_df.dropna(subset=["teacher_prefix"], inplace=True)

    # NOTE(review): the shared encoder is re-fitted on every column of every
    # file, so the integer codes of the train and eval files only agree when
    # both files contain the same set of values for a column - confirm.
    categorical_columns = (
        'teacher_prefix',
        'school_state',
        'project_grade_category',
        'project_subject_categories',
        'project_subject_subcategories',
    )
    for column in categorical_columns:
        input_df[column + '_bins'] = label.fit_transform(input_df[column])

    features = build_input_features_dict(input_df)
    labels = input_df[target].values

    return tf.estimator.inputs.numpy_input_fn(features,
                                              labels,
                                              batch_size=100,
                                              num_epochs=num_epochs,
                                              shuffle=shuffle,
                                              num_threads=5)

def buildClassifier():
    """Create the combined linear + DNN classifier used by this notebook.

    The linear part receives ``basic_columns`` followed by
    ``crossed_columns``; the DNN part receives ``deep_columns`` and has two
    hidden layers of 100 and 50 units. No ``model_dir`` is passed to the
    estimator.

    Returns:
        A ``tf.estimator.DNNLinearCombinedClassifier`` instance.
    """
    wide_columns = basic_columns + crossed_columns
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50],
    )
    
def train_and_evaluate():
    """Train the classifier for 20000 steps, then print evaluation metrics.

    Training reads ``train_file`` shuffled and repeated indefinitely;
    evaluation makes a single unshuffled pass over ``eval_file``. The metrics
    are printed one per line, sorted by name, below a 100-character rule.
    """
    estimator = buildClassifier()

    # The eval input is only built after training has finished, matching the
    # order in which the data files are read and logged.
    train_input = input_fn(train_file, num_epochs=None, shuffle=True)
    estimator.train(input_fn=train_input, steps=20000)

    eval_input = input_fn(eval_file, num_epochs=1, shuffle=False)
    metrics = estimator.evaluate(input_fn=eval_input, steps=None)

    print('-' * 100)
    for metric_name in sorted(metrics):
        metric_value = metrics[metric_name]
        print("%s: %s" % (metric_name, metric_value))
        
        
    
# Run training followed by evaluation when this cell is executed.
train_and_evaluate()

1.9.0
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpvlw3nuot', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13df1a2b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
project_resource_summary text_2_vec data shape: (59997, 50)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done

INFO:tensorflow:loss = 50.111202, step = 7101 (0.560 sec)
INFO:tensorflow:global_step/sec: 175.125
INFO:tensorflow:loss = 45.053425, step = 7201 (0.571 sec)
INFO:tensorflow:global_step/sec: 179.679
INFO:tensorflow:loss = 50.746544, step = 7301 (0.556 sec)
INFO:tensorflow:global_step/sec: 179.145
INFO:tensorflow:loss = 48.27339, step = 7401 (0.559 sec)
INFO:tensorflow:global_step/sec: 177.511
INFO:tensorflow:loss = 39.552845, step = 7501 (0.563 sec)
INFO:tensorflow:global_step/sec: 176.157
INFO:tensorflow:loss = 40.213913, step = 7601 (0.568 sec)
INFO:tensorflow:global_step/sec: 178.122
INFO:tensorflow:loss = 45.42902, step = 7701 (0.562 sec)
INFO:tensorflow:global_step/sec: 178.742
INFO:tensorflow:loss = 36.50221, step = 7801 (0.559 sec)
INFO:tensorflow:global_step/sec: 178.224
INFO:tensorflow:loss = 38.89965, step = 7901 (0.561 sec)
INFO:tensorflow:global_step/sec: 178.028
INFO:tensorflow:loss = 40.83607, step = 8001 (0.562 sec)
INFO:tensorflow:global_step/sec: 243.178
INFO:tensorflow

INFO:tensorflow:loss = 54.39798, step = 15401 (0.569 sec)
INFO:tensorflow:global_step/sec: 179.922
INFO:tensorflow:loss = 38.30051, step = 15501 (0.555 sec)
INFO:tensorflow:global_step/sec: 181.806
INFO:tensorflow:loss = 41.912685, step = 15601 (0.550 sec)
INFO:tensorflow:global_step/sec: 178.848
INFO:tensorflow:loss = 37.323605, step = 15701 (0.560 sec)
INFO:tensorflow:global_step/sec: 179.277
INFO:tensorflow:loss = 36.962093, step = 15801 (0.558 sec)
INFO:tensorflow:global_step/sec: 177.195
INFO:tensorflow:loss = 46.116932, step = 15901 (0.564 sec)
INFO:tensorflow:global_step/sec: 177.244
INFO:tensorflow:loss = 46.39279, step = 16001 (0.565 sec)
INFO:tensorflow:global_step/sec: 173.369
INFO:tensorflow:loss = 37.96465, step = 16101 (0.576 sec)
INFO:tensorflow:global_step/sec: 176.154
INFO:tensorflow:loss = 48.979214, step = 16201 (0.568 sec)
INFO:tensorflow:global_step/sec: 179.452
INFO:tensorflow:loss = 26.889063, step = 16301 (0.558 sec)
INFO:tensorflow:global_step/sec: 178.033
INFO