In [2]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
# Print the installed TensorFlow version so it appears at the top of the cell output.
print(tf.__version__)


# Filesystem locations used by this cell.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because other code may reference it.
dir = '/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'

# Training split, evaluation split and the per-project resource table.
train_file = os.path.join(dir, 'train_1.csv')
eval_file = os.path.join(dir, 'train_2.csv')
resource_file = os.path.join(dir, 'resources.csv')

# Directory reserved for model artifacts.
model_dir = "/Users/xinwang/Downloads/models_temp/"
# Shared LabelEncoder instance; input_fn re-fits it (fit_transform) for every
# categorical column it turns into an integer "*_bins" feature.
label = LabelEncoder()

# Column names applied positionally to the train/eval CSV files by input_fn.
CSV_COLUMNS = [
    'id',
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects',
    'project_is_approved',
]

# Column names applied positionally to the resources CSV file by input_fn.
RESOURCE_COLUMNS = [
    'id',
    'description',
    'quantity',
    'price',
]

# Name of the label column.
target = 'project_is_approved'

# Text-vectorisation settings for the resource summary:
# vocabulary size, padded sequence length, embedding width.
summary_vocab_size = 10000
summary_sentence_size = 50
summary_embedding_size = 30

# The same three settings for the project essays.
essay_vocab_size = 30000
essay_sentence_size = 200
essay_embedding_size = 50

# Token id used to pad short sequences (see text_2_vec).
pad_id = 0

# Short alias for the feature-column factory namespace used in this section.
_fc = tf.feature_column

# ---- Hashed string features ------------------------------------------------
teacher_id = _fc.categorical_column_with_hash_bucket(
    'teacher_id', hash_bucket_size=1000)
project_title = _fc.categorical_column_with_hash_bucket(
    'project_title', hash_bucket_size=5000)

# ---- Vocabulary-backed categorical features --------------------------------
# Each one is paired with a numeric "*_bins" column holding the label-encoded
# integer that input_fn computes for the same raw field.
teacher_prefix = _fc.categorical_column_with_vocabulary_list(
    key="teacher_prefix",
    vocabulary_list=["Mrs.", "Ms.", "Mr.", "Teacher", "Dr."])
teacher_prefix_bins = _fc.numeric_column('teacher_prefix_bins')

school_state = _fc.categorical_column_with_vocabulary_list(
    key="school_state",
    vocabulary_list=[
        "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
        "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA",
        "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE",
        "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI",
        "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV",
        "WY",
    ])
school_state_bins = _fc.numeric_column('school_state_bins')

project_grade_category = _fc.categorical_column_with_vocabulary_list(
    key="project_grade_category",
    vocabulary_list=["Grades 3-5", "Grades 6-8", "Grades 9-12", "Grades PreK-2"])
project_grade_category_bins = _fc.numeric_column('project_grade_category_bins')

project_subject_categories = _fc.categorical_column_with_vocabulary_list(
    key="project_subject_categories",
    vocabulary_list=[
        "Applied Learning",
        "Health & Sports",
        "History & Civics",
        "Literacy & Language",
        "Math & Science",
        "Music & The Arts",
        "Special Needs",
        "Warmth",
    ])
project_subject_categories_bins = _fc.numeric_column('project_subject_categories_bins')

project_subject_subcategories = _fc.categorical_column_with_vocabulary_list(
    key="project_subject_subcategories",
    vocabulary_list=[
        "Applied Learning",
        "Care & Hunger",
        "Health & Sports",
        "History & Civics",
        "Literacy & Language",
        "Math & Science",
        "Music & The Arts",
        "Warmth",
        "Applied Sciences",
        "Character Education",
        "Civics & Government",
        "College & Career Prep",
        "Community Service",
        "ESL",
        "Early Development",
        "Economics",
        "Environmental Science",
        "Extracurricular",
        "Financial Literacy",
        "Foreign Languages",
        "Gym & Fitness",
        "Health & Life Science",
        "Health & Wellness",
        "History & Geography",
        "Literacy",
        "Literature & Writing",
        "Mathematics",
        "Music",
        "Nutrition Education",
        "Other",
        "Parent Involvement",
        "Performing Arts",
        "Social Sciences",
        "Special Needs",
        "Team Sports",
        "Visual Arts",
    ])
project_subject_subcategories_bins = _fc.numeric_column('project_subject_subcategories_bins')

# ---- Plain numeric features ------------------------------------------------
posted_projects = _fc.numeric_column('teacher_number_of_previously_posted_projects')
posted_projects_bins = _fc.numeric_column('posted_projects_bins')

quantity = _fc.numeric_column('quantity')
price = _fc.numeric_column('price')

# ---- Text features ---------------------------------------------------------
# Word ids produced by text_2_vec are used directly as category ids, so the
# number of buckets equals the summary vocabulary size.
summary_vec = _fc.categorical_column_with_identity(
    'project_resource_summary_vec', summary_vocab_size)


# Linear ("wide") side of the model: no raw columns, only the crosses below.
basic_columns = []

# Pairwise feature crosses, each hashed into 1000 buckets.
crossed_columns = [
    tf.feature_column.crossed_column(list(key_pair), hash_bucket_size=1000)
    for key_pair in (
        ('teacher_prefix', 'school_state'),
        ('school_state', 'project_grade_category'),
        ('project_grade_category', 'project_subject_categories'),
        ('project_subject_categories', 'project_subject_subcategories'),
        ('project_subject_categories', 'posted_projects_bins'),
    )
]

# DNN ("deep") side of the model.
# 1) Numeric columns are fed as-is.
deep_columns = [
    teacher_prefix_bins,
    school_state_bins,
    project_grade_category_bins,
    project_subject_categories_bins,
    project_subject_subcategories_bins,
    posted_projects,
    quantity,
    price,
]
# 2) Categorical columns are wrapped as indicator columns.
deep_columns += [
    tf.feature_column.indicator_column(categorical)
    for categorical in (
        teacher_id,
        project_title,
        teacher_prefix,
        school_state,
        project_grade_category,
        project_subject_categories,
        project_subject_subcategories,
    )
]
# 3) The tokenised resource summary goes through a 10-dimensional embedding.
deep_columns.append(
    tf.feature_column.embedding_column(summary_vec, dimension=10))


# Tokenizers already fitted by text_2_vec, keyed by (feature_name, vocab_size).
# Reusing them keeps word ids identical across data sets (e.g. train vs. eval).
_fitted_tokenizers = {}


def text_2_vec(feature_name, data_set, vocab_size, sentence_size, tokenizer=None):
    """Turn a free-text column into a fixed-width matrix of word ids.

    The first call for a given ``(feature_name, vocab_size)`` fits a Keras
    ``Tokenizer`` on the supplied texts and remembers it. Later calls reuse
    that tokenizer instead of fitting a new one, so a word is mapped to the
    same id in every data set. Fitting a separate tokenizer per data set would
    give each set its own word-to-id mapping, so ids learned during training
    would not match the ids seen at evaluation time.

    Args:
        feature_name: Name of the text column in ``data_set``.
        data_set: DataFrame containing ``feature_name``.
        vocab_size: ``num_words`` limit handed to the ``Tokenizer``.
        sentence_size: Number of ids per row; longer texts are cut at the end
            and shorter ones are padded at the end with ``pad_id``.
        tokenizer: Optional already-fitted ``Tokenizer`` to use as-is. When
            given, nothing is fitted and the internal cache is not touched.

    Returns:
        Array with one row per row of ``data_set`` and ``sentence_size``
        columns of integer word ids.
    """
    # Missing cells would otherwise reach the tokenizer as float NaN; treat
    # them as empty text, which becomes a row made entirely of padding.
    texts = data_set[feature_name].fillna('').astype(str).values

    if tokenizer is None:
        cache_key = (feature_name, vocab_size)
        tokenizer = _fitted_tokenizers.get(cache_key)
        if tokenizer is None:
            tokenizer = Tokenizer(vocab_size)
            tokenizer.fit_on_texts(texts)
            _fitted_tokenizers[cache_key] = tokenizer

    sequence_data = tokenizer.texts_to_sequences(texts)

    data = sequence.pad_sequences(sequence_data,
                                  maxlen=sentence_size,
                                  truncating='post',
                                  padding='post',
                                  value=pad_id)
    print(feature_name + " text_2_vec data shape:", data.shape)

    return data

def build_input_features_dict(input_df):
    """Assemble the feature dictionary consumed by the estimator input function.

    Every pass-through column is copied from ``input_df`` as its underlying
    value array under the same key. The resource summary text is additionally
    tokenised by ``text_2_vec`` and stored under
    ``'project_resource_summary_vec'``.

    Args:
        input_df: DataFrame that already holds the raw columns, the derived
            ``*_bins`` columns and the merged ``quantity`` / ``price`` totals.

    Returns:
        dict mapping feature name to an array of per-row values.
    """
    passthrough_columns = (
        'teacher_id',
        'project_title',
        'teacher_prefix',
        'teacher_prefix_bins',
        'school_state',
        'school_state_bins',
        'project_grade_category',
        'project_grade_category_bins',
        'project_subject_categories',
        'project_subject_categories_bins',
        'project_subject_subcategories',
        'project_subject_subcategories_bins',
        'teacher_number_of_previously_posted_projects',
        'posted_projects_bins',
        'quantity',
        'price',
    )
    features = {column: input_df[column].values for column in passthrough_columns}

    # Text column: fixed-width word-id matrix for the resource summary.
    features['project_resource_summary_vec'] = text_2_vec(
        'project_resource_summary',
        input_df,
        summary_vocab_size,
        summary_sentence_size,
    )

    return features

def _read_csv(path, column_names):
    """Load one CSV into a DataFrame, replacing its header row with ``column_names``."""
    # The context manager closes the GFile handle as soon as pandas has read it.
    with tf.gfile.Open(path) as handle:
        return pd.read_csv(handle, names=column_names,
                           skipinitialspace=True, engine="python", skiprows=1)


def input_fn(file, num_epochs, shuffle):
    """Build an estimator input function from a project CSV plus the resource table.

    Steps:
      1. Read ``file`` and the module-level ``resource_file``.
      2. Drop rows without a teacher prefix.
      3. Add an integer ``*_bins`` column for each categorical field and for
         the bucketed count of previously posted projects.
      4. Attach each project's summed resource ``quantity`` and ``price``
         (inner join on ``id``, so projects without resources are dropped).
      5. Wrap features and labels with ``tf.estimator.inputs.numpy_input_fn``.

    Args:
        file: Path of the project CSV to read.
        num_epochs: Passed through to ``numpy_input_fn``.
        shuffle: Passed through to ``numpy_input_fn``.

    Returns:
        The callable produced by ``numpy_input_fn`` (batch size 100, 5 threads).
    """
    input_df = _read_csv(file, CSV_COLUMNS)
    resource_df = _read_csv(resource_file, RESOURCE_COLUMNS)

    input_df.dropna(subset=["teacher_prefix"], inplace=True)

    # NOTE(review): the encoder is re-fitted on every call, so the integer codes
    # only agree between files that contain the same set of category values.
    for column in ('teacher_prefix',
                   'school_state',
                   'project_grade_category',
                   'project_subject_categories',
                   'project_subject_subcategories'):
        input_df[column + '_bins'] = label.fit_transform(input_df[column])

    # Bucket the posting history into (-1, 7], (7, 50], (50, 1000] and encode
    # the resulting intervals as integers.
    input_df['posted_projects_periods'] = pd.cut(
        input_df['teacher_number_of_previously_posted_projects'],
        [-1, 7, 50, 1000])
    input_df['posted_projects_bins'] = label.fit_transform(input_df['posted_projects_periods'])

    # One aggregation pass yields both per-project totals; a single inner merge
    # then adds the 'quantity' and 'price' columns.
    resource_totals = (resource_df[['id', 'quantity', 'price']]
                       .groupby('id', as_index=False)
                       .sum())
    input_df = pd.merge(input_df, resource_totals, how='inner', on='id')

    features = build_input_features_dict(input_df)
    labels = input_df[target].values

    return tf.estimator.inputs.numpy_input_fn(features,
                                              labels,
                                              batch_size=100,
                                              num_epochs=num_epochs,
                                              shuffle=shuffle,
                                              num_threads=5)

def buildClassifier():
    """Create the wide-and-deep classifier.

    The linear part receives ``basic_columns + crossed_columns``; the DNN part
    receives ``deep_columns`` and has a single hidden layer of 100 units. No
    ``model_dir`` is passed to the estimator.
    """
    wide_columns = basic_columns + crossed_columns
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100],
    )
    
def train_and_evaluate():
    """Train the classifier for 20000 steps, evaluate it once, and print the metrics."""
    estimator = buildClassifier()

    # Training data repeats indefinitely (num_epochs=None); `steps` bounds the run.
    train_input = input_fn(train_file, num_epochs=None, shuffle=True)
    estimator.train(input_fn=train_input, steps=20000)

    # A single ordered pass over the evaluation file.
    eval_input = input_fn(eval_file, num_epochs=1, shuffle=False)
    metrics = estimator.evaluate(input_fn=eval_input, steps=None)

    print('-'*100)
    for metric_name in sorted(metrics):
        print("%s: %s" % (metric_name, metrics[metric_name]))
        
        
    
# Run the full train/evaluate cycle when this cell executes.
train_and_evaluate()

1.9.0
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmp1_l_3foa', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122c682b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
project_resource_summary text_2_vec data shape: (59997, 50)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done

INFO:tensorflow:global_step/sec: 93.3566
INFO:tensorflow:loss = 46.06017, step = 7101 (1.070 sec)
INFO:tensorflow:global_step/sec: 88.9416
INFO:tensorflow:loss = 28.571085, step = 7201 (1.125 sec)
INFO:tensorflow:global_step/sec: 82.9156
INFO:tensorflow:loss = 36.333584, step = 7301 (1.205 sec)
INFO:tensorflow:global_step/sec: 91.2674
INFO:tensorflow:loss = 39.32615, step = 7401 (1.096 sec)
INFO:tensorflow:global_step/sec: 90.2137
INFO:tensorflow:loss = 48.401115, step = 7501 (1.108 sec)
INFO:tensorflow:global_step/sec: 95.0335
INFO:tensorflow:loss = 45.5672, step = 7601 (1.052 sec)
INFO:tensorflow:global_step/sec: 85.8093
INFO:tensorflow:loss = 37.94393, step = 7701 (1.166 sec)
INFO:tensorflow:global_step/sec: 89.3341
INFO:tensorflow:loss = 44.931038, step = 7801 (1.120 sec)
INFO:tensorflow:global_step/sec: 90.3432
INFO:tensorflow:loss = 48.488194, step = 7901 (1.107 sec)
INFO:tensorflow:global_step/sec: 88.8594
INFO:tensorflow:loss = 42.0667, step = 8001 (1.125 sec)
INFO:tensorflow:g

INFO:tensorflow:global_step/sec: 89.0663
INFO:tensorflow:loss = 38.74178, step = 15401 (1.122 sec)
INFO:tensorflow:global_step/sec: 89.981
INFO:tensorflow:loss = 52.476234, step = 15501 (1.112 sec)
INFO:tensorflow:global_step/sec: 87.4954
INFO:tensorflow:loss = 42.737293, step = 15601 (1.143 sec)
INFO:tensorflow:global_step/sec: 84.2905
INFO:tensorflow:loss = 44.797543, step = 15701 (1.187 sec)
INFO:tensorflow:global_step/sec: 89.711
INFO:tensorflow:loss = 43.4347, step = 15801 (1.115 sec)
INFO:tensorflow:global_step/sec: 89.9665
INFO:tensorflow:loss = 41.97924, step = 15901 (1.111 sec)
INFO:tensorflow:global_step/sec: 94.4363
INFO:tensorflow:loss = 39.788208, step = 16001 (1.059 sec)
INFO:tensorflow:global_step/sec: 89.4881
INFO:tensorflow:loss = 46.326164, step = 16101 (1.118 sec)
INFO:tensorflow:global_step/sec: 89.7951
INFO:tensorflow:loss = 28.670942, step = 16201 (1.114 sec)
INFO:tensorflow:global_step/sec: 85.4744
INFO:tensorflow:loss = 35.67997, step = 16301 (1.170 sec)
INFO:te