In [113]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


# Dataset and model locations.
# The defaults are the original author's local paths; set DONORSCHOOSE_DATA_DIR /
# DONORSCHOOSE_MODEL_DIR in the environment to run the notebook elsewhere without
# editing this cell.
DATA_DIR = os.environ.get('DONORSCHOOSE_DATA_DIR',
                          '/Users/xinwang/ai/dataset/kaggle/DonorsChoose/')
# Legacy alias kept so existing references keep working. NOTE: it shadows the
# built-in dir(); prefer DATA_DIR in new code.
dir = DATA_DIR
# os.path.join tolerates a directory given with or without a trailing separator.
train_file = os.path.join(DATA_DIR, 'train.csv')
test_file = os.path.join(DATA_DIR, 'test.csv')
resource_file = os.path.join(DATA_DIR, 'resources.csv')
model_dir = os.environ.get('DONORSCHOOSE_MODEL_DIR',
                           "/Users/xinwang/Downloads/models_temp/")
# Module-level scikit-learn LabelEncoder instance, created once here so the
# preprocessing code further down can turn categorical values into integer codes.
label = LabelEncoder()

# Name of the label column; it is the only column the training CSV has that the
# test CSV lacks.
target = 'project_is_approved'

# Column names applied to test.csv when it is read with an explicit `names=` list.
TEST_CSV_COLUMNS = [
    'id',
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects',
]
# train.csv: the same columns followed by the label (a separate list object).
CSV_COLUMNS = TEST_CSV_COLUMNS + [target]
# resources.csv columns.
RESOURCE_COLUMNS = ['id', 'description', 'quantity', 'price']

# Mode tags passed around to select which dataset split is being processed.
Train_Mode, Eval_Mode, Test_Mode = 'train', 'eval', 'test'

# Number of rows drawn from each class when balancing the training data.
positive_sample_size = 5000
negative_sample_size = 5000

# Text settings for the resource-summary feature.
summary_vocab_size = 10000
summary_sentence_size = 50
summary_embedding_size = 20

# Text settings for the essay features.
essay_vocab_size = 30000
essay_sentence_size = 200
essay_embedding_size = 50

# Token id used to pad sequences to a fixed length.
pad_id = 0


# Collapse resources.csv to one row per project id whose `description` is the
# space-joined text of every resource line belonging to that project.
resource_df = pd.read_csv(resource_file)

# Missing descriptions become empty strings (the previous per-value type check
# turned them into the literal token "nan"); astype(str) also covers any other
# non-string scalar so ' '.join can never fail on a value's type.
resource_descriptions = resource_df['description'].fillna('').astype(str)

# groupby sorts by id, matching the order the former explicit loop produced;
# reset_index() yields a frame with the columns ['id', 'description'].
desc_df = (
    resource_descriptions
    .groupby(resource_df['id'])
    .agg(' '.join)
    .reset_index()
)

desc_df.head()

1.9.0


Unnamed: 0,id,description
0,p000001,"Cap Barbell 300 Pound Olympic Set, Grey Cap Ba..."
1,p000002,10 Sony Headphones (BUY 9 GET 1 FREE) Belkin 6...
2,p000003,EE820X - Phonemic Awareness Instant Learning C...
3,p000004,A Bad Case of the Giggles Poems That Will Make...
4,p000005,"Fitbit Zip Wireless Activity Tracker, Lime Fit..."


In [117]:
# ---- Run-level settings ----
SEED = 100
train_epoches = 5000  # handed to classifier.train(steps=...) in train_and_evaluate
hidden_layers = [100, 50, 100, 50]

# Short aliases for the tf.feature_column constructors used in this cell.
_hash_column = tf.feature_column.categorical_column_with_hash_bucket
_vocab_column = tf.feature_column.categorical_column_with_vocabulary_list
_numeric_column = tf.feature_column.numeric_column
_identity_column = tf.feature_column.categorical_column_with_identity

# ---- Hashed categorical columns ----
teacher_id = _hash_column('teacher_id', hash_bucket_size=1000)
project_title = _hash_column('project_title', hash_bucket_size=5000)

# ---- Vocabulary-backed categorical columns ----
# Each one is paired with a numeric "<name>_bins" column holding an integer
# encoding of the same raw feature.
teacher_prefix = _vocab_column(
    "teacher_prefix",
    vocabulary_list=["Mrs.", "Ms.", "Mr.", "Teacher", "Dr."])
teacher_prefix_bins = _numeric_column('teacher_prefix_bins')

school_state = _vocab_column(
    "school_state",
    vocabulary_list=[
        "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
        "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA",
        "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE",
        "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI",
        "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV",
        "WY",
    ])
school_state_bins = _numeric_column('school_state_bins')

project_grade_category = _vocab_column(
    "project_grade_category",
    vocabulary_list=["Grades 3-5", "Grades 6-8", "Grades 9-12", "Grades PreK-2"])
project_grade_category_bins = _numeric_column('project_grade_category_bins')

project_subject_categories = _vocab_column(
    "project_subject_categories",
    vocabulary_list=[
        "Applied Learning",
        "Health & Sports",
        "History & Civics",
        "Literacy & Language",
        "Math & Science",
        "Music & The Arts",
        "Special Needs",
        "Warmth",
    ])
project_subject_categories_bins = _numeric_column('project_subject_categories_bins')

project_subject_subcategories = _vocab_column(
    "project_subject_subcategories",
    vocabulary_list=[
        "Applied Learning",
        "Care & Hunger",
        "Health & Sports",
        "History & Civics",
        "Literacy & Language",
        "Math & Science",
        "Music & The Arts",
        "Warmth",
        "Applied Sciences",
        "Character Education",
        "Civics & Government",
        "College & Career Prep",
        "Community Service",
        "ESL",
        "Early Development",
        "Economics",
        "Environmental Science",
        "Extracurricular",
        "Financial Literacy",
        "Foreign Languages",
        "Gym & Fitness",
        "Health & Life Science",
        "Health & Wellness",
        "History & Geography",
        "Literacy",
        "Literature & Writing",
        "Mathematics",
        "Music",
        "Nutrition Education",
        "Other",
        "Parent Involvement",
        "Performing Arts",
        "Social Sciences",
        "Special Needs",
        "Team Sports",
        "Visual Arts",
    ])
project_subject_subcategories_bins = _numeric_column('project_subject_subcategories_bins')

# ---- Plain numeric columns ----
posted_projects = _numeric_column('teacher_number_of_previously_posted_projects')
posted_projects_bins = _numeric_column('posted_projects_bins')

quantity = _numeric_column('quantity')
price = _numeric_column('price')
avgPrice = _numeric_column('avgPrice')


#################### Text columns #########################
# Token-id sequences; both columns use summary_vocab_size as their bucket count.
summary_vec = _identity_column('project_resource_summary_vec',
                               num_buckets=summary_vocab_size)
description_vec = _identity_column('description_vec',
                                   num_buckets=summary_vocab_size)

# No stand-alone columns are fed to the linear part; it only receives the crosses.
basic_columns = []

# Pairwise feature crosses for the linear part, given as (feature keys, hash bucket count).
_cross_specs = [
    (['teacher_prefix', 'school_state'], 1000),
    (['school_state', 'project_grade_category'], 1000),
    (['project_grade_category', 'project_subject_categories'], 1000),
    (['project_subject_categories', 'project_subject_subcategories'], 1000),
    (['project_subject_categories', 'posted_projects_bins'], 10000),
]
crossed_columns = [
    tf.feature_column.crossed_column(keys, hash_bucket_size=buckets)
    for keys, buckets in _cross_specs
]

# Inputs of the DNN part, assembled in three groups (order kept as before).
_dense_inputs = [
    teacher_prefix_bins,
    school_state_bins,
    posted_projects,
    project_grade_category_bins,
    project_subject_categories_bins,
    project_subject_subcategories_bins,
    posted_projects_bins,
    quantity,
    price,
    avgPrice,
]

# Categorical columns are wrapped in indicator columns before entering the DNN.
_indicator_inputs = [
    tf.feature_column.indicator_column(categorical)
    for categorical in (
        teacher_id,
        project_title,
        teacher_prefix,
        school_state,
        project_grade_category,
        project_subject_categories,
        project_subject_subcategories,
    )
]

# Token-id columns are mapped to dense embeddings.
_embedding_inputs = [
    tf.feature_column.embedding_column(summary_vec, dimension=summary_embedding_size),
    tf.feature_column.embedding_column(description_vec, dimension=20),
]

deep_columns = _dense_inputs + _indicator_inputs + _embedding_inputs

def mix_operation(a, b):
    """Return the string forms of `a` and `b` joined by a single underscore."""
    return '_'.join((str(a), str(b)))

def text_2_vec(feature_name, data_set, vocab_size, sentence_size, tokenizer=None):
    """Convert a text column into a fixed-width matrix of token ids.

    Args:
        feature_name: name of the text column in `data_set`.
        data_set: DataFrame holding the column.
        vocab_size: `num_words` given to a newly created Tokenizer.
        sentence_size: number of token ids kept per row; longer texts are cut
            at the end, shorter ones are padded at the end with `pad_id`.
        tokenizer: optional, already fitted Tokenizer. When supplied it is used
            as-is (no refitting), so eval/test text can be encoded with the
            vocabulary learned from the training text. When omitted, a new
            Tokenizer is created and fit on this column, as before.

    Returns:
        Array of shape (number of rows, sentence_size) with token ids.
    """
    texts = data_set[feature_name].values

    # A Tokenizer fit here only knows the texts of this one call, so its
    # word -> id mapping is specific to this data set; pass `tokenizer` to share
    # a mapping between splits.
    if tokenizer is None:
        tokenizer = Tokenizer(vocab_size)
        tokenizer.fit_on_texts(texts)
    sequence_data = tokenizer.texts_to_sequences(texts)

    data = sequence.pad_sequences(sequence_data,
                                  maxlen=sentence_size,
                                  truncating='post',
                                  padding='post',
                                  value=pad_id)
    print(feature_name + " text_2_vec data shape:", data.shape)

    return data

def build_input_features_dict(input_df):
    """Assemble the feature dict handed to numpy_input_fn.

    Args:
        input_df: preprocessed DataFrame that already contains the raw columns,
            their "*_bins" encodings, the merged `quantity` / `price` totals and
            the `project_resource_summary` / `description` text columns.

    Returns:
        dict mapping feature-column keys to numpy arrays with one entry per row.
    """
    # Columns copied through unchanged; each key equals its DataFrame column name.
    passthrough_columns = [
        'teacher_id',
        'project_title',
        'teacher_prefix', 'teacher_prefix_bins',
        'school_state', 'school_state_bins',
        'project_grade_category', 'project_grade_category_bins',
        'project_subject_categories', 'project_subject_categories_bins',
        'project_subject_subcategories', 'project_subject_subcategories_bins',
        'teacher_number_of_previously_posted_projects', 'posted_projects_bins',
        'quantity', 'price',
    ]
    features = {name: input_df[name].values for name in passthrough_columns}

    # Average price per requested item. `.values` keeps this entry a plain
    # ndarray like every other feature (it used to be a pandas Series, whose
    # indexing depends on the frame's index labels).
    features['avgPrice'] = (input_df['price'] / input_df['quantity']).values

    #---  Text columns  ---#
    features['project_resource_summary_vec'] = text_2_vec(
        'project_resource_summary', input_df, summary_vocab_size, summary_sentence_size)
    # The description text reuses the summary vocabulary size, with 100 ids per row.
    features['description_vec'] = text_2_vec('description', input_df, summary_vocab_size, 100)
    #--- Text columns  ---#

    return features


def balanceData(df):
    """Return a class-balanced, shuffled sample of the labelled data.

    Draws `positive_sample_size` approved rows and `negative_sample_size`
    rejected rows from `df`, concatenates them and shuffles the result.

    Every random step is seeded with SEED. The shuffle used to be unseeded,
    which made the row order, and therefore the later seeded train/eval split,
    differ from run to run.

    Args:
        df: DataFrame with a `project_is_approved` column of 0/1 labels.

    Returns:
        Shuffled DataFrame with the sampled rows (original index labels kept).

    Raises:
        ValueError: raised by `DataFrame.sample` when a class has fewer rows
            than the requested sample size.
    """
    approved_rows = df[df['project_is_approved'] == 1].sample(
        n=positive_sample_size, random_state=SEED)
    rejected_rows = df[df['project_is_approved'] == 0].sample(
        n=negative_sample_size, random_state=SEED)

    data_set = pd.concat([approved_rows, rejected_rows])
    return shuffle(data_set, random_state=SEED)


# Load the labelled training data. The header row is skipped and CSV_COLUMNS is
# applied instead. The file is opened in a `with` block so the handle is closed
# as soon as parsing finishes (it was previously left open).
with tf.gfile.Open(train_file) as train_handle:
    all_df = pd.read_csv(
        train_handle, names=CSV_COLUMNS,
        skipinitialspace=True, engine="python", skiprows=1)
balance_df = balanceData(all_df)

# 80/20 split of the balanced sample: the eval set is every balanced row that
# was not drawn for training.
train_df = balance_df.sample(frac=0.8, random_state=SEED)
eval_df = balance_df.drop(train_df.index)
def build_input_df(file, mode):
    """Return the raw DataFrame for the requested mode.

    Args:
        file: path of the CSV to read. Only used in Test mode; the train and
            eval frames are the module-level `train_df` / `eval_df` created
            when the training data was split.
        mode: one of Train_Mode, Eval_Mode, Test_Mode. Any value other than
            Test_Mode or Eval_Mode is treated as Train_Mode.

    Returns:
        DataFrame for the mode. For train/eval this is the shared module-level
        object itself, not a copy.
    """
    if mode == Test_Mode:
        # Read the file the caller asked for (the argument used to be ignored in
        # favour of the global test_file) and close the handle afterwards.
        with tf.gfile.Open(file) as test_handle:
            test_df = pd.read_csv(test_handle, names=TEST_CSV_COLUMNS,
                                  skipinitialspace=True, engine="python", skiprows=1)

        print('get Test Mode dataset', test_df.shape)
        return test_df

    if mode == Eval_Mode:
        print('get Eval Mode dataset', eval_df.shape)
        return eval_df

    print('get Train Mode dataset', train_df.shape)
    return train_df

def _stable_codes(values, reference):
    """Integer-encode `values` using the sorted unique values of `reference`.

    Codes follow the sorted order of the reference values, so every call that
    shares a reference yields the same code for the same value, whichever values
    happen to occur in `values`. Values absent from `reference` (and missing
    values) get -1.
    """
    categories = np.sort(pd.Series(reference).dropna().unique())
    return pd.Categorical(values, categories=categories).codes.astype(np.int64)


def input_fn(file, mode, num_epochs, shuffle):
    """Build the Estimator input function for one dataset split.

    Loads the frame for `mode`, adds integer encodings of the categorical
    columns, merges the per-project resource totals and descriptions, and wraps
    the resulting arrays with `tf.estimator.inputs.numpy_input_fn`.

    Args:
        file: CSV path forwarded to `build_input_df`.
        mode: Train_Mode, Eval_Mode or Test_Mode.
        num_epochs: forwarded to numpy_input_fn (None repeats indefinitely).
        shuffle: forwarded to numpy_input_fn. Note that inside this function the
            name hides the `sklearn.utils.shuffle` import.

    Returns:
        The callable produced by numpy_input_fn.
    """
    # Work on a copy: build_input_df hands back the shared module-level frames
    # for train/eval, and the preprocessing below must not alter them.
    input_df = build_input_df(file, mode).copy()

    with tf.gfile.Open(resource_file) as resource_handle:
        resource_df = pd.read_csv(
            resource_handle, names=RESOURCE_COLUMNS,
            skipinitialspace=True, engine="python", skiprows=1)

    input_df['teacher_prefix'] = input_df['teacher_prefix'].fillna('Mr.')

    # Encode categorical columns against the full labelled training frame so a
    # given value maps to the same integer in train, eval and test. Fitting a
    # fresh encoder on each split (as was done before) assigns codes by the
    # values present in that split only, so the codes could disagree between
    # the data the model was trained on and the data it is applied to.
    for column in ('teacher_prefix', 'school_state', 'project_grade_category',
                   'project_subject_categories', 'project_subject_subcategories'):
        reference = all_df[column]
        if column == 'teacher_prefix':
            reference = reference.fillna('Mr.')  # same imputation as applied above
        input_df[column + '_bins'] = _stable_codes(input_df[column], reference)

    input_df['posted_projects_periods'] = pd.cut(
        input_df['teacher_number_of_previously_posted_projects'], [-1, 7, 50, 1000])
    # pd.cut always creates all three interval categories, so the category code
    # is the bin index regardless of which bins occur in this split (-1 marks a
    # count outside every bin).
    input_df['posted_projects_bins'] = (
        input_df['posted_projects_periods'].cat.codes.astype(np.int64))

    # Per-project totals of quantity and price from the resource file.
    resource_totals = resource_df.groupby('id', as_index=False)[['quantity', 'price']].sum()

    # Add the quantity / price totals and the joined description text.
    # Inner joins: a project with no resource rows is dropped.
    input_df = pd.merge(input_df, resource_totals, how='inner', on='id')
    input_df = pd.merge(input_df, desc_df, how='inner', on='id')

    features = build_input_features_dict(input_df)

    length = len(input_df)
    print(mode + ' dataset length', length)

    input_df.info()
    if mode == Test_Mode:
        # Prediction has no labels; feed a zero placeholder of the same shape
        # as before instead of uninitialised memory.
        labels = np.zeros([length, 1])
    else:
        labels = input_df[target].values

    print(mode + ' labels size:' + str(len(labels)))

    # Several reader threads are only used for endless, shuffled training input.
    # The numpy_input_fn documentation asks for num_threads=1 in evaluation and
    # prediction to get a predictable, repeatable read order.
    threads = 8 if (shuffle and num_epochs is None) else 1

    return tf.estimator.inputs.numpy_input_fn(features,
                                              labels,
                                              batch_size=100,
                                              num_epochs=num_epochs,
                                              shuffle=shuffle,
                                              num_threads=threads)

def buildClassifier():
    """Create the wide-and-deep classifier used by this notebook.

    The linear part receives `basic_columns + crossed_columns`; the DNN part
    receives `deep_columns` and uses `hidden_layers`, relu6 activations and a
    dropout rate of 0.4. No model directory is passed to the estimator.
    """
    wide_columns = basic_columns + crossed_columns
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_layers,
        dnn_activation_fn=tf.nn.relu6,
        dnn_dropout=0.4,
    )
    
def train_and_evaluate():
    """Train a new classifier, print its evaluation metrics and return it.

    Training runs for `train_epoches` steps on the train split (repeated and
    shuffled); evaluation makes one unshuffled pass over the eval split.
    """
    model = buildClassifier()

    # The eval input is built only after training has finished, as before.
    training_input = input_fn(train_file, Train_Mode, num_epochs=None, shuffle=True)
    model.train(input_fn=training_input, steps=train_epoches)

    evaluation_input = input_fn(train_file, Eval_Mode, num_epochs=1, shuffle=False)
    metrics = model.evaluate(input_fn=evaluation_input, steps=None)

    print('-'*100)
    for metric_name in sorted(metrics):
        print("%s: %s" % (metric_name, metrics[metric_name]))

    return model


# Sentinel output: reaching this line means every statement above it in the cell ran.
print('compile done')

compile done


In [None]:
# Train and evaluate the model; the returned estimator is kept in `classifier`
# for the prediction cell.
classifier = train_and_evaluate()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmpirtda76l', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13d24d208>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
get Train Mode dataset (45600, 16)
project_resource_summary text_2_vec data shape: (45600, 50)
description text_2_vec data shape: (45600, 100)
train dataset length 45600
<class 'pandas.core.frame.DataFrame'>
Int64Index: 45600 entries, 0 to 45599
Data columns (total 26 columns):


INFO:tensorflow:global_step/sec: 72.0412
INFO:tensorflow:loss = 67.65539, step = 5001 (1.388 sec)
INFO:tensorflow:global_step/sec: 72.549
INFO:tensorflow:loss = 68.09361, step = 5101 (1.378 sec)
INFO:tensorflow:global_step/sec: 72.4766
INFO:tensorflow:loss = 69.397644, step = 5201 (1.380 sec)
INFO:tensorflow:global_step/sec: 72.865
INFO:tensorflow:loss = 65.21288, step = 5301 (1.372 sec)
INFO:tensorflow:global_step/sec: 71.8475
INFO:tensorflow:loss = 72.94888, step = 5401 (1.392 sec)
INFO:tensorflow:global_step/sec: 66.642
INFO:tensorflow:loss = 76.173744, step = 5501 (1.501 sec)
INFO:tensorflow:global_step/sec: 68.8403
INFO:tensorflow:loss = 66.29588, step = 5601 (1.452 sec)
INFO:tensorflow:global_step/sec: 70.522
INFO:tensorflow:loss = 70.17819, step = 5701 (1.418 sec)
INFO:tensorflow:global_step/sec: 72.7537
INFO:tensorflow:loss = 68.528015, step = 5801 (1.375 sec)
INFO:tensorflow:global_step/sec: 72.2724
INFO:tensorflow:loss = 67.742065, step = 5901 (1.384 sec)
INFO:tensorflow:glob

In [116]:
# Score the test set with the trained classifier (one unshuffled pass).
test_input_fn = input_fn(test_file, Test_Mode, num_epochs=1, shuffle=False)
result = classifier.predict(input_fn=test_input_fn)

# Each prediction is a dict; the first 'logistic' entry is collected per row.
predictions = np.array([row['logistic'][0] for row in result])

print('predictions.shape', predictions.shape)
print(predictions[:10])

# Pair the scores with the ids read straight from the test CSV (row order is
# assumed to match the order the predictions were produced in) and write the
# submission file.
test_data = pd.read_csv(test_file)
submission_columns = {
    "id": test_data['id'],
    "project_is_approved": predictions,
}
predict_result = pd.DataFrame(submission_columns)
predict_result.to_csv('prince_DNN_submission.csv', index=False)
print('Done')

get Test Mode dataset (78035, 15)
project_resource_summary text_2_vec data shape: (78035, 50)
description text_2_vec data shape: (78035, 100)
test dataset length 78035
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78035 entries, 0 to 78034
Data columns (total 25 columns):
id                                              78035 non-null object
teacher_id                                      78035 non-null object
teacher_prefix                                  78035 non-null object
school_state                                    78035 non-null object
project_submitted_datetime                      78035 non-null object
project_grade_category                          78035 non-null object
project_subject_categories                      78035 non-null object
project_subject_subcategories                   78035 non-null object
project_title                                   78035 non-null object
project_essay_1                                 78035 non-null object
project_essay_2        

  interactivity=interactivity, compiler=compiler, result=result)


Done
