In [1]:
## reference: https://colab.research.google.com/drive/1QhSnbh-WJVGZjQJF8u974msOL_vAgMeS#scrollTo=PlAGuj5kuZm9
## https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific location; point `dir` at the local
# DonorsChoose data directory before running. The name shadows the builtin
# dir() but is kept because other cells may reference it.
dir='/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file='train_1.csv'
eval_file='train_2.csv'
test_file='test.csv'
resource_file='resources.csv'
SEED=1000
positive_sample_size=100

# Forwarded to read_csv below: with low_memory=False pandas infers each
# column's dtype from the whole file instead of chunk by chunk, which avoids
# mixed-dtype columns. Previously this was assigned but never passed on.
low_memory=False

# --- Load data ---------------------------------------------------------------
# os.path.join tolerates `dir` with or without a trailing separator.
train_df = pd.read_csv(os.path.join(dir, train_file), low_memory=low_memory)
eval_df = pd.read_csv(os.path.join(dir, eval_file), low_memory=low_memory)
test_df = pd.read_csv(os.path.join(dir, test_file), low_memory=low_memory)
resource_df = pd.read_csv(os.path.join(dir, resource_file), low_memory=low_memory)
label = LabelEncoder()


def sampleData():
    """Build a training frame from a sample of approved rows plus all rejected rows.

    Reads the module-level ``train_df``. ``positive_sample_size`` rows with
    ``project_is_approved == 1`` are drawn using ``SEED``, concatenated with
    every row whose label is 0, and the combined frame is shuffled.

    The shuffle is seeded with ``SEED`` as well, so repeated calls on the same
    ``train_df`` return rows in the same order.

    Returns:
        pandas.DataFrame: the shuffled combination of sampled positives and
        all negatives.

    Raises:
        ValueError: propagated from ``DataFrame.sample`` when fewer than
            ``positive_sample_size`` approved rows are available.
    """
    approved = train_df[train_df['project_is_approved']==1].sample(n=positive_sample_size,
                                                                   random_state=SEED)
    rejected = train_df[train_df['project_is_approved']==0]

    train_data = pd.concat([approved, rejected])
    # Seed the shuffle too; seeding only the sampling step left the row order
    # different on every run.
    train_data = shuffle(train_data, random_state=SEED)

    return train_data

# Discard rows that have no teacher prefix. Both frames are modified in place.
for _frame in (train_df, eval_df):
    _frame.dropna(subset=["teacher_prefix"], inplace=True)

# The complete frames (not the output of sampleData) are used from here on.
train_data, eval_data = train_df, eval_df

# Preview the first rows of the training frame as the cell output.
train_data.head()

Using TensorFlow backend.


1.9.0


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1
1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0
2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1
3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0
4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1


In [None]:
# Text hyper-parameters.
vocab_size=30000     # given to the Tokenizer and used as the identity column's bucket count
embedding_size=50    # not referenced elsewhere in this cell
sentence_size=200    # every essay is padded/truncated to this many token ids

target = 'project_is_approved'

# Create the checkpoint directory BEFORE building the classifier: the
# classifier's model_dir is derived from it, and referencing it first raises
# NameError on a fresh kernel.
model_dir = tempfile.mkdtemp()

# Sparse bag-of-words linear model: each token id in feature 'x' is treated as
# a categorical identity value.
project_essay_1 = tf.feature_column.categorical_column_with_identity('x', vocab_size)
classifier = tf.estimator.LinearClassifier(feature_columns=[project_essay_1], 
                                           model_dir=os.path.join(model_dir, 'bow_sparse'))

# Reserved token ids. Only pad_id is read in this cell (as the padding value).
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

def process_text(feature_name, data_set):
    """Encode one text column as an array of word ids via ``VocabularyProcessor``.

    A new ``tf.contrib.learn`` ``VocabularyProcessor`` is constructed with the
    module-level ``sentence_size``, fitted on the column, and used to transform
    that same column. The size of the resulting vocabulary is printed.

    Args:
        feature_name: name of the column in ``data_set`` holding the text.
        data_set: object supporting ``data_set[feature_name].values``
            (a pandas DataFrame at the visible call sites' data).

    Returns:
        numpy.ndarray built from the processor's transformed rows.
    """
    processor = tf.contrib.learn.preprocessing.VocabularyProcessor(sentence_size)
    # fit_transform is materialised with list() before conversion to an array.
    encoded_rows = list(processor.fit_transform(data_set[feature_name].values))
    encoded = np.array(encoded_rows)

    print('vocabulary size ', len(processor.vocabulary_))

    return encoded


def text_2_vec(feature_name, data_set, tokenizer=None):
    """Convert a text column into a padded 2-D array of token ids.

    Args:
        feature_name: name of the text column in ``data_set``.
        data_set: object supporting ``data_set[feature_name].values``.
        tokenizer: optional, already-fitted Keras ``Tokenizer``. When given it
            is used as-is, so several data sets can share one word-to-id
            mapping. When ``None`` (the default, matching the previous
            behaviour) a new ``Tokenizer(vocab_size)`` is fitted on this
            column.

    Returns:
        numpy.ndarray of token ids, one row per text, each row padded or
        truncated at the end to ``sentence_size`` entries using ``pad_id``.
    """
    texts = data_set[feature_name].values

    if tokenizer is None:
        tokenizer = Tokenizer(vocab_size)
        tokenizer.fit_on_texts(texts)
    sequence_data = tokenizer.texts_to_sequences(texts)

    data = sequence.pad_sequences(sequence_data, 
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
    print("text_2_vec data shape:", data.shape)

    return data

# Fit the vocabulary on the training essays only and reuse it for evaluation.
# Fitting a second tokenizer on the eval set would assign different ids to the
# same words, so the eval features would not line up with the trained weights.
essay_tokenizer = Tokenizer(vocab_size)
essay_tokenizer.fit_on_texts(train_data['project_essay_1'].values)

project_essay_1_train = text_2_vec('project_essay_1', train_data, essay_tokenizer)
project_essay_1_eval = text_2_vec('project_essay_1', eval_data, essay_tokenizer)

def parser(x, y):
    """Wrap ``x`` in a feature dict under key ``"x"`` and pass ``y`` through.

    Returns:
        tuple: ``({"x": x}, y)``.
    """
    return {"x": x}, y

def train_input_fn():
    """Input function for training on the encoded ``project_essay_1`` column.

    Slices the module-level ``project_essay_1_train`` array together with
    ``train_data[target]``, then shuffles (buffer of 100), batches by 100,
    maps each batch through ``parser`` and repeats indefinitely.

    Returns:
        The result of ``get_next()`` on a one-shot iterator over that dataset.
    """
    essays = project_essay_1_train
    labels = train_data[target]

    print('x_train.shape',essays.shape)

    # Order matters: shuffle individual examples, then batch, then wrap the
    # batch into the feature dict, then repeat.
    pipeline = (
        tf.data.Dataset.from_tensor_slices((essays, labels))
        .shuffle(buffer_size=100)
        .batch(100)
        .map(parser)
        .repeat()
    )

    return pipeline.make_one_shot_iterator().get_next()
    
def eval_input_fn():
    """Input function for evaluation/prediction on the encoded eval essays.

    Slices the module-level ``project_essay_1_eval`` array together with
    ``eval_data[target]``, batches by 100 and maps each batch through
    ``parser``. There is no shuffling and no repeat, so the data is yielded
    once in its stored order.

    Returns:
        The result of ``get_next()`` on a one-shot iterator over that dataset.
    """
    essays = project_essay_1_eval
    labels = eval_data[target]

    pipeline = (
        tf.data.Dataset.from_tensor_slices((essays, labels))
        .batch(100)
        .map(parser)
    )

    return pipeline.make_one_shot_iterator().get_next()


# Registry of every classifier passed to train_and_evaluate, keyed by model_dir.
all_classifiers = {}
def train_and_evaluate(classifier):
    """Train ``classifier``, evaluate it, and write a precision/recall curve summary.

    Steps:
      1. Register the classifier in ``all_classifiers`` under its ``model_dir``.
      2. Train for 10000 steps with ``train_input_fn``.
      3. Run ``evaluate`` with ``eval_input_fn`` (its return value is not used).
      4. Collect the first ``'logistic'`` value of every prediction on the
         eval input.
      5. Reset the default graph, build a ``pr_curve`` summary from those
         predictions against ``eval_data[target]`` cast to bool, and write it
         to ``<model_dir>/eval``.

    Args:
        classifier: an estimator exposing ``model_dir``, ``train``,
            ``evaluate`` and ``predict`` whose predictions contain a
            ``'logistic'`` entry.

    Returns:
        None.
    """
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps = 10000)
    classifier.evaluate(input_fn=eval_input_fn)

    predictions = np.array([p['logistic'][0] for p in classifier.predict(
        input_fn=eval_input_fn)])

    # Start from an empty graph so it holds only the PR-curve summary op.
    tf.reset_default_graph()
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, 
                              labels = eval_data[target].astype(bool),
                              num_thresholds = 21)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'),
                                      sess.graph)
        try:
            writer.add_summary(sess.run(pr), global_step = 0)
        finally:
            # Always release the event file, even if computing or writing the
            # summary fails.
            writer.close()


# Train and evaluate the LinearClassifier built earlier in this cell (model_dir
# '.../bow_sparse'); this also writes its precision/recall summary.
train_and_evaluate(classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hz/zfwx8n_d19g70bf5p8jpl4f43zxvln/T/tmp3_aqxn3u/bow_sparse', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12310e4e0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
