In [2]:
## reference: https://colab.research.google.com/drive/1QhSnbh-WJVGZjQJF8u974msOL_vAgMeS#scrollTo=PlAGuj5kuZm9
## https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
from sklearn.svm import SVC

from keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorboard import summary as summary_lib
from tensorflow.python.keras.preprocessing import sequence

import seaborn as sns
import matplotlib.pyplot as plt

import tempfile
import os
print(tf.__version__)


# --- Data location -----------------------------------------------------------
# NOTE(review): `dir` shadows the Python builtin of the same name and is a
# hard-coded absolute path; the name and value are kept so that other cells
# referencing them keep working.
dir = '/Users/xinwang/ai/dataset/kaggle/DonorsChoose/'
train_file = 'train_1.csv'
eval_file = 'train_2.csv'
test_file = 'test.csv'
resource_file = 'resources.csv'

# --- Sampling configuration --------------------------------------------------
SEED = 1000
positive_sample_size = 30000

# Passed to every read_csv call below so that pandas infers each column's dtype
# from the whole file instead of chunk by chunk (avoids mixed-type columns and
# the associated DtypeWarning).
low_memory = False

# --- Load the raw CSV files --------------------------------------------------
train_df = pd.read_csv(os.path.join(dir, train_file), low_memory=low_memory)
eval_df = pd.read_csv(os.path.join(dir, eval_file), low_memory=low_memory)
test_df = pd.read_csv(os.path.join(dir, test_file), low_memory=low_memory)
resource_df = pd.read_csv(os.path.join(dir, resource_file), low_memory=low_memory)

# Shared LabelEncoder instance.
label = LabelEncoder()


def sampleData(data=None, n_positive=None, seed=None):
    """Build a shuffled training frame with a capped number of approved rows.

    All rows with ``project_is_approved == 0`` are kept; at most ``n_positive``
    rows with ``project_is_approved == 1`` are drawn at random. The combined
    frame is then shuffled. Both random steps are seeded, so repeated calls
    with the same arguments return rows in the same order.

    Args:
        data: DataFrame containing a ``project_is_approved`` column.
            Defaults to the module-level ``train_df``.
        n_positive: Maximum number of approved rows to keep. Defaults to the
            module-level ``positive_sample_size``. If fewer approved rows
            exist, all of them are kept instead of raising ``ValueError``.
        seed: Random seed for sampling and shuffling. Defaults to the
            module-level ``SEED``.

    Returns:
        A new DataFrame (original index labels preserved) holding the sampled
        approved rows plus every non-approved row, in shuffled order.
    """
    if data is None:
        data = train_df
    if n_positive is None:
        n_positive = positive_sample_size
    if seed is None:
        seed = SEED

    approved = data[data['project_is_approved'] == 1]
    rejected = data[data['project_is_approved'] == 0]

    # DataFrame.sample raises when n exceeds the population (replace=False),
    # so never ask for more approved rows than are available.
    approved_sample = approved.sample(n=min(n_positive, len(approved)),
                                      random_state=seed)

    combined = pd.concat([approved_sample, rejected])

    # Seeded full-frame shuffle so the result is reproducible.
    return combined.sample(frac=1, random_state=seed)

# Fill missing teacher prefixes with 'Mr.' in each loaded frame, in place.
for split_df in (train_df, eval_df, test_df):
    split_df.fillna(value={"teacher_prefix": 'Mr.'}, inplace=True)

# The working frames are plain aliases of the loaded frames (no copy is made).
train_data, eval_data, test_data = train_df, eval_df, test_df


def transfer_posted_project_buckets(x):
    """Translate an integer bucket code into its letter tag.

    Returns 'L' for code 0, 'M' for code 1 and 'H' for any other value.
    """
    if x == 0:
        return 'L'
    return 'M' if x == 1 else 'H'

# Bucket each row's count of previously posted projects into the intervals
# (-1, 7], (7, 50] and (50, 1000], then tag the row 'L' / 'M' / 'H'.
all_data = (train_data, eval_data, test_data)
for dataset in all_data:
    dataset['posted_projects_bins'] = pd.cut(dataset['teacher_number_of_previously_posted_projects'],
                                                [-1, 7, 50, 1000])
    # Take the integer code from the categorical itself: the code of each
    # interval is fixed by its position in the bin edges (0, 1, 2), so the
    # L/M/H mapping is the same for every split. Refitting a LabelEncoder per
    # split numbers only the bins that happen to be present, which shifts the
    # codes when a split lacks one of the intervals.
    # Values outside all bins (or missing) get code -1 and therefore fall
    # into the helper's catch-all 'H' branch.
    bin_codes = dataset['posted_projects_bins'].cat.codes
    dataset['posted_projects_buckets'] = bin_codes.apply(transfer_posted_project_buckets)

# Text vectorisation settings.
vocab_size = 10000      # passed to Tokenizer and used as the identity-column bucket count
embedding_size = 50     # dimension of each embedding column
sentence_size = 150     # every token sequence is padded / truncated to this length
pad_id = 0              # value used to pad short sequences

# Name of the label column.
target = 'project_is_approved'

# Fresh temporary directory created on every run.
model_dir = tempfile.mkdtemp()


def _fit_tokenizer(feature_name, data_set):
    """Fit a Keras Tokenizer (limited to `vocab_size` words) on one text column."""
    tokenizer = Tokenizer(vocab_size)
    tokenizer.fit_on_texts(data_set[feature_name].values)
    return tokenizer


def text_2_vec(feature_name, data_set, tokenizer=None):
    """Convert a text column into a fixed-width matrix of token ids.

    Args:
        feature_name: Name of the text column in `data_set`.
        data_set: DataFrame holding the column.
        tokenizer: Optional already-fitted Tokenizer. Pass the tokenizer that
            was fitted on the training split when encoding eval / test data so
            that the same word maps to the same id in every split. When
            omitted, a new tokenizer is fitted on `data_set` itself.

    Returns:
        Array with one row per input text and `sentence_size` columns; longer
        sequences are cut at the end and shorter ones are padded at the end
        with `pad_id`.
    """
    texts = data_set[feature_name].values

    if tokenizer is None:
        tokenizer = _fit_tokenizer(feature_name, data_set)
    sequence_data = tokenizer.texts_to_sequences(texts)

    data = sequence.pad_sequences(sequence_data,
                                  maxlen=sentence_size,
                                  truncating='post',
                                  padding='post',
                                  value=pad_id)
    print("text_2_vec data shape:", data.shape)

    return data


def _vectorize_splits(feature_name):
    """Encode one text feature for train / eval / test with a shared vocabulary.

    The tokenizer is fitted on the training split only and then reused for the
    other two splits, so token ids are consistent across all three.
    """
    tokenizer = _fit_tokenizer(feature_name, train_data)
    return tuple(text_2_vec(feature_name, split, tokenizer)
                 for split in (train_data, eval_data, test_data))


title_train, title_eval, title_test = _vectorize_splits('project_title')

essay_1_train, essay_1_eval, essay_1_test = _vectorize_splits('project_essay_1')

essay_2_train, essay_2_eval, essay_2_test = _vectorize_splits('project_essay_2')

(resource_summary_train,
 resource_summary_eval,
 resource_summary_test) = _vectorize_splits('project_resource_summary')

train_data.head()

1.9.0


  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29999 entries, 0 to 29998
Data columns (total 18 columns):
id                                              29999 non-null object
teacher_id                                      29999 non-null object
teacher_prefix                                  29999 non-null object
school_state                                    29999 non-null object
project_submitted_datetime                      29999 non-null object
project_grade_category                          29999 non-null object
project_subject_categories                      29999 non-null object
project_subject_subcategories                   29999 non-null object
project_title                                   29999 non-null object
project_essay_1                                 29999 non-null object
project_essay_2                                 29999 non-null object
project_essay_3                                 994 non-null object
project_essay_4                                 994 n

In [None]:
linear_columns = []
dnn_columns = []


# Names of the categorical input features.
FEATURES = ['teacher_id','teacher_prefix','school_state','project_grade_category',
            'project_subject_categories','project_subject_subcategories',
            'posted_projects_buckets'
           ]


def _vocab_indicator(key, vocabulary):
    """Indicator column over a fixed vocabulary list for feature `key`."""
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(key, vocabulary))


def _token_embedding(key):
    """Embedding column over integer token ids in [0, vocab_size) for feature `key`."""
    return tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_identity(key, vocab_size),
        dimension=embedding_size)


# --- Categorical features ----------------------------------------------------
# teacher_id is hashed into 1000 buckets rather than enumerated.
teacher_id = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_hash_bucket('teacher_id', 1000))

teacher_prefix = _vocab_indicator(
    "teacher_prefix", [
        "Mrs.","Ms.","Mr.","Teacher","Dr."
    ])

school_state = _vocab_indicator(
    "school_state", [
        "AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL",
        "IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE",
        "NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN",
        "TX","UT","VA","VT","WA","WI","WV","WY"
    ])

# TODO project_submitted_datetime

project_grade_category = _vocab_indicator(
    "project_grade_category", ["Grades 3-5","Grades 6-8","Grades 9-12","Grades PreK-2"])

project_subject_categories = _vocab_indicator(
    "project_subject_categories", [
        "Applied Learning","Health & Sports","History & Civics","Literacy & Language",
        "Math & Science","Music & The Arts","Special Needs","Warmth"])

project_subject_subcategories = _vocab_indicator(
    "project_subject_subcategories", [
        "Applied Learning","Care & Hunger","Health & Sports","History & Civics",
        "Literacy & Language","Math & Science","Music & The Arts",
        "Warmth","Applied Sciences","Character Education","Civics & Government",
        "College & Career Prep","Community Service","ESL","Early Development",
        "Economics","Environmental Science","Extracurricular","Financial Literacy",
        "Foreign Languages","Gym & Fitness","Health & Life Science","Health & Wellness",
        "History & Geography","Literacy","Literature & Writing","Mathematics","Music",
        "Nutrition Education","Other","Parent Involvement","Performing Arts",
        "Social Sciences","Special Needs","Team Sports","Visual Arts"])

posted_projects_buckets = _vocab_indicator('posted_projects_buckets', ['L','M','H'])


# --- Text features (token-id sequences -> embeddings) -------------------------
project_title = _token_embedding('project_title')

# The two essay columns are built but intentionally left out of
# feature_columns below (they were commented out in the original cell).
project_essay_1 = _token_embedding('project_essay_1')
project_essay_2 = _token_embedding('project_essay_2')

project_resource_summary = _token_embedding('project_resource_summary')


# feature_columns must be created here: the cell previously appended to it
# without ever defining it, which raises NameError on a fresh kernel.
feature_columns = [
    teacher_id,
    teacher_prefix,
    school_state,
    project_grade_category,
    project_subject_categories,
    project_subject_subcategories,
    posted_projects_buckets,
    project_title,
    project_resource_summary,
]


print('feature_columns len', len(feature_columns))
print(feature_columns)