In [2]:
from __future__ import print_function

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.utils import shuffle

import keras
from keras.preprocessing import sequence
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Embedding, Dropout, Activation, LSTM, Input, concatenate
from keras.layers.convolutional import Conv1D, MaxPooling1D

# NOTE(review): this is set after `import keras`, and the cell output below
# reports the TensorFlow backend -- confirm whether it should move above the import.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

Using TensorFlow backend.


In [1]:
def data_preprocessing(df,sample,ntest,ngrams,feature_len,model,strat_sampling=False,random_state=21):
    """Build padded n-gram index sequences and one-hot race labels from a name table.

    The input frame is cleaned, shuffled and cut into a test set (first ``ntest``
    rows), a dev set (next ``ntest`` rows) and a training pool (the rest), from
    which the training sample is drawn. The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name_first``, ``name_last`` and ``race``; ``fips`` / ``zip``
        are read for the ``fl_fips`` / ``fl_zip`` models, and ``frac_white``,
        ``frac_black``, ``frac_asian`` are returned separately when all present.
    sample : int
        Number of training rows to draw (per race when ``strat_sampling`` is set).
    ntest : int
        Size of the test set and of the dev set.
    ngrams : tuple
        ``ngram_range`` passed to ``CountVectorizer``.
    feature_len : int
        ``maxlen`` passed to ``pad_sequences``.
    model : str
        One of ``fl_fips``, ``fl_zip``, ``fl_zipcomp``, ``fl``, ``l``; selects which
        columns are concatenated into the text that gets vectorized.
    strat_sampling : bool
        If true, sample up to ``sample`` rows from each race instead of ``sample``
        rows from the whole training pool.
    random_state : int
        Seed used for every shuffle and sample.

    Returns
    -------
    tuple
        ``(vocab, X_train, X_dev, X_test, y_train, y_dev, y_test, N_test, N_dev, N_train)``.
        The ``N_*`` entries are slices of the three ``frac_*`` columns, or ``None``
        when those columns are not all present. Note the test/dev/train order of
        the ``N_*`` entries.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model not in ('fl_fips', 'fl_zip', 'fl_zipcomp', 'fl', 'l'):
        raise ValueError('Invalid model name. Should take one of the following values: fl_fips, fl_zip, fl_zipcomp, fl, l')

    #Cleaning (on a copy, so repeated calls always start from the same caller frame):
    df = df.dropna(subset=['name_first', 'name_last']).copy()

    #Capitalize the first letters:
    df['name_first'] = df.name_first.str.title()
    df['name_last'] = df.name_last.str.title()

    # Concatenate the data depending on the model:
    if model=='fl_fips':
        df['name_last_name_first'] = df['name_last'] + ' ' + df['name_first']+' '+df['fips']
    elif model=='fl_zip':
        df['name_last_name_first'] = df['name_last'] + ' ' + df['name_first']+' '+df['zip']
    elif model=='l':
        df['name_last_name_first'] = df['name_last']
    else:
        # 'fl' and 'fl_zipcomp' share the same text; the latter only differs in
        # that its caller also consumes the returned N_* arrays.
        df['name_last_name_first'] = df['name_last'] + ' ' + df['name_first']

    #Randomly shuffle the data:
    df=shuffle(df, random_state=random_state)

    #Split the data into test, dev and train sets:
    ndev=2*ntest

    sdf_test=df[:ntest]
    sdf_dev=df[ntest:ndev]
    sdf_train=df[ndev:]

    #Sample train sample:
    if strat_sampling:
        is_white = sdf_train['race']=="nh_white"

        # Every group is capped at its own size so a small group cannot make
        # .sample() fail; all draws use the caller's seed.
        sample_non_white=sdf_train[~is_white].groupby('race', group_keys=False).apply(
            lambda grp: grp.sample(min(len(grp),sample),random_state=random_state))
        white=sdf_train[is_white]
        sample_white=white.sample(n=min(len(white),sample), random_state=random_state)

        sdf_train=pd.concat([sample_non_white,sample_white])

        sdf_train=shuffle(sdf_train, random_state=random_state)

    else:
        sdf_train=sdf_train.sample(n=sample, random_state=random_state)

    #Print the breakdown of number of observations by race in the train sample:
    print(sdf_train.groupby('race').agg({'name_first': 'count'}))

    #Concat test,dev and train samples:
    sdf=pd.concat([sdf_test,sdf_dev,sdf_train])

    #Build n-gram list:
    # NOTE(review): the vocabulary is fitted on test+dev+train together.
    vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=ngrams, lowercase=False)
    a = vect.fit_transform(sdf.name_last_name_first)
    b = vect.inverse_transform(a)
    vocab = vect.vocabulary_

    #Number of words in the vocabulary:
    print("Number of words:",len(vocab))

    #Map the n-grams found in each document to their vocabulary indices:
    # NOTE(review): presumably vocabulary indices start at 0, the same value used
    # for padding below -- confirm whether indices should be shifted by one.
    X=[[vocab[term] for term in doc_terms] for doc_terms in b]

    # Check max/avg feature
    X_len = [len(x) for x in X]

    max_feature_len = max(X_len)
    avg_feature_len = int(np.mean(X_len))

    print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
    y = np.array(sdf.race.astype('category').cat.codes)

    # Split train, dev and test datasets (same row order as the concat above)
    X_test=X[:ntest]
    y_test=y[:ntest]

    X_dev=X[ntest:ndev]
    y_dev=y[ntest:ndev]

    X_train=X[ndev:]
    y_train=y[ndev:]

    print(len(X_train), 'train sequences')
    print(len(X_dev), 'dev sequences')
    print(len(X_test), 'test sequences')

    #Pad sequences with 0s to make sure that the length is the same:
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_dev = sequence.pad_sequences(X_dev, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_dev shape:', X_dev.shape)
    print('X_test shape:', X_test.shape)

    # Count classes over all three splits: a class that is missing from the
    # training sample but present in dev/test would otherwise break to_categorical.
    num_classes = int(np.max(y)) + 1
    print(num_classes, 'classes')

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_dev = keras.utils.to_categorical(y_dev, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_dev shape:', y_dev.shape)
    print('y_test shape:', y_test.shape)
    print('Number of classes:', y_train.shape[1])

    #Getting zipcode composition data for train, test, dev:
    # Default to None so the return below is valid when the columns are absent.
    N_test = N_dev = N_train = None

    if set(["frac_white","frac_black","frac_asian"]).issubset(sdf.columns):

        sub=sdf[["frac_white","frac_black","frac_asian"]]

        N_test=sub[:ntest]

        N_dev=sub[ntest:ndev]

        N_train=sub[ndev:]

    return vocab, X_train, X_dev, X_test, y_train, y_dev, y_test, N_test, N_dev, N_train


def build_model_lstm_zipcode_composition(num_words,num_classes,feature_len,output_length,num_length):
    """Build and compile a two-input classifier: name sequence plus numeric features.

    The sequence input goes through Embedding -> LSTM(128); its output is
    concatenated with the numeric input and passed through Dense(32, relu) and
    a softmax output layer.

    num_words: size of the embedding's input vocabulary.
    num_classes: width of the output layer.
    feature_len: length of each input index sequence.
    output_length: dimensionality of the embedding vectors.
    num_length: number of numeric features in the second input.

    Prints the model summary and returns the compiled model.
    """
    nlp_input = Input(shape=(feature_len,))
    meta_input = Input(shape=(num_length,))
    emb = Embedding(num_words, output_length, input_length=feature_len)(nlp_input)
    nlp_out = LSTM(128)(emb)
    concat = concatenate([nlp_out, meta_input])
    classifier = Dense(32, activation='relu')(concat)
    # softmax (not sigmoid): the labels are one-hot and the loss is
    # categorical_crossentropy, matching the other builders in this notebook.
    output = Dense(num_classes, activation='softmax')(classifier)
    model = Model(inputs=[nlp_input , meta_input], outputs=[output])
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

    print(model.summary())

    return model

def build_model_lstm(num_words,num_classes,feature_len,output_length):
    """Assemble and compile the Embedding -> LSTM(128) -> Dropout(0.2) -> softmax model.

    num_words: size of the embedding's input vocabulary.
    num_classes: width of the softmax output layer.
    feature_len: length of each input index sequence.
    output_length: dimensionality of the embedding vectors.

    Prints the model summary and returns the compiled model.
    """
    print('Build model...')

    layer_stack = [
        Embedding(num_words, output_length, input_length=feature_len),
        LSTM(128),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    print(model.summary())

    return model

def build_model_conv_lstm(num_words,num_classes,feature_len,output_length):
    """Assemble and compile the Embedding -> Conv1D -> MaxPooling1D -> LSTM(100) model.

    num_words: size of the embedding's input vocabulary.
    num_classes: width of the softmax output layer.
    feature_len: length of each input index sequence.
    output_length: dimensionality of the embedding vectors.

    Prints the model summary and returns the compiled model.
    """
    layer_stack = [
        Embedding(num_words, output_length, input_length=feature_len),
        Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"),
        MaxPooling1D(pool_size=2),
        LSTM(100),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    print(model.summary())

    return model


def plot_loss_accuracy(history):
    """Plot per-epoch accuracy and loss from a Keras ``History`` object.

    history: object whose ``.history`` dict maps metric names to per-epoch lists
        (as returned by ``model.fit``).

    Draws two figures (accuracy, then loss). The validation curve is added to a
    figure only when the matching ``val_*`` key is present.
    """
    metrics = history.history

    # list all data in history
    print(metrics.keys())

    # Older Keras releases report the 'accuracy' metric under 'acc'/'val_acc'.
    acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'

    for key, label in ((acc_key, 'accuracy'), ('loss', 'loss')):
        legend = ['train']
        plt.plot(metrics[key])
        if 'val_' + key in metrics:
            plt.plot(metrics['val_' + key])
            legend.append('validation')
        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        plt.legend(legend, loc='upper left')
        plt.show()
    
    
def model_evaluate(model,X,y,target_names=None):
    """Print a classification report and confusion matrix for ``model`` on ``(X, y)``.

    model: fitted Keras model whose ``predict`` returns one score per class.
    X: model input(s), passed to ``model.predict`` unchanged.
    y: one-hot label matrix; the true class is taken as the argmax of each row.
    target_names: optional list of class names for the report. When omitted, the
        names are read from the notebook-level ``df.race`` categories, so ``df``
        must already be loaded.
    """
    if target_names is None:
        target_names = list(df.race.astype('category').cat.categories)

    y_true = np.argmax(y, axis=1)
    # argmax over predict() replaces predict_classes(), which newer Keras
    # releases no longer provide; the unused predict_proba() pass is dropped.
    y_pred = np.argmax(model.predict(X, verbose=2), axis=1)

    print("Model's performance:")
    print(classification_report(y_true, y_pred, target_names=target_names))
    print(confusion_matrix(y_true, y_pred))

In [5]:
#Reading in the data:
# Name, location and race columns are read as text; the zipcode-composition
# fractions are read as floats.
df = pd.read_csv(
    '/Volumes/Samsung_T5/nameracesample.csv',
    dtype=dict(
        name_last=str, name_first=str, fips=str, zip=str, race=str,
        frac_white=float, frac_black=float, frac_asian=float,
    ),
)

Training the baseline model using data on first and last names

In [None]:
#Parameters:
sample = 2000000      # number of training rows to draw
ntest = 50000         # size of the test set (the dev set has the same size)
ngrams = (2, 2)       # range of n-grams
feature_len = 25      # sequence length
mymodel = 'fl'        # text = last name + first name

# The three zipcode-composition outputs are not needed for this model.
(vocab,
 X_train, X_dev, X_test,
 y_train, y_dev, y_test,
 _, _, _) = data_preprocessing(df, sample, ntest, ngrams, feature_len,
                               model=mymodel, random_state=21)

num_classes = y_train.shape[1]
num_words = len(vocab)

In [None]:
keras.backend.clear_session()

# Hyperparameters for the baseline LSTM run
feature_len = 25       # sequence length
output_length = 32     # length of embedding
batch_size = 512
epochs = 7
model_ouput_name = "emb_lstm128_drop02_fl"

model_lstm = build_model_lstm(num_words, num_classes, feature_len, output_length)

history = model_lstm.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# Training curves (accuracy and loss)
plot_loss_accuracy(history)

# Report performance on the dev set first, then on the test set
model_evaluate(model_lstm, X_dev, y_dev)
model_evaluate(model_lstm, X_test, y_test)

model_lstm.save('/Users/nkotova/Documents/CS 230 Deep Learning/project/models/' + model_ouput_name + ".tf")

Training the alternative architecture (CONV+MaxPool+LSTM) using first and last name data.
Performs slightly worse than LSTM, but runs much faster.

In [None]:
keras.backend.clear_session()

# Hyperparameters for the Conv1D + MaxPooling + LSTM run
feature_len = 25       # sequence length
output_length = 32     # embedding
batch_size = 512
epochs = 7
# NOTE(review): the "_l" suffix reads like a last-name-only model, but when the
# cells run in order X_train comes from the mymodel='fl' preprocessing cell --
# confirm the intended file name.
model_name = "emb_conv_maxpool_lstm100_drop02_l"

model_conv_lstm = build_model_conv_lstm(num_words, num_classes, feature_len, output_length)

history_conv_lstm = model_conv_lstm.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

plot_loss_accuracy(history_conv_lstm)

# Dev set first, then test set
model_evaluate(model_conv_lstm, X_dev, y_dev)
model_evaluate(model_conv_lstm, X_test, y_test)

model_conv_lstm.save('/Users/nkotova/Documents/CS 230 Deep Learning/project/models/' + model_name + '.tf')

Sampling a class-balanced training dataset (up to `sample` names per race):

In [None]:
#Parameters:
sample = 500000        # rows to draw per race (capped at the group size)
ntest = 50000          # size of the test set (the dev set has the same size)
ngrams = (2, 2)        # range of n-grams
feature_len = 25       # sequence length
mymodel = 'fl'         # text = last name + first name
strat_sampling = True  # sample each race separately

# The three zipcode-composition outputs are not needed for this model.
(vocab,
 X_train, X_dev, X_test,
 y_train, y_dev, y_test,
 _, _, _) = data_preprocessing(df, sample, ntest, ngrams, feature_len,
                               model=mymodel, strat_sampling=strat_sampling,
                               random_state=21)

num_classes = y_train.shape[1]
num_words = len(vocab)

In [None]:
keras.backend.clear_session()

# Hyperparameters for the LSTM run on the stratified sample
feature_len = 25       # sequence length
output_length = 32     # embedding
batch_size = 512
epochs = 7
# NOTE(review): the "_l" suffix reads like a last-name-only model, but when the
# cells run in order X_train comes from a mymodel='fl' preprocessing cell --
# confirm the intended file name.
model_name = "strat_emb_lstm128_drop02_l"

model_lstm_bal = build_model_lstm(num_words, num_classes, feature_len, output_length)

history_lstm_bal = model_lstm_bal.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

plot_loss_accuracy(history_lstm_bal)

# Dev set first, then test set
model_evaluate(model_lstm_bal, X_dev, y_dev)
model_evaluate(model_lstm_bal, X_test, y_test)

model_lstm_bal.save('/Users/nkotova/Documents/CS 230 Deep Learning/project/models/' + model_name + '.tf')

Sampling unbalanced dataset with zipcode strings:

In [None]:
#Parameters:
sample = 2000000      # number of training rows to draw
ntest = 50000         # size of the test set (the dev set has the same size)
ngrams = (2, 2)       # range of n-grams
feature_len = 25      # sequence length
mymodel = 'fl_zip'    # text = last name + first name + zip string

# The three zipcode-composition outputs are not needed for this model.
(vocab,
 X_train, X_dev, X_test,
 y_train, y_dev, y_test,
 _, _, _) = data_preprocessing(df, sample, ntest, ngrams, feature_len,
                               model=mymodel, random_state=21)

num_classes = y_train.shape[1]
num_words = len(vocab)

Training the baseline LSTM model on last name, first name and zip code string:

In [None]:
keras.backend.clear_session()

# Hyperparameters for the LSTM run on last name + first name + zip string.
# feature_len is reused from the preprocessing cell above.
output_length=32 #Embedding
batch_size=512
epochs=7
model_name="emb_lstm128_drop02_fl_zip"

model_lstm_zip=build_model_lstm(num_words,num_classes,feature_len,output_length)

history_lstm_zip=model_lstm_zip.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_dev, y_dev),verbose=1)

# Plot this model's own history (not `history`, which belongs to an earlier run).
plot_loss_accuracy(history_lstm_zip)

# Dev set first, then test set
model_evaluate(model_lstm_zip,X_dev,y_dev)
model_evaluate(model_lstm_zip,X_test,y_test)

model_lstm_zip.save('/Users/nkotova/Documents/CS 230 Deep Learning/project/models/'+model_name+'.tf')

Sampling data for the baseline LSTM model with additional data on zipcode composition:

In [None]:
#Sampling data for LSTM model with additional data on zipcode composition:

# Functional-API pieces used by the two-input model builder
from keras.models import Model
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate

#Parameters:
sample = 2000000        # number of training rows to draw
ntest = 50000           # size of the test set (the dev set has the same size)
ngrams = (2, 2)         # range of n-grams
feature_len = 25        # sequence length
mymodel = 'fl_zipcomp'  # text = last name + first name; N_* outputs are used too

(vocab,
 X_train, X_dev, X_test,
 y_train, y_dev, y_test,
 N_test, N_dev, N_train) = data_preprocessing(df, sample, ntest, ngrams, feature_len,
                                              model=mymodel, random_state=21)

num_classes = y_train.shape[1]
num_words = len(vocab)

Training LSTM model with additional data on zipcode composition:

In [None]:
keras.backend.clear_session()

# Hyperparameters for the two-input (name sequence + zipcode composition) run.
# feature_len is reused from the preprocessing cell above.
output_length=32 #Embedding
batch_size=512
epochs=7
model_name="emb_lstm128_numdense_fl_zipcomp"
num_length=N_train.shape[1] #Number of numeric zipcode-composition features

# The builder defined in this notebook is build_model_lstm_zipcode_composition.
model_lstm_num=build_model_lstm_zipcode_composition(num_words,num_classes,feature_len,output_length,num_length)

history_lstm_num=model_lstm_num.fit([X_train, N_train], y_train, batch_size=batch_size, epochs=epochs,
          validation_data=([X_dev, N_dev], y_dev),verbose=1)

plot_loss_accuracy(history_lstm_num)

#Evaluating the model:
# model_evaluate is not used here; both inputs are passed to predict() directly.
target_names = list(df.race.astype('category').cat.categories)

for split_name, X_split, N_split, y_split in (("dev", X_dev, N_dev, y_dev),
                                              ("test", X_test, N_test, y_test)):
    y_pred = model_lstm_num.predict([X_split, N_split], verbose=2)
    y_pred=np.argmax(y_pred,axis=1)
    print("Performance on %s set" % split_name)
    print(classification_report(np.argmax(y_split,axis=1), y_pred, target_names=target_names))
    print(confusion_matrix(np.argmax(y_split, axis=1), y_pred))

model_lstm_num.save('models/'+model_name+'.tf')

Sampling data for last name only model:

In [None]:
#Parameters:
sample = 2000000      # number of training rows to draw
ntest = 50000         # size of the test set (the dev set has the same size)
ngrams = (2, 2)       # range of n-grams
feature_len = 25      # sequence length
mymodel = 'l'         # text = last name only

# The three zipcode-composition outputs are not needed for this model.
(vocab,
 X_train, X_dev, X_test,
 y_train, y_dev, y_test,
 _, _, _) = data_preprocessing(df, sample, ntest, ngrams, feature_len,
                               model=mymodel, random_state=21)

num_classes = y_train.shape[1]
num_words = len(vocab)

Training the baseline LSTM model on last names only:

In [None]:
keras.backend.clear_session()

# Hyperparameters for the last-name-only LSTM run.
# feature_len is reused from the preprocessing cell above.
output_length = 32     # embedding
batch_size = 512
epochs = 7
model_name = "emb_lstm128_drop02_l"

model_lstm_ln = build_model_lstm(num_words, num_classes, feature_len, output_length)

history_lstm_ln = model_lstm_ln.fit(
    X_train, y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

plot_loss_accuracy(history_lstm_ln)

# Dev set first, then test set
model_evaluate(model_lstm_ln, X_dev, y_dev)
model_evaluate(model_lstm_ln, X_test, y_test)

model_lstm_ln.save('/Users/nkotova/Documents/CS 230 Deep Learning/project/models/' + model_name + '.tf')