In [None]:
# Mount Google Drive when running in Google Colab.
# The import is guarded so the notebook also runs top-to-bottom outside Colab
# without having to comment this cell out by hand.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable here, so there is no Drive to mount.
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Install requirements after mounting drive, then comment out, restart runtime and run all (use GPU for quicker inference)
%%capture
%cd /content/drive/My Drive/its_all_in_the_name_light_repo/code
!pip install -r requirements.txt

# Imports

In [None]:
import os
import pandas as pd
from unidecode import unidecode
from multiprocessing import Pool
import pickle
import warnings
import numpy as np

from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

warnings.filterwarnings("ignore")

# Constants

In [None]:
# Project location. If running in Google Colab:
base_dir = "/content/drive/My Drive/its_all_in_the_name_light_repo"

# base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) # comment if running on colab

# Directory layout below base_dir. The trailing slashes are kept on purpose:
# later cells build file paths by plain string concatenation (e.g. data_dir + dataname).
data_dir = base_dir+"/data/"
model_dir = base_dir+"/models/"
output_dir = base_dir+"/data/predictions/"

# Create any missing directory. makedirs(exist_ok=True) also creates missing
# parents and does not fail if the directory already exists.
for _directory in (base_dir, data_dir, model_dir, output_dir):
    os.makedirs(_directory, exist_ok=True)

# Name of data file
dataname = "sample_data_race.csv"

# Check Data

In [None]:
# Quick look at the raw input: print the row count and the first rows,
# then drop the frame again so it does not linger in the kernel namespace.
df = pd.read_csv(f"{data_dir}{dataname}", encoding="utf-8")
print(df.shape[0])
print(df.head())
del df

50
                  fullname    pz_whi    pz_bla    pz_his    pz_asi    pz_oth
0     JEFFREY TODD BUCKNER  0.847145  0.046353  0.072047  0.007551  0.026903
1       CAROL WILD BUCKNER  0.780187  0.127414  0.038097  0.028054  0.026247
2       JILL WINTERS HAGER  0.741789  0.132648  0.085556  0.022901  0.017106
3        NANCY WAGNER PIKE  0.388778  0.367163  0.161937  0.042781  0.039341
4  ROBERT GREGORY COCKROFT  0.525449  0.067754  0.047286  0.324763  0.034748


In [None]:
# Set constants and variable names

# Column of the input file that holds the name string.
colname = "fullname"

# Names of the columns containing proportions of respective groups (modify if required).
pz_whi = "pz_whi"
pz_bla = "pz_bla"
pz_his = "pz_his"
pz_asi = "pz_asi"
pz_oth = "pz_oth"

# Set to True for using gis with ethnicity models.
usegis = False

# The "meta" model file is selected when usegis is set, the "text" one otherwise.
modelfile = "cnn_USA_" + ("meta" if usegis else "text") + ".h5"

In [None]:
def normalize(word):
  """Return ``word`` after passing it through ``unidecode``.

  Kept as a module-level function so it can be handed to ``Pool.map``.
  """
  transliterated = unidecode(word)
  return transliterated

def clean_data():
  """Clean the name column of the input CSV and write the result to ``output_dir``.

  Reads ``data_dir + dataname`` and applies, to the ``colname`` column only:

  1. missing values are replaced by empty strings,
  2. every value is passed through ``normalize`` (in 4 worker processes),
  3. runs of two or more digits are removed,
  4. every character that is not an ASCII letter becomes a space,
  5. the text is upper-cased,
  6. runs of whitespace are collapsed to a single space.

  The whole frame (all other columns untouched) is written to
  ``output_dir + dataname``. Returns nothing.
  """
  df = pd.read_csv(data_dir+dataname, encoding='utf-8')
  df[colname] = df[colname].fillna('')

  # Normalize. The context manager releases the worker processes even when
  # normalize raises part-way through the map.
  with Pool(4) as pool:
    df[colname] = pool.map(normalize, df[colname])

  # regex=True is passed explicitly on every step: since pandas 2.0 the default
  # for Series.str.replace is a literal match, which would silently turn the
  # patterns below into no-ops. The result is assigned back to the column
  # instead of using a chained inplace replace, which does not update the
  # frame under copy-on-write.
  df[colname] = (
    df[colname]
    .str.replace(r"[0-9]{2,}", "", regex=True)
    # Letters only; the original class "[^a-z^A-Z]" also kept literal '^' characters.
    .str.replace(r"[^a-zA-Z]", " ", regex=True)
    .str.upper()
    .str.replace(r"\s+", " ", regex=True)
  )

  df.to_csv(output_dir+dataname, encoding='utf-8', index=False)

def load_data():
    """Load the cleaned CSV written by ``clean_data`` and return the model inputs.

    Returns:
        When ``usegis`` is false: the ``colname`` column as a Series.
        When ``usegis`` is true: a 6-tuple of the ``colname`` column followed by
        the ``pz_whi``, ``pz_bla``, ``pz_his``, ``pz_asi`` and ``pz_oth`` columns,
        each with missing values filled by a default probability.
    """
    cleaned = pd.read_csv(output_dir+dataname)
    names = cleaned[colname]

    if not usegis:
        return names

    # Default probability for missing values, the same for all five groups
    # (modify based on proportions in state, if required).
    fallback_prob = 1/5
    group_columns = (pz_whi, pz_bla, pz_his, pz_asi, pz_oth)
    priors = tuple(cleaned[column].fillna(fallback_prob) for column in group_columns)
    return (names, *priors)

# Write the cleaned copy of the input file (needs to run only once),
# then load the model inputs from that cleaned copy.
clean_data()

if not usegis:
  # Text-only model: just the name column.
  x_test = load_data()
else:
  # Meta model: the name column plus the five group-proportion columns.
  (x_test,
   pz_whi_test,
   pz_bla_test,
   pz_his_test,
   pz_asi_test,
   pz_oth_test) = load_data()

In [None]:
# Load the trained network selected by `modelfile`.
model = load_model(model_dir + modelfile)

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these artifacts when they come from a trusted source.
with open(model_dir+"nc_voter_encoding.pkl", "rb") as f:
  encoder = pickle.load(f)

with open(model_dir+"nc_voter_tokenizer.pkl", "rb") as f:
  (vocab_size, tokenizer, max_char)=pickle.load(f)

classes = 5

# Column labels for the per-class probability columns of the output file.
# NOTE(review): presumably this must match the class order of `encoder` — confirm.
tagset = ['A', 'B', 'H', 'O', 'W']

# Turn the cleaned names into integer sequences, padded/truncated at the end to max_char.
x = tokenizer.texts_to_sequences(x_test)
x = pad_sequences(x, maxlen=max_char, padding='post', truncating='post')

if usegis:
  # The meta model takes the group proportions as additional inputs.
  x = [x, pz_whi_test, pz_bla_test, pz_his_test, pz_asi_test, pz_oth_test]

# `use_multiprocessing` / `workers` are not passed: they only apply to
# generator / Sequence input and were removed from Model.predict in Keras 3,
# where passing them raises a TypeError.
y_pred_prob = model.predict(x, batch_size=1024, verbose=1)
y_pred = pd.Series(encoder.inverse_transform(y_pred_prob))

# Original (uncleaned) input columns + predicted label + one probability column per tag.
df = pd.concat(
  [
    pd.read_csv(data_dir+dataname, encoding='utf-8'),
    pd.DataFrame({"predicted": y_pred}),
    pd.DataFrame(y_pred_prob, columns=tagset),
  ],
  axis=1,
)

# Overwrites the cleaned file written by clean_data() with the final predictions.
df.to_csv(output_dir + dataname, index=False)

