# RNNs with characters

The idea is to one hot encode characters and then create dense embeddings for them based upon some classification problem, such as predicting the next letter or predicting nationality of last name (a common example).

## Support code

In [60]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [61]:
def df_string_to_cat(df:pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders


def df_cat_to_catcode(df):
    for col in df.columns:
        if is_categorical_dtype(df[col]):
            df[col] = df[col].cat.codes + 1

## Classifying the language of the last name

Let's download [training](https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz) and [testing](https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz) data for last names.   This data set is a bunch of last names and the nationality or language. 

Let's start with this known problem and then we can try a language model later.

### Load

In [62]:
df_train = pd.read_csv("data/names_train.csv", header=None)
df_train.columns = ['name','language']
df_test = pd.read_csv("data/names_train.csv", header=None)
df_test.columns = ['name','language']

In [63]:
df_train.shape, df_test.shape

((13374, 2), (13374, 2))

In [64]:
df_train.head(2)

Unnamed: 0,name,language
0,Adsit,Czech
1,Ajdrna,Czech


## Clean

In [65]:
badname = df_train['name']=='To The First Page'
df_train[badname]

Unnamed: 0,name,language
8340,To The First Page,Russian
8341,To The First Page,Russian
8342,To The First Page,Russian
8343,To The First Page,Russian
8344,To The First Page,Russian
8345,To The First Page,Russian
8346,To The First Page,Russian
8347,To The First Page,Russian
8348,To The First Page,Russian
8349,To The First Page,Russian


In [66]:
comma = df_train['name'].str.contains(',') # might as well keep
df_train[comma]

Unnamed: 0,name,language
5976,"Jevolojnov,",Russian
6549,"Lytkin,",Russian


In [67]:
df_train[df_train['name'].str.contains("'")][:3] # there are ok so keep quote

Unnamed: 0,name,language
3609,Awak'Yan,Russian
4454,Dan'Ko,Russian
4471,Dar'Kin,Russian


In [68]:
badname = df_train['name']=='To The First Page'
df_train = df_train[~badname]

badname = df_test['name']=='To The First Page'
df_test = df_test[~badname]

### Split out validation set

In [69]:
X, y = df_train[['name']], df_train['language']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)
X_test, y_test = df_test[['name']], df_test['language']

### Get vocab, set up one-hots

In [70]:
letters = [list(l.lower()) for l in X['name']]

In [71]:
V = set([c for cl in letters for c in cl])
V = sorted(list(V))
ctoi = {c:i for i, c in enumerate(V)}
ctoi

{' ': 0,
 "'": 1,
 ',': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [72]:
y_train = y_train.astype('category').cat.as_ordered()
y_cats = y_train.cat.categories
y_cats

Index(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish',
       'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'],
      dtype='object')

In [73]:
y_train = y_train.cat.codes+1 # leave 0 for "unknown" in validation / test
y_train.values

array([ 1,  3,  5, ..., 15, 15,  5], dtype=int8)

In [74]:
y_valid = pd.Categorical(y_valid, categories=y_cats, ordered=True)
y_test = pd.Categorical(y_test, categories=y_cats, ordered=True)

In [78]:
y_valid, y_test

([Arabic, English, Arabic, Russian, English, ..., Russian, Russian, Greek, Dutch, Arabic]
 Length: 2672
 Categories (18, object): [Arabic < Chinese < Czech < Dutch ... Russian < Scottish < Spanish < Vietnamese],
 [Czech, Czech, Czech, Czech, Czech, ..., Polish, Polish, Polish, Polish, Polish]
 Length: 13358
 Categories (18, object): [Arabic < Chinese < Czech < Dutch ... Russian < Scottish < Spanish < Vietnamese])