## Problem Statement 1

### Simple RNN Model

In [1]:
import pandas as pd
import numpy as np
import string

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [3]:
#data
data = pd.read_csv('/content/name_gender.csv')

In [4]:
data.head(5)

Unnamed: 0,name,gender,probability
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [11]:
#remove non-ASCII characters
data['name'] = data['name'].apply(lambda x: ''.join([i for i in x if i in string.ascii_letters]))

#convert each name into a sequence of one-hot encoded characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['name'])

max_len = max([len(i) for i in data['name']])
vocab_size = len(tokenizer.word_index)

X = tokenizer.texts_to_sequences(data['name'])
X = pad_sequences(X, maxlen=max_len, padding='post')

#create one-hot encoded representation
X_one_hot= np.array([np.eye(vocab_size+1)[i] for i in X])

#convert the labels to binary values
y = pd.get_dummies(data['gender'])['F']
y = np.array(y)

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split

#split the data into 80% train and 20% test
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42)

#define the subset sizes
subset_sizes = [0.25, 0.5, 0.75, 1.0]

In [13]:
#build the model with Simple RNN layer
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1, output_dim=32, input_length=max_len))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

#compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
#loop over the subset sizes
for subset_size in subset_sizes:
    #calculate the number of samples for the subset
    num_samples = int(subset_size * len(train_data))

    #randomly select the subset
    subset_indices = np.random.choice(len(train_data), size=num_samples, replace=False)
    subset_data = train_data[subset_indices]
    subset_labels = train_labels[subset_indices]

    #build the model
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size+1, output_dim=32, input_length=max_len))
    model.add(SimpleRNN(32))
    model.add(Dense(1, activation='sigmoid'))

    #compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    #train a model on the subset
    model.fit(subset_data, subset_labels, epochs=10, batch_size=32, verbose=0)

    #evaluate the model on the test data
    loss, accuracy = model.evaluate(test_data, test_labels, verbose=0)

    #print the results
    print(f"Subset size:' {subset_size}%, 'Simple RNN Test accuracy:' {accuracy:.3f}")

Subset size:' 0.25%, 'Simple RNN Test accuracy:' 0.845
Subset size:' 0.5%, 'Simple RNN Test accuracy:' 0.848
Subset size:' 0.75%, 'Simple RNN Test accuracy:' 0.861
Subset size:' 1.0%, 'Simple RNN Test accuracy:' 0.857


#### LSTM Model

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding

#define the LSTM model
def build_lstm_model():
    model = Sequential([
        Embedding(vocab_size+1, output_dim=32, input_length=max_len),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [18]:
#loop over the subset sizes
for subset_size in subset_sizes:
    #calculate the number of samples for the subset
    num_samples = int(subset_size * len(train_data))

    #randomly select the subset
    subset_indices = np.random.choice(len(train_data), size=num_samples, replace=False)
    subset_data = train_data[subset_indices]
    subset_labels = train_labels[subset_indices]

    #train the LSTM model on the full dataset
    lstm_model = build_lstm_model()
    lstm_model.fit(subset_data, subset_labels, epochs=10, batch_size=32, validation_split=0.2, verbose=0)


    #evaluate the model on the test data
    loss, accuracy = lstm_model.evaluate(test_data, test_labels, verbose=0)

    #print the results
    print(f"Subset size: {subset_size}%, LSTM Test accuracy: {accuracy:.3f}")

Subset size: 0.25%, LSTM Test accuracy: 0.839
Subset size: 0.5%, LSTM Test accuracy: 0.851
Subset size: 0.75%, LSTM Test accuracy: 0.864
Subset size: 1.0%, LSTM Test accuracy: 0.874


### GRU Model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Embedding

#define the GRU model
def build_gru_model():
    model = Sequential([
        Embedding(vocab_size+1, output_dim=32 , input_length=max_len),
        GRU(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [21]:
#loop over the subset sizes
for subset_size in subset_sizes:
    #calculate the number of samples for the subset
    num_samples = int(subset_size * len(train_data))

    #randomly select the subset
    subset_indices = np.random.choice(len(train_data), size=num_samples, replace=False)
    subset_data = train_data[subset_indices]
    subset_labels = train_labels[subset_indices]

    #train the GRU model on the full dataset
    gru_model = build_gru_model()
    gru_model.fit(subset_data, subset_labels, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    #evaluate the GRU model on the test data
    loss, accuracy = gru_model.evaluate(X_test, y_test, verbose=0)

    #print the results
    print(f"Subset size: {subset_size}%, GRU Test accuracy: {accuracy:0.3f}")

Subset size: 0.25%, GRU Test accuracy: 0.839
Subset size: 0.5%, GRU Test accuracy: 0.851
Subset size: 0.75%, GRU Test accuracy: 0.862
Subset size: 1.0%, GRU Test accuracy: 0.873


## Problem Statement 2

In [87]:
#loading dependencies
import pandas as pd
import numpy as np
import string
import string
from string import digits
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from keras.layers import TimeDistributed
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop

In [88]:
#data
names=pd.read_csv('/content/name_gender.csv')

In [89]:
male_names = names.loc[names['gender'] == 'M', 'name']
female_names = names.loc[names['gender'] == 'F', 'name']

In [90]:
male_names = male_names.apply(lambda x: x.lower())
female_names = female_names.apply(lambda x: x.lower())

In [91]:
male_names=pd.DataFrame({'name':male_names.unique()})
female_names=pd.DataFrame({'name':female_names.unique()})

#### For Male Names

In [93]:
print(len(male_names))

34722


In [94]:
#to delay target by one timestamp
male_names['name']=male_names.name.apply(lambda x:'\t'+x)

In [95]:
#to indicate end of the word
male_names['target']=male_names.name.apply(lambda x:x[1:len(x)]+'\n')

In [96]:
#creating a list
lenght_list_male=[]
for l in male_names.name:
    lenght_list_male.append(len(l))
max_len = np.max(lenght_list_male)

#the vocab dict
all_chars_male=set()
for name in male_names.name:
    for c in name:
        if c not in all_chars_male:
            all_chars_male.add(c)
all_chars_male.add('\n')

char_to_ix = { ch:i for i,ch in enumerate(sorted(all_chars_male)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(all_chars_male)) }

In [97]:
input_data = np.zeros((len(male_names.name), max_len, 28),dtype='float32')
output_data = np.zeros((len(male_names.name), max_len, 28),dtype='float32')

#generate input and output data
for i, x in enumerate(male_names.name):
    for t, ch in enumerate(x):
        input_data[i, t, char_to_ix[ch]] = 1
for i, x in enumerate(male_names.target):
    for t, ch in enumerate(x):
        output_data[i,t, char_to_ix[ch]] = 1

In [98]:
#model for generating new names
model = Sequential()
model.add(LSTM(50, input_shape=(max_len, len(all_chars_male)), return_sequences=True))
model.add(TimeDistributed(Dense(len(all_chars_male))))
model.add(TimeDistributed(Activation('softmax')))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

  super().__init__(name, **kwargs)


In [99]:
#training the model
model.fit(input_data, output_data, batch_size=32,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fbb83c13d00>

Generating

In [100]:
#initialize an empty list to store generated male names
male_names = []

#generate 100 male names
for i in range(100):
    stop=False
    ch='\t'
    counter=1
    
    #initialize the target sequence with a tab character
    target_seq = np.zeros((1, max_len, 28))
    target_seq[0, 0, char_to_ix[ch]] = 1.
    
    #generate the name one character at a time until a newline character is encountered or the name exceeds 10 characters
    while stop == False and counter < 10:
        
        #use the model to predict the probabilities of the next character
        probs = model.predict(target_seq, verbose=0)[:,counter-1,:]
        
        #sample the next character based on the predicted probabilities
        c= np.random.choice(sorted(list(all_chars_male)), replace =False,p=probs.reshape(28))
        
        #newline character is generated, stop generating the name
        if c =='\n':
            stop=True
        else:
            #append the generated character to the name
            ch=ch+c
            
            #update the target sequence with the generated character
            target_seq[0,counter , char_to_ix[c]] = 1.
            
            #increment the counter to move to the next character
            counter=counter+1
    
    #append the generated name to the list of male names
    male_names.append(ch)


In [101]:
new_male_names = []
for i in male_names:
  k = i.replace('\t','')
  new_male_names.append(k)

In [102]:
generated_male_names = [name.capitalize() for name in new_male_names]
print(len(generated_male_names))
print(generated_male_names)

100
['Jayton', 'Ladarrius', 'Luispatro', 'Moeeur', 'Hiji', 'Bighdon', 'Zahiar', 'Wathael', 'Carno', 'Dryone', 'Anderson', 'Juzaiah', 'Singile', 'Sumaurica', 'Maxamimin', 'Ashad', 'Jourdan', 'Crisa', 'Indabi', 'Shalan', 'Taqoard', 'Reymando', 'Alfier', 'Sadmon', 'Eliamalea', 'Wiianlaan', 'Tarion', 'Isaahirie', 'Nicoro', 'Garerios', 'Jaquens', 'Antonioh', 'Sajuan', 'Hoksen', 'Malhian', 'Rippen', 'Artayiq', 'Edley', 'Velddrick', 'Muller', 'Gerron', 'Kristhiha', 'Colley', 'Jatavion', 'Nicolos', 'Tyhir', 'Keimari', 'Jaysen', 'Osben', 'Dieonelo', 'Irick', 'Broynar', 'Marvion', 'Shiger', 'Macaley', 'Crosken', 'Tibo', 'Quintta', 'Jeonte', 'Laymander', 'Aironfoin', 'Ruyina', 'Quamasle', 'Dawarde', 'Michuaz', 'Robig', 'Burvee', 'Babtham', 'Safr', 'Fardyn', 'Gilan', 'Kamoud', 'Delnas', 'Berthimy', 'Jacarion', 'Ahmaa', 'Demondr', 'Lakardo', 'Alosy', 'Tymon', 'Riz', 'Hamet', 'Gandarius', 'Abdihanis', 'Jakiyon', 'Muntio', 'Yuner', 'Khadeal', 'Rudaun', 'Zayden', 'Carstin', 'Habdeefor', 'Herucz', 'Fer

Testing

In [103]:
#converting into dataframe
df_male = pd.DataFrame(generated_male_names, columns = ['name'])

In [104]:
df_male['gender'] = np.repeat('M', 100)

In [105]:
df_male.head()

Unnamed: 0,name,gender
0,Jayton,M
1,Ladarrius,M
2,Luispatro,M
3,Moeeur,M
4,Hiji,M


In [132]:
#remove non-ASCII characters
df_male['name'] = df_male['name'].apply(lambda x: ''.join([i for i in x if i in string.ascii_letters]))

#convert each name into a sequence of one-hot encoded characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df_male['name'])

max_len = max([len(i) for i in df_male['name']])
vocab_size = len(tokenizer.word_index)

X = tokenizer.texts_to_sequences(df_male['name'])
X = pad_sequences(X, maxlen=max_len, padding='post')

#create one-hot encoded representation
X_one_hot= np.array([np.eye(vocab_size+1)[i] for i in X])

#convert the labels to binary values
y = pd.get_dummies(df_male['gender'])['M']
y = np.array(y)

The best performing model among Simple RNN, LSTM and GRU is LSTM.

In [133]:
#evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X, y, verbose=0)

#print the results
print(f"Male Accuracy using LSTM: {accuracy:.3f}")

Male Accuracy using LSTM: 0.320


#### For Females Names

In [109]:
print(len(female_names))

60304


In [111]:
#to delay target by one timestamp
female_names['name']=female_names.name.apply(lambda x:'\t'+x)

In [112]:
#to indicate end of the word
female_names['target']=female_names.name.apply(lambda x:x[1:len(x)]+'\n')

In [113]:
#creating a list
lenght_list_female=[]
for l in female_names.name:
    lenght_list_female.append(len(l))
max_len = np.max(lenght_list_female)

#the vocab dict
all_chars_female=set()
for name in female_names.name:
    for c in name:
        if c not in all_chars_female:
            all_chars_female.add(c)
all_chars_female.add('\n')

char_to_ix = { ch:i for i,ch in enumerate(sorted(all_chars_female)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(all_chars_female)) }

In [114]:
input_data = np.zeros((len(female_names.name), max_len, 28),dtype='float32')
output_data = np.zeros((len(female_names.name), max_len, 28),dtype='float32')

#generate input and output data
for i, x in enumerate(female_names.name):
    for t, ch in enumerate(x):
        input_data[i, t, char_to_ix[ch]] = 1
for i, x in enumerate(female_names.target):
    for t, ch in enumerate(x):
        output_data[i,t, char_to_ix[ch]] = 1

In [115]:
#model for generating new names
model = Sequential()
model.add(LSTM(50, input_shape=(max_len, len(all_chars_male)), return_sequences=True))
model.add(TimeDistributed(Dense(len(all_chars_male))))
model.add(TimeDistributed(Activation('softmax')))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

  super().__init__(name, **kwargs)


In [116]:
#training the model
model.fit(input_data, output_data, batch_size=32,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fbb81b31400>

Generating

In [117]:
#initialize an empty list to store generated male names
female_names = []

#generate 100 male names
for i in range(100):
    stop=False
    ch='\t'
    counter=1
    
    #initialize the target sequence with a tab character
    target_seq = np.zeros((1, max_len, 28))
    target_seq[0, 0, char_to_ix[ch]] = 1.
    
    #generate the name one character at a time until a newline character is encountered or the name exceeds 10 characters
    while stop == False and counter < 10:
        
        #use the model to predict the probabilities of the next character
        probs = model.predict(target_seq, verbose=0)[:,counter-1,:]
        
        #sample the next character based on the predicted probabilities
        c= np.random.choice(sorted(list(all_chars_female)), replace =False,p=probs.reshape(28))
        
        #newline character is generated, stop generating the name
        if c =='\n':
            stop=True
        else:
            #append the generated character to the name
            ch=ch+c
            
            #update the target sequence with the generated character
            target_seq[0,counter , char_to_ix[c]] = 1.
            
            #increment the counter to move to the next character
            counter=counter+1
    
    #append the generated name to the list of male names
    female_names.append(ch)

In [118]:
new_female_names = []
for i in female_names:
  k = i.replace('\t','')
  new_female_names.append(k)

In [119]:
generated_female_names = [name.capitalize() for name in new_female_names]
print(len(generated_female_names))
print(generated_female_names)

100
['Chonnke', 'Elonah', 'Tonnie', 'Chauntal', 'Shuntell', 'Giloda', 'Cathelyn', 'Kimod', 'Favah', 'Lizel', 'Mekrisle', 'Chrissa', 'Anyiah', 'Iiba', 'Winetta', 'Sharmyle', 'Ubiell', 'Tika', 'Aleiza', 'Louhela', 'Maiklen', 'Gwineen', 'Feydynn', 'Racqelia', 'Zayma', 'Arnely', 'Kanquett', 'Dawnie', 'Daleyeah', 'Biquaina', 'Sharona', 'Lakitsra', 'Bryttane', 'Genniell', 'Jella', 'Kasiyah', 'Yuquita', 'Jasena', 'Tehrei', 'Keirrah', 'Jewellyn', 'Kallee', 'Ferled', 'Noel', 'Henayle', 'Karmshe', 'Tiffana', 'Breena', 'Laneta', 'Marchett', 'Yestafin', 'Malberan', 'Arnitza', 'Latrice', 'Chaitaly', 'Salena', 'Beorgina', 'Yeanna', 'Jachysel', 'Aliyana', 'Malynett', 'Mellore', 'Dely', 'Avinelle', 'Daiylee', 'Yaleber', 'Toshy', 'Kaylanna', 'Posiry', 'Karlkrin', 'Patziah', 'Bithlin', 'Kessija', 'Felicia', 'Jaselli', 'Cocelyn', 'Shyber', 'Samaha', 'Christy', 'Threes', 'Cathelen', 'Willow', 'Joi', 'Synessa', 'Anajuh', 'Measia', 'Taytina', 'Irzetta', 'Madyli', 'Hemo', 'Femeri', 'Lachinle', 'Nikolyn', 'Eb

Testing 

In [120]:
#converting into dataframe
df_female = pd.DataFrame(generated_female_names, columns = ['name'])

In [121]:
df_female['gender'] = np.repeat('F', 100)

In [122]:
df_female.head()

Unnamed: 0,name,gender
0,Chonnke,F
1,Elonah,F
2,Tonnie,F
3,Chauntal,F
4,Shuntell,F


In [138]:
#remove non-ASCII characters
df_female['name'] = df_female['name'].apply(lambda x: ''.join([i for i in x if i in string.ascii_letters]))

#convert each name into a sequence of one-hot encoded characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df_female['name'])

max_len = max([len(i) for i in df_female['name']])
vocab_size = len(tokenizer.word_index)

X = tokenizer.texts_to_sequences(df_female['name'])
X = pad_sequences(X, maxlen=max_len, padding='post')

#create one-hot encoded representation
X_one_hot= np.array([np.eye(vocab_size+1)[i] for i in X])

#convert the labels to binary values
y = pd.get_dummies(df_female['gender'])['F']
y = np.array(y)

In [139]:
#evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X, y, verbose=0)

#print the results
print(f"Female Accuracy using LSTM: {accuracy:.3f}")

Female Accuracy using LSTM: 0.510


Measuring Combined Accuracy

In [134]:
frames = [df_male, df_female]

df_combined = pd.concat(frames)

In [135]:
print(len(df_combined))

200


In [136]:
#remove non-ASCII characters
df_combined['name'] = df_combined['name'].apply(lambda x: ''.join([i for i in x if i in string.ascii_letters]))

#convert each name into a sequence of one-hot encoded characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df_combined['name'])

max_len = max([len(i) for i in df_combined['name']])
vocab_size = len(tokenizer.word_index)

X = tokenizer.texts_to_sequences(df_combined['name'])
X = pad_sequences(X, maxlen=max_len, padding='post')

#create one-hot encoded representation
X_one_hot= np.array([np.eye(vocab_size+1)[i] for i in X])

#convert the labels to binary values
y = pd.get_dummies(df_combined['gender'])['F']
y = np.array(y)

In [137]:
#evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X, y, verbose=0)

#print the results
print(f"Combined Accuracy using LSTM: {accuracy:.3f}")

Combined Accuracy using LSTM: 0.580


The combined accuracy of 58% states the model performed decently well in generating new male and female names.

### Problem Statement 2a

In [211]:
import pandas as pd

#load the dataset into a pandas dataframe
df_ame = pd.read_csv('/content/name_gender.csv')

#filter the dataframe to include only names that start with A, M, or Z
df_ame = df_ame[df_ame['name'].str.startswith(('A', 'M', 'Z'))]

#display the filtered dataframe
print(len(df_ame))

19080


In [212]:
df_ame.head()

Unnamed: 0,name,gender,probability
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [213]:
df_ame_names = df_ame['name']

In [214]:
df_ame_names = df_ame_names.apply(lambda x: x.lower())

In [215]:
df_ame_names=pd.DataFrame({'name':df_ame_names.unique()})

In [216]:
#to delay target by one timestamp
df_ame_names['name']=df_ame_names.name.apply(lambda x:'\t'+x)

#to indicate end of the word
df_ame_names['target']=df_ame_names.name.apply(lambda x:x[1:len(x)]+'\n')

In [260]:
#creating a list
lenght_list_ame=[]
for l in df_ame_names.name:
    lenght_list_ame.append(len(l))
max_len = np.max(lenght_list_ame)

#the vocab dict
all_chars_ame=set()
for name in df_ame_names.name:
    for c in name:
        if c not in all_chars_ame:
            all_chars_ame.add(c)
all_chars_ame.add('\n')

char_to_ix = { ch:i for i,ch in enumerate(sorted(all_chars_ame)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(all_chars_ame)) }

In [218]:
input_data = np.zeros((len(df_ame_names.name), max_len, 28),dtype='float32')
output_data = np.zeros((len(df_ame_names.name), max_len, 28),dtype='float32')

#generate input and output data
for i, x in enumerate(df_ame_names.name):
    for t, ch in enumerate(x):
        input_data[i, t, char_to_ix[ch]] = 1
for i, x in enumerate(df_ame_names.target):
    for t, ch in enumerate(x):
        output_data[i,t, char_to_ix[ch]] = 1

In [219]:
#model for generating new names
model_lstm = Sequential()
model_lstm.add(LSTM(50, input_shape=(max_len, len(all_chars_male)), return_sequences=True))
model_lstm.add(TimeDistributed(Dense(len(all_chars_male))))
model_lstm.add(TimeDistributed(Activation('softmax')))
optimizer = RMSprop(lr=0.01)
model_lstm.compile(loss='categorical_crossentropy', optimizer=optimizer)

  super().__init__(name, **kwargs)


In [220]:
#training the model
model_lstm.fit(input_data, output_data, batch_size=32,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fbb7de4c280>

Generating

In [237]:
#initialize an empty list to store generated male names
ame_names = []

#generate 100 male names
for i in range(50):
    stop=False
    ch='\t'
    counter=1
    
    #initialize the target sequence with a tab character
    target_seq = np.zeros((1, max_len, 28))
    target_seq[0, 0, char_to_ix[ch]] = 1.
    
    #generate the name one character at a time until a newline character is encountered or the name exceeds 10 characters
    while stop == False and counter < 16:
        
        #use the model to predict the probabilities of the next character
        probs = model.predict(target_seq, verbose=0)[:,counter-1,:]
        
        #sample the next character based on the predicted probabilities
        c= np.random.choice(sorted(list(all_chars_female)), replace =False,p=probs.reshape(28))
        
        #newline character is generated, stop generating the name
        if c =='\n':
            stop=True
        else:
            #append the generated character to the name
            ch=ch+c
            
            #update the target sequence with the generated character
            target_seq[0,counter , char_to_ix[c]] = 1.
            
            #increment the counter to move to the next character
            counter=counter+1
    
    #append the generated name to the list of male names
    ame_names.append(ch)

In [238]:
new_ame_names = []
for i in ame_names:
  k = i.replace('\t','')
  new_ame_names.append(k)

In [239]:
generated_ame_names = [name.capitalize() for name in new_ame_names]
print(len(generated_ame_names))
print(generated_ame_names)

50
['Asadelle', 'Alfreedaz', 'Melissia', 'Arron', 'Angelieja', 'Mitsurd', 'Anallys', 'Melieka', 'Mckiul', 'Zekiyah', 'Magrabella', 'Almilli', 'Menune', 'Amareya', 'Aleivah', 'Metiangel', 'Adhrin', 'Abbe', 'Alyciana', 'Angelise', 'Alan', 'Melahmat', 'Annalise', 'Mirial', 'Ahbree', 'Maudina', 'Aneanton', 'Munashak', 'Alaynna', 'Art', 'Anneeka', 'Alayziah', 'Moess', 'Allie', 'Zyaira', 'Zyane', 'Anfruna', 'Zaavin', 'Mebosa', 'Antoniebardhan', 'Mjio', 'Ametalen', 'Zinakiah', 'Moika', 'Marinell', 'Araston', 'Meynah', 'Avelleah', 'Azalee', 'Zenackahide']


Perplexity

In [252]:
import numpy as np

def cal_perplexity(model, max_len, char_to_ix, names):
  
    total_loss = 0.0
    total_chars = 0
    
    for name in names:
        #initialize the target sequence with a tab character
        target_seq = np.zeros((1, max_len, len(char_to_ix)))
        target_seq[0, 0, char_to_ix['\t']] = 1.

        #generate the name one character at a time until a newline character is encountered or the name exceeds max_len
        for t in range(1, max_len):
            #use the model to predict the probabilities of the next character
            probs = model.predict(target_seq, verbose=0)[:, t-1, :]
            #get the index of the true next character
            true_idx = char_to_ix[name[t-1]]
            #get the probability of the true next character
            true_prob = probs[0, true_idx]
            #update the total loss with the negative log probability of the true next character
            total_loss -= np.log(true_prob)
            #increment the total number of characters seen
            total_chars += 1
            #update the target sequence with the true next character
            target_seq[0, t, true_idx] = 1.

            #if newline character is generated, stop generating the name
            if name[t-1] == '\n':
                break
        
    #calculate the perplexity as the exponential of the average cross-entropy loss per character
    perplexity = np.exp(total_loss / total_chars)
    
    return perplexity

In [255]:
max_len_list=[]
for i in ame_names:
    max_len_list.append(len(i))
max_len_gen = np.max(max_len_list)

In [268]:
# Calculate the average perplexity
perplexity = cal_perplexity(model_lstm, max_len, char_to_ix, ame_names)

print("Perplexity:", perplexity)

Perplexity: 510.26


Therefore the average perplexity value of generated names is 510.26