# Kamufalu 2.0 🏘️🪵🐖

### Importing the Corpus

In [None]:
# Read the corpus: one city name per line, surrounding whitespace removed.
with open('corpus.txt', 'r') as file:
    city_names = [line.strip() for line in file]

# Per-name lengths drive both statistics below.
name_lengths = [len(name) for name in city_names]

# Length of the longest name (0 when the corpus is empty).
longest = max(name_lengths, default=0)

# Mean name length over the whole corpus.
# The original loop added len(city_names[1]) on every pass, so it reported the
# length of the second name instead of the average; it also rebound the builtin
# `sum` to an int. An empty corpus yields 0.0 instead of a ZeroDivisionError.
average_length = sum(name_lengths) / len(name_lengths) if name_lengths else 0.0

print(city_names,"\nLongest:",longest,"\nAverage",average_length)

### Pre-Processing

In [None]:
# Step 1: Normalise every city name to lowercase
city_names = [name.lower() for name in city_names]

# Step 2: Slide a fixed-width window over each name to build
# (input window, next character) training pairs
max_sequence_length = 10  # Width of every input window, in characters

input_sequences = []
output_sequences = []

for name in city_names:
    # Disabled option: pad the name with a space on both sides
    #name = ' ' + name + ' '

    # Fixed-length windows. Names with at most max_sequence_length characters
    # give an empty range and therefore contribute no pairs.
    for start in range(len(name) - max_sequence_length):
        end = start + max_sequence_length
        input_sequences.append(name[start:end])
        output_sequences.append(name[end])

    # Disabled option: variable-length prefixes
    #for i in range(len(name) - 1):
    #    input_sequences.append(name[:i+1])
    #    output_sequences.append(name[i+1])

# Debug aid: print the input-output pairs
#for i in range(len(input_sequences)):
#    print('Input:', input_sequences[i], 'Output:', output_sequences[i])


### One-Hot Encoding

In [None]:
import numpy as np

# Step 3: Vectorization
# Vocabulary: every distinct character of the corpus, in sorted order,
# plus the reverse lookup from character to its position.
characters = sorted(set(''.join(city_names)))
char_to_index = dict(zip(characters, range(len(characters))))

num_chars = len(characters)

# One-hot inputs: one (max_sequence_length, num_chars) matrix per input window
X = np.zeros((len(input_sequences), max_sequence_length, num_chars), dtype=np.float32)
for sample, window in enumerate(input_sequences):
    for step, symbol in enumerate(window):
        X[sample, step, char_to_index[symbol]] = 1.0

# One-hot targets: one num_chars-long row per expected next character
y = np.zeros((len(output_sequences), num_chars), dtype=np.float32)
for sample, symbol in enumerate(output_sequences):
    y[sample, char_to_index[symbol]] = 1.0

# Debug aid: print the vectorized input-output pairs
#for i in range(len(input_sequences)):
#    print('Input:', X[i], 'Output:', y[i])


### Model Architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Step 4: Model Architecture
# An LSTM layer reads the one-hot window; the softmax layer outputs one
# probability per vocabulary character for the next position.
model = Sequential([
    LSTM(units=128, input_shape=(max_sequence_length, num_chars)),
    Dense(units=num_chars, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

# Show the layer-by-layer summary
model.summary()


### Model Training

In [None]:
# Step 5: Model Training
epochs = 50  # Number of passes over the training data
batch_size = 120  # Samples per gradient update

# 20% of the pairs are held out for validation; per-epoch logging is turned off.
model.fit(
    X,
    y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=False,
)

### Model Generation using Random Probability

In [None]:
import random

# Step 6: Model Generation
seed_sequence = 'ku'  # Starting characters of the new city name
# Number of characters sampled after the seed
generated_name_length = max_sequence_length - len(seed_sequence) + 1

# Grow the name one sampled character at a time
generated_name = seed_sequence
for _ in range(generated_name_length):
    # One-hot encode the name so far; time steps past its end stay all-zero
    x_pred = np.zeros((1, max_sequence_length, num_chars))
    for t, char in enumerate(generated_name):
        if char not in char_to_index:
            continue  # characters outside the vocabulary stay an all-zero row
        x_pred[0, t, char_to_index[char]] = 1.0

    # Sample the next character index from the predicted distribution
    predictions = model.predict(x_pred)[0]
    predicted_index = np.random.choice(range(num_chars), p=predictions)

    # Map the index back to its character and extend the name
    predicted_char = characters[predicted_index]
    generated_name = generated_name + predicted_char

print("Generated Name:", generated_name)


### Model Generation using Levenshtein Distance

In [None]:
import random
from Levenshtein import distance

# Step 6: Model Generation
seed_sequence = 'domb'  # Seed sequence for generating new city names
generated_name_length = 10  # Total length (seed included) of each generated name
num_generated_names = 20  # Number of generated names


def _encode_for_prediction(text):
    """One-hot encode `text` into a (1, max_sequence_length, num_chars) array.

    The model is declared with input_shape=(max_sequence_length, num_chars), so
    the input is always built at that fixed width instead of len(text):
    only the last max_sequence_length characters are used, and shorter texts
    leave the trailing time steps all-zero. Characters that are not in
    char_to_index are left as all-zero rows rather than raising KeyError.
    """
    window = text[-max_sequence_length:]
    encoded = np.zeros((1, max_sequence_length, num_chars), dtype=np.float32)
    for t, char in enumerate(window):
        if char in char_to_index:
            encoded[0, t, char_to_index[char]] = 1.0
    return encoded


generated_names = []

for _ in range(num_generated_names):
    generated_name = seed_sequence
    # Sample one character at a time until the name reaches the target length
    while len(generated_name) < generated_name_length:
        x_pred = _encode_for_prediction(generated_name)

        predictions = model.predict(x_pred, verbose=0)[0]
        predicted_index = np.random.choice(range(num_chars), p=predictions)
        predicted_char = characters[predicted_index]

        generated_name += predicted_char

    generated_names.append(generated_name)

# Evaluate generated city names: mean Levenshtein distance to every corpus name
reference_names = city_names

for generated_name in generated_names:
    distances = [distance(generated_name, reference_name) for reference_name in reference_names]
    average_distance = np.mean(distances)
    print(f"{generated_name} - LD: {average_distance}")
    # print(f"Generated Name: {generated_name}, Average Levenshtein Distance: {average_distance}")
