In [15]:
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [18]:
# Load dataset 
data = pd.read_csv('/Users/nadinejackson/Downloads/corruption-perceptions-index-cpi.csv')

# Convert data into DataFrame
data = pd.DataFrame([
    ['Benin', 42, 41, 41, 40, 39, 36, 37, 39, 36, 36],
    ['Botswana', 55, 60, 61, 61, 61, 60, 63, 63, 64, 65],
    ['Cabo Verde', 58, 58, 58, 57, 55, 59, 55, 57, 58, 60],
    ['Ethiopia', 39, 38, 37, 34, 35, 34, 33, 33, 33, 33],
    ['Kenya', 30, 31, 28, 27, 28, 26, 25, 25, 27, 27],
    ['Nigeria', 24, 25, 26, 27, 27, 28, 26, 27, 25, 27],
    ['Senegal', 43, 45, 45, 45, 45, 45, 44, 43, 41, 36],
    ['South Africa', 44, 44, 44, 43, 43, 45, 44, 44, 42, 43],
    ['Sudan', 20, 16, 16, 16, 16, 14, 12, 11, 11, 13],
    ['Tanzania', 39, 38, 37, 36, 36, 32, 30, 31, 33, 35],
    ['Zambia', 33, 33, 34, 35, 37, 38, 38, 38, 38, 37]
], columns=['Country', 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012])


In [19]:
# Preprocess the data
text_data = ''
for _, row in data.iterrows():
    country = row['Country']
    for year in range(2012, 2022):
        text_data += f"{country} has a corruption score of {row[year]} in {year}. "

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text_data])

sequence_data = tokenizer.texts_to_sequences([text_data])[0]
vocab_size = len(tokenizer.word_index) + 1

seq_length = 100
input_sequences = []
output_char = []

for i in range(0, len(sequence_data) - seq_length):
    input_sequences.append(sequence_data[i:i+seq_length])
    output_char.append(sequence_data[i+seq_length])

X = np.array(input_sequences)
y = tf.keras.utils.to_categorical(output_char, num_classes=vocab_size)


In [20]:
# Tokenize and pad the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text_data])

sequence_data = tokenizer.texts_to_sequences([text_data])[0]
vocab_size = len(tokenizer.word_index) + 1

seq_length = 100
input_sequences = []
output_char = []

for i in range(0, len(sequence_data) - seq_length):
    input_sequences.append(sequence_data[i:i+seq_length])
    output_char.append(sequence_data[i+seq_length])

X = np.array(input_sequences)
y = tf.keras.utils.to_categorical(output_char, num_classes=vocab_size)

In [21]:
# Define the RNN model using LSTM layers
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=seq_length))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

2023-04-27 10:25:00.501635: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-27 10:25:00.507384: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-27 10:25:00.512307: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          4608      
                                                                 
 lstm (LSTM)                 (None, 100, 256)          394240    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 36)                9252      
                                                                 
Total params: 933,412
Trainable params: 933,412
Non-trainable params: 0
_________________________________________________________________


2023-04-27 10:25:00.879730: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-27 10:25:00.881932: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-27 10:25:00.883836: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [23]:
# Split the data into training and validation sets
split_ratio = 0.8
train_size = int(len(X) * split_ratio)
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]

# Train the model with early stopping based on validation loss
checkpoint = ModelCheckpoint("corruption_rnn.h5", monitor='val_loss', save_best_only=True, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=64, callbacks=[checkpoint, early_stopping])

Epoch 1/100

2023-04-27 10:35:16.130740: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-27 10:35:16.136286: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-27 10:35:16.157494: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


Epoch 1: val_loss improved from inf to 0.58770, saving model to corruption_rnn.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.58770 to 0.56562, saving model to corruption_rnn.h5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.56562
Epoch 4/100
Epoch 4: val_loss did not improve from 0.56562
Epoch 5/100
Epoch 5: val_loss did not improve from 0.56562
Epoch 6/100
Epoch 6: val_loss did not improve from 0.56562
Epoch 7/100
Epoch 7: val_loss did not improve from 0.56562
Epoch 7: early stopping


In [24]:
# Generate text using the trained model
def generate_text(seed_text, model, tokenizer, seq_length, num_chars):
    output_text = seed_text

    for _ in range(num_chars):
        tokenized_text = tokenizer.texts_to_sequences([seed_text])[-1]
        padded_text = pad_sequences([tokenized_text], maxlen=seq_length)

        probabilities = model.predict(padded_text)[0]
        predicted_index = np.argmax(probabilities)
        predicted_char = tokenizer.index_word[predicted_index]

        output_text += predicted_char
        seed_text += predicted_char

    return output_text

In [27]:
seed_text = "Nigeria has a corruption score of "
generated_text = generate_text(seed_text, model, tokenizer, seq_length, 11)
print(generated_text)


Nigeria has a corruption score of 35 in 2016.
