In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re


In [None]:
# Load the crime dataset from the Kaggle input directory and preview ten random rows.
data = pd.read_csv('/kaggle/input/us-crime-data/US_Crime_Data.csv')
data.sample(10)

In [None]:
# Count missing values in each column.
data.isna().sum()

## For this model we just need headlines

In [None]:
# Keep only the headline column, discard rows with a missing title,
# and renumber the remaining rows from 0.
df = data[['Title']].dropna().reset_index(drop=True)
df.head()

In [None]:
# Look at one example headline (row label 123) in a single indexing step.
df.loc[123, 'Title']

In [None]:
# (rows, columns) of the headline frame after dropping missing titles.
df.shape

# Removing Punctuation

In [None]:
def clean_text(df):
    """Return the ``Title`` column lower-cased with all ASCII punctuation removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``'Title'``. It is not modified.

    Returns
    -------
    pandas.Series
        One cleaned headline per input row, with the same index as ``df``.
    """
    # Escape the punctuation so characters such as ']', '\\', '^' and '-'
    # are treated literally inside the character class.
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
    # regex=True is required: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would remove nothing.
    return df['Title'].str.lower().str.replace(punctuation_pattern, '', regex=True)

In [None]:
# Run the headline cleaning step; `tokens` holds one cleaned string per headline.
tokens = clean_text(df)

In [None]:
# Preview the cleaned headlines.
tokens

In [None]:
# Number of distinct cleaned headlines (whole strings, not individual words).
len(set(tokens))

# Tokenizing

In [None]:
# Build the word -> integer id vocabulary from the cleaned headlines,
# then encode every headline as a list of those ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tokens)
seq = tokenizer.texts_to_sequences(tokens)

In [None]:
# First ten encoded headlines.
seq[:10]

# Creating input and output data list

In [None]:
# Turn every encoded headline into (prefix -> next word id) training pairs:
# a headline [a, b, c] yields ([a], b) and ([a, b], c).
x = []
y = []
# Counts headlines skipped because they have fewer than two tokens
# (the printed label says "words", but the unit is one headline).
total_words_drop = 0
for token_ids in seq:
    if len(token_ids) <= 1:
        total_words_drop += 1
        continue
    for split_at in range(1, len(token_ids)):
        x.append(token_ids[:split_at])
        y.append(token_ids[split_at])
print('Total Words Dropped : {}'.format(total_words_drop))

In [None]:
# First ten target word ids.
y[: 10]

# Padding sequences

In [None]:
# Pad the variable-length prefixes to a common length so they form one array.
# NOTE: this overwrites the list `x`; re-run the pair-building cell before re-running this one.
x = tf.keras.preprocessing.sequence.pad_sequences(x)

In [None]:
# Shape of the padded input array.
x.shape

# One-hot encoding y (one row per sample in x)

In [None]:
# One-hot encode the target word ids.
# num_classes is pinned to the vocabulary size (number of known words + 1,
# matching `vocab_size` used for the model's softmax layer). Left to be
# inferred, the width would be max(y) + 1, which is too small whenever the
# highest word id never occurs as a target, and training would then fail on
# a label/output shape mismatch.
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

In [None]:
# Shape of the one-hot encoded target array.
y.shape

# Vocab Size : total no. of unique words, plus one

In [None]:
# Number of words known to the tokenizer, plus one.
# NOTE(review): the +1 presumably accounts for id 0 being reserved for padding — confirm.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

# LSTM Model

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> dense hidden layer ->
# softmax over the whole vocabulary.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 49))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(tf.keras.layers.LSTM(100, return_sequences=True))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Training setup: Adam optimizer, categorical cross-entropy against the
# one-hot targets, accuracy reported as the metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train with early stopping on validation loss.
# `val_loss` only exists when validation data is supplied; without it the
# callback can never trigger and restore_best_weights has no effect, so all
# 100 epochs would run. validation_split holds out the final 10% of the
# samples (taken before shuffling) for that purpose.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    x, y,
    epochs=100,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Saving model

In [None]:
# Write the trained model to model.h5 in the current working directory.
model.save('model.h5')

# Vocab Array : list of all the unique words

In [None]:
# All words known to the tokenizer, in word_index order, as a NumPy array.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
vocab_array = np.array(list(tokenizer.word_index))
vocab_array

# Final Function for Predictions

In [None]:
def make_predictions(text, n_words, max_len=49):
    """Extend ``text`` by repeatedly appending the model's most likely next word.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        Seed text to continue.
    n_words : int
        Maximum number of words to append.
    max_len : int, optional
        Length the encoded text is padded/truncated to before prediction.
        Defaults to 49, the value that was previously hard-coded here.
        NOTE(review): this should probably equal the training input width
        (``x.shape[1]``) — confirm.

    Returns
    -------
    str
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts id 0, which has no word.
    """
    for _ in range(n_words):
        token_ids = tokenizer.texts_to_sequences([text])
        padded = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=max_len)
        # verbose=0 avoids printing a progress bar for every generated word.
        probabilities = model.predict(padded, verbose=0)
        next_id = int(np.argmax(probabilities, axis=-1)[0])
        # Look the id up directly. The old `vocab_array[next_id - 1]` turned
        # id 0 into index -1 and silently appended the last vocabulary word.
        next_word = tokenizer.index_word.get(next_id)
        if next_word is None:
            break
        text += " " + next_word
    return text

# Testing Model

In [None]:
# Continue the seed "california" with 5 generated words.
make_predictions('california',5)

In [None]:
# Continue the seed "new york" with 8 generated words.
make_predictions('new york',8)

In [None]:
# Continue the seed "highway" with 8 generated words.
make_predictions('highway',8)

> It's so fun to get predictions!

# Thanks!