In [21]:
import os
import glob
import PyPDF2
import pandas as pd
import numpy as np
import language_tool_python
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import load_model

In [22]:
#list all csv files only
csv_files = glob.glob('*.{}'.format('csv'))
csv_files

['validation.csv', 'test.csv']

In [23]:
#merge dataset using concat
df = pd.concat([pd.read_csv(f) for f in csv_files ], ignore_index=True)
df

Unnamed: 0,sentence,corrections
0,So I think we can not live if old people could...,['So I think we would not be alive if our ance...
1,For not use car .,['Not for use with a car . ' 'Do not use in th...
2,Here was no promise of morning except that we ...,"['Here was no promise of morning , except that..."
3,Thus even today sex is considered as the least...,"['Thus , even today , sex is considered as the..."
4,image you salf you are wark in factory just to...,"[""Imagine yourself you are working in factory ..."
...,...,...
1498,But I disegree this opinion because often the ...,"[""But I disagree with this opinion because oft..."
1499,"it gives him many apprtunites in the life , an...",['It gives him many opportunities in life and ...
1500,"In other words , the image in the TV comercial...","['In other words , the image in the TV commerc..."
1501,Members gather money for the funeral and help ...,['Members gather money for the funeral to help...


In [52]:
# Path to your PDF file
pdf_path = 'Resume - Rich Andiety.pdf'

In [96]:
# Define start and end tokens
df['sentence']= '<start> ' + df['sentence']
df['corrections'] =  df['corrections'] + ' <end>' 
df

Unnamed: 0,sentence,corrections
0,<start> So I think we can not live if old peop...,['So I think we would not be alive if our ance...
1,<start> For not use car .,['Not for use with a car . ' 'Do not use in th...
2,<start> Here was no promise of morning except ...,"['Here was no promise of morning , except that..."
3,<start> Thus even today sex is considered as t...,"['Thus , even today , sex is considered as the..."
4,<start> image you salf you are wark in factory...,"[""Imagine yourself you are working in factory ..."
...,...,...
1498,<start> But I disegree this opinion because of...,"[""But I disagree with this opinion because oft..."
1499,<start> it gives him many apprtunites in the l...,['It gives him many opportunities in life and ...
1500,"<start> In other words , the image in the TV c...","['In other words , the image in the TV commerc..."
1501,<start> Members gather money for the funeral a...,['Members gather money for the funeral to help...


In [97]:
# Add start and end tokens to target_texts
input_texts = df['sentence'].astype(str).tolist()
target_texts = [start_token + " " + text + " " + end_token for text in df['corrections'].astype(str).tolist()]

In [98]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)

In [99]:
# Save the tokenizer for later use in text generation
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [100]:
# Convert texts to sequences
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

In [101]:
# Padding sequences
max_seq_length = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')

In [102]:
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding
embedding_dim = 128
lstm_units = 256

In [103]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [104]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [105]:
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Reshape target data to be 3D
target_sequences = target_sequences.reshape((target_sequences.shape[0], target_sequences.shape[1], 1))

# Train model
model.fit([input_sequences, target_sequences[:, :-1]], target_sequences[:, 1:], batch_size=64, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2941f95b0>

In [106]:
model.save('grammar_correction_model.h5')

  saving_api.save_model(


In [107]:
def update_tokenizer(tokenizer_file_path, new_token='<start>'):
    # Load the tokenizer
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # Check if the new token is in the tokenizer's word index
    if new_token not in tokenizer.word_index:
        # Add the new token to the tokenizer's word index
        new_index = len(tokenizer.word_index) + 1
        tokenizer.word_index[new_token] = new_index
        tokenizer.index_word[new_index] = new_token

        # Save the updated tokenizer
        with open(tokenizer_file_path, 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"Token '{new_token}' added to tokenizer.")
    else:
        print(f"Token '{new_token}' already exists in the tokenizer.")

# Update the tokenizer with the '<start>' token
tokenizer_file_path = 'tokenizer.pickle'  # Update with the correct path
update_tokenizer(tokenizer_file_path)

Token '<start>' added to tokenizer.


In [108]:
def generate_correction(input_seq):
    # Preprocess the input sequence
    tokenized_input = tokenizer.texts_to_sequences([input_seq])
    padded_input = pad_sequences(tokenized_input, maxlen=max_seq_length, padding='post')

    # Assuming the first part of the model is the encoder
    # and the second part is the decoder
    encoder_model = Model(model.input[0], model.layers[2].output)
    decoder_model = Model(model.input[1], model.layers[4].output)

    # Get the encoder's state
    encoder_states = encoder_model.predict(padded_input)

    # Initialize the decoder's input as a sequence with only the start token
    start_token_idx = tokenizer.word_index['<start>']
    decoder_input = np.array([[start_token_idx]])

    # Generate the sequence
    corrected_sentence = []
    for i in range(max_seq_length):
        # Predict the next token using the decoder
        decoder_output, state_h, state_c = decoder_model.predict([decoder_input] + encoder_states)
        next_token = np.argmax(decoder_output[0, -1, :])

        # Add the predicted token to the sequence
        if next_token == tokenizer.word_index['<end>']:
            break

        corrected_sentence.append(next_token)
        decoder_input = np.array([[next_token]])

        # Update the states
        encoder_states = [state_h, state_c]

    # Convert indices to words
    corrected_sentence = tokenizer.sequences_to_texts([corrected_sentence])[0]
    return corrected_sentence


In [109]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):  # Corrected variable name here
            text += reader.pages[page_num].extract_text()  # Use 'page_num' instead of 'page_number'
    return text

In [110]:
extracted_text = extract_text_from_pdf(pdf_path)

# Split the text into sentences
sentences = sent_tokenize(extracted_text)

In [112]:
def check_grammar(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return matches

In [113]:
def highlight_sentences_with_suggestions(text, matches):
    sentences = sent_tokenize(text)
    error_sentences = set()
    suggestions = {}

    for match in matches:
        for i, sentence in enumerate(sentences):
            if match.offset >= text.find(sentence) and match.offset < text.find(sentence) + len(sentence):
                error_sentences.add(i)
                suggestions[i] = suggestions.get(i, []) + [(match.context, match.replacements[0] if match.replacements else "No suggestion")]

    highlighted_text = ""
    for i, sentence in enumerate(sentences):
        if i in error_sentences:
            highlighted_text += f'**{sentence}**\n'
            for context, suggestion in suggestions[i]:
                highlighted_text += f'Error: "{context}" -> Suggestion: {suggestion}\n'
        else:
            highlighted_text += sentence + '\n'

    return highlighted_text

In [114]:
extracted_text = extract_text_from_pdf(pdf_path)
grammar_matches = check_grammar(extracted_text)
highlighted_text_with_suggestions = highlight_sentences_with_suggestions(extracted_text, grammar_matches)
print(highlighted_text_with_suggestions)

**RICH ANDIETY  
082135324409  | andietyrich @gmail.com  | https://www.linkedin.com/in/andietyrich/  
Gajahmungkur, Semarang  
As a final year student majoring in Informatics, I am a highly driven and ambitious individual with a  strong passion 
for technology.**
Error: "RICH ANDIETY   082135324409  | andietyrich @gmail.co..." -> Suggestion: ANXIETY
Error: "RICH ANDIETY   082135324409  | andietyrich @gmail.com ..." -> Suggestion:  
Error: "RICH ANDIETY   082135324409  | andietyrich @gmail.com  | https://www...." -> Suggestion:  
Error: "RICH ANDIETY   082135324409  | andietyrich @gmail.com  | https://www.linkedin.com/..." -> Suggestion: Dietrich
Error: "...  082135324409  | andietyrich @gmail.com  | https://www.linkedin.com/in/andietyric..." -> Suggestion:  
Error: "...https://www.linkedin.com/in/andietyrich/   Gajahmungkur, Semarang   As a final yea..." -> Suggestion:  
Error: "...ps://www.linkedin.com/in/andietyrich/   Gajahmungkur, Semarang   As a final year student maj..." -> Sugge