# Preprocessing

https://wandb.ai/tcapelle/apple_m1_pro/reports/Deep-Learning-on-the-M1-Pro-with-Apple-Silicon---VmlldzoxMjQ0NjY3

# I. Load dataset

In [None]:
import re
import nltk
import string
import random
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import wordnet # check misspelling
from nltk.corpus import stopwords # check misspelling
from wordcloud import WordCloud
from spellchecker import SpellChecker # simple spell check correction

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE(review): '######.csv' looks like a redacted placeholder path — point it
# at the real CSV before running.
df = pd.read_csv('######.csv',low_memory= False)
# Preview the first rows of the loaded frame.
df.head()

# II. Data First Look

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Character-length extremes of the raw 'DISC' text column.
disc_lengths = df['DISC'].str.len()
max_len, min_len = disc_lengths.max(), disc_lengths.min()

print("Maximum length of string in dataset:", max_len)
print("Minimum length of string in dataset:", min_len)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_string_lengths(df, col):
    """Plot a histogram of the character lengths of the strings in ``df[col]``.

    Dashed vertical lines mark the mean (red), minimum (blue) and maximum
    (purple) length; the value of each is shown in the legend.

    Args:
        df: DataFrame holding the text column.
        col: Name of the column whose string lengths are plotted.
    """
    lengths = df[col].str.len()
    mean_len, shortest, longest = lengths.mean(), lengths.min(), lengths.max()

    figure, axes = plt.subplots(figsize=(8, 6))
    axes.hist(lengths, bins=50, color='green', alpha=0.8, edgecolor='black')
    axes.set_xlabel('Length of string', fontsize=12)
    axes.set_ylabel('Frequency', fontsize=12)
    axes.set_title('Distribution of String Lengths', fontsize=14)
    axes.grid(axis='y', alpha=0.5)

    # (position, colour, legend label) for each summary marker, in legend order.
    markers = (
        (mean_len, 'red', f'Mean length: {mean_len:.2f}'),
        (shortest, 'blue', f'Min length: {shortest}'),
        (longest, 'purple', f'Max length: {longest}'),
    )
    for position, colour, label in markers:
        axes.axvline(x=position, color=colour, linestyle='--', label=label)

    axes.legend()
    plt.show()

# Show the length distribution of the raw 'DISC' text column.
plot_string_lengths(df, 'DISC')

## Data Analysis:

- The dataset consists of a single column of textual data.
- There were no empty rows detected in the dataset.
- The length of strings in the dataset ranged from 13 to 693 characters.
- The majority of strings had a length of 180 characters, while the median length was 116.50 characters.

# III. ETL Pipeline Preparation

### Clean Dataset

    - Make text all upper case
    - Remove common nonsensical text (\n)
    - Remove punctuation, leaving letters (A-Z) and digits (0-9) in the text
    - Remove leading and trailing spaces
    - Remove extra spaces (more than 1 whitespace character) in the text
    - Standardize words in the dataset for consistency after the first cleaning
    - Tokenize text
    - Remove customized stop words

## 1. Make text uppercase, remove punctuations, and extra space

In [None]:
def clean_text(text):
    """Normalise one raw text entry for downstream tokenisation.

    Steps, in order:
      1. upper-case the text;
      2. replace every character that is not a letter or digit (punctuation,
         newlines, tabs, ...) with a space;
      3. collapse runs of whitespace into one space and strip both ends.

    Line breaks are turned into spaces rather than deleted, so the words on
    either side of a newline stay separate tokens instead of being fused
    together (e.g. ``"CORROSION\\nREMOVE"`` -> ``"CORROSION REMOVE"``).

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string, made of ``A-Z``, ``0-9`` and single spaces only.
    """
    text = text.upper()
    # After upper-casing, anything outside A-Z / 0-9 is noise for this dataset.
    text = re.sub(r'[^A-Z0-9]', ' ', text)
    # Collapse the extra spaces introduced by the previous substitution.
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
# Clean every 'DISC' entry and keep the result in a new 'clean' column.
df['clean'] = df['DISC'].apply(clean_text)
df.head()

In [None]:
# sanity check the clean column by removing all letters and digits
# df['punc'] = df['clean'].replace('[A-Z0-9]', '', regex=True)
# df['punc'].unique()

## 2. Remove Customized Stopwords

To avoid over-cleaning the already concise and short dataset, we opted not to remove all the stop words. Instead, we created a customized set of stop words to remove. This approach allows us to preserve the context and meaning of the text while removing unnecessary words

In [None]:
import nltk

# Deliberately small, upper-case stop-word list: only these words are dropped
# so that the short maintenance texts keep most of their context.
stop_words = [
    'A', 'AN', 'THE', 'THIS', 'THAT', 'THOSE', 'OR', 'EITHER',
    'AND', 'WAS', 'WERE', 'IS', 'ARE', 'BE', 'MAKE', 'MADE',
    'AS', 'FOR', 'HAVE', 'HAS', 'HAD', 'DO', 'DID', 'DONE', 'LET',
    'GET', 'GETS', 'GOT', 'GOTTEN', 'TAKE', 'TAKES', 'TOOK', 'TAKEN',
]


def _tokenize_upper(entry):
    """Upper-case one text entry and split it into NLTK word tokens."""
    return nltk.word_tokenize(entry.upper())


def _without_stop_words(tokens):
    """Join the tokens that are not in ``stop_words`` back into one string."""
    kept = (token for token in tokens if token not in stop_words)
    return ' '.join(kept)


# Keep the full token list in 'tokens', then overwrite 'clean' with the
# stop-word-free text.
df['tokens'] = df['clean'].apply(_tokenize_upper)
df['clean'] = df['tokens'].apply(_without_stop_words)

In [None]:
# Preview the last rows after stop-word removal.
df.tail()

## 3. Standardizing Words in the Dataset for Consistency After First Cleaning
In the dataset, some words are written in different formats, such as L.H, LH, L/H, and L//H. After removing the punctuations in the previous step, we notice that these words appear in two different forms: LH and L H. To ensure consistency in the data, we need to replace all instances of these words with a uniform format

i.e. 
'LH' will be replaced with 'L H' so that later, in the IOB tagging step, the 'L' will be tagged as `B-loc` and the 'H' as `I-loc`

In [None]:
# Replace a standalone token for later IOB tagging, e.g. 'LH' -> 'L H'.
def replace_word(word, old_word, new_word):
    """Replace whole-token occurrences of ``old_word`` inside ``word``.

    Only occurrences that are not glued to another letter or digit are
    replaced, so standardising 'RH' leaves 'OVERHEAD' untouched and 'BS'
    leaves 'TABS' untouched.

    Args:
        word: Text to process (despite the name, usually a whole sentence).
        old_word: Token, or space-separated token sequence, to look for.
        new_word: Literal replacement text.

    Returns:
        ``word`` with every standalone ``old_word`` replaced by ``new_word``.
        An empty ``old_word`` leaves the text unchanged.
    """
    if not old_word:
        return word
    # Lookarounds instead of \b so the boundaries also behave when old_word
    # itself starts or ends with a non-word character.
    pattern = r'(?<!\w)' + re.escape(old_word) + r'(?!\w)'
    # A callable replacement keeps new_word literal (no backslash escapes).
    return re.sub(pattern, lambda _match: new_word, word)

In [None]:
# Abbreviation variants left over after punctuation removal, mapped to the
# single form used from here on. Applied in insertion order.
words_to_replace = {
    'LH': 'L H',  # LEFT HAND
    'RH': 'R H',  # RIGHT HAND
    'BS': 'B S',  # BODY STATION
    'I B': 'IB',  # INBOUND
    'O B': 'OB',  # OUTBOUND
}

for old_word, new_word in words_to_replace.items():
    df['clean'] = df['clean'].apply(replace_word, args=(old_word, new_word))

In [None]:
# Inspect the first 20 rows after the abbreviation standardisation.
df.head(20)

In [None]:
# Pick a random row and print it before and after cleaning.
# random.randrange excludes its upper bound, so idx is always a valid row
# position; random.randint(2, len(df.index)) could return len(df.index),
# which is one past the last row.
# NOTE(review): the lower bound of 2 is kept from the original, so the first
# two rows are never sampled — confirm whether that is intended.
idx = random.randrange(2, len(df.index))
# Positional lookup, so the check also works when the index is not 0..n-1.
print(df['DISC'].iloc[idx])
print(df['clean'].iloc[idx])
#MIR, T.O, REF

## 4.a. Check Misspelling Words Using nltk

As we check for misspelled words in the text, we simultaneously record all `digits` we encounter in a set called digits. We will later use this set to identify the location of defects in the IOB tagging step. 

While there may be alternative ways to perform this check during the IOB tagging step, for now we will create the set and use it. We will refine the code at a later stage

In [None]:
# Every digit-only token seen by process_text (filled as a side effect).
digits = set() 
# Running log of the tokens process_text accepted, across all rows. It is a
# list, so the same token can appear many times.
global_accept_words = []

In [None]:
# Cache for the NLTK English stop-word list so the corpus is read only once
# instead of once per processed row.
_ENGLISH_STOP_WORDS = {}


def process_text(text):
    """Split ``text`` into dictionary-accepted words and possible typos.

    A token is accepted when it consists of digits only, is an English stop
    word (NLTK list), or has at least one WordNet synset as a noun, verb,
    adjective or adverb. Every other token is reported as a possible
    misspelling (domain acronyms end up here too).

    Side effects:
        * digit-only tokens are added to the module-level ``digits`` set;
        * each accepted token is appended, once per occurrence, to the
          module-level ``global_accept_words`` list.

    Args:
        text: Cleaned, whitespace-separated text.

    Returns:
        ``(accept_words, misspelled_words)``: two lists in token order. Each
        token occurrence lands in exactly one of them, exactly once.
    """
    if 'words' not in _ENGLISH_STOP_WORDS:
        _ENGLISH_STOP_WORDS['words'] = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOP_WORDS['words']

    accept_words = []
    misspelled_words = []
    for word in text.split():
        lowered = word.lower()
        if word.isdigit():
            digits.add(word)
            is_accepted = True
        elif lowered in stop_words:
            is_accepted = True
        else:
            # One hit in any part of speech is enough; stopping at the first
            # hit avoids recording the same word once per matching POS.
            is_accepted = any(
                wordnet.synsets(lowered, pos=pos)
                for pos in ('n', 'v', 'a', 'r')  # noun, verb, adjective, adverb
            )

        if is_accepted:
            accept_words.append(word)
            # Log only the current token; re-adding the whole per-row list on
            # every token made the global list grow quadratically.
            global_accept_words.append(word)
        else:
            misspelled_words.append(word)
    return accept_words, misspelled_words

In [None]:
# Run the dictionary check on every cleaned entry. Each result is an
# (accept_words, misspelled_words) pair, split here into two columns.
checked_rows = df['clean'].apply(process_text)
df['accept_words'], df['misspelled_words'] = zip(*checked_rows)
df[['clean', 'accept_words', 'misspelled_words']]

In [None]:
from itertools import chain

# Collect every distinct token flagged in the 'misspelled_words' column.
typo_words = {token for flagged in df['misspelled_words'] for token in flagged}

# print(typo_words) # uncomment this line to see all possible typo words

print(f"Have found {len(typo_words)} acronyms and possible typos in the dataset") # 3946

### Have found 3978 acronyms and possible typos in the dataset

As the dataset contains numerous acronyms commonly used in aircraft maintenance, it is difficult to determine the number of typos and their frequencies. To address this issue in the future, we plan to conduct research and create a list of all acronyms used in the dataset, which will enable us to identify misspelled words more accurately.

## 4.b Check Misspelling Words Using OpenAi

Create an API key on the OpenAI website, then download and save it on your local machine. Make sure you can access it later

In [None]:
# import openai
# import pandas as pd
# import key

# # Set up OpenAI API credentials
# openai.api_key = key.key

# def find_typos_gpt3(text):
#     response = openai.Completion.create(
#         engine="curie", # davinci is expensive to run 'curie' > 'gpt2' > 'gptneo'
#         prompt=f"Find typos in the following text: '{text}'",
#         max_tokens=1024,
#         n=1,
#         stop=None,
#         temperature=0.7,
#     )
#     return response.choices[0].text.strip()

# df['typos'] = df['clean'].apply(find_typos_gpt3)

# print(df)

In [None]:
# Number of distinct digit-only tokens collected by process_text.
len(digits)

## 5.a Checking misspellings of 'CORROSION' in the text

Although we won't be addressing misspelled words at the moment, we can quickly check the frequency of typos for the word 'CORROSION' in the dataset. This will provide us with an understanding of the noise level in the data and help determine the most appropriate approach to tackle it in the future

In [None]:
import re

# token -> number of occurrences, for tokens that look like a corrosion typo.
misspelled_corrosion = dict()

# Compiled once: a token containing any of these fragments is a candidate.
_CORROSION_FRAGMENTS = re.compile(r"COROSION|CORR|COROSSION", re.IGNORECASE)


def find_misspelled_corrosion(report, accepted=None):
    """Count tokens in ``report`` that look like misspellings of 'CORROSION'.

    A token is counted when it contains one of the corrosion fragments
    (case-insensitive) and is neither the abbreviation 'CORR' nor a known,
    accepted word. Counts accumulate in the module-level
    ``misspelled_corrosion`` dict; nothing is returned.

    Args:
        report: Cleaned, whitespace-separated text of one record.
        accepted: Optional container of accepted words (a ``set`` gives O(1)
            lookups). When omitted, the module-level ``global_accept_words``
            list is used, and 'CORR' is appended to it if it is not there
            yet — once, not on every call.
    """
    candidates = [token for token in report.split()
                  if _CORROSION_FRAGMENTS.search(token)]
    if not candidates:
        return

    if accepted is None:
        # One set build per record replaces one linear list scan per token.
        accepted = set(global_accept_words)
        if 'CORR' not in accepted:
            global_accept_words.append('CORR')

    for token in candidates:
        if token == 'CORR' or token in accepted:
            continue
        misspelled_corrosion[token] = misspelled_corrosion.get(token, 0) + 1


# Scan every cleaned report. The function records its findings in the
# module-level misspelled_corrosion dict and returns nothing.
for report in df['clean']:
    find_misspelled_corrosion(report)

print(misspelled_corrosion)
print(f"\nHave found {len(misspelled_corrosion)} typos of the word 'CORROSION'")
print(f"Those typos appear {sum(misspelled_corrosion.values())} times in the data")

### In the dataset, we have identified 91 misspellings of the word 'CORROSION', with a total frequency of 309 occurrences. Below is a list of all the misspelled words and their respective frequencies

{'CORROSI': 10, 'CORROSSION': 25, 'COROSSION': 3, 'DEPOTCORROSION': 5, 'CORROD': 4, 'CORROED': 4, 'CORRION': 2, 'CORRISION': 23, 'CORRSION': 18, 'CORRO': 15, 'COROSION': 19, 'CORROISON': 4, 'CORREDED': 4, 'CORREDE': 2, 'CORROSIONALODINE': 1, 'CORRORDED': 1, 'CORROSIO': 14, 'ACCESSIBLECORROSION': 1, 'CORROSIONANNOTATE': 1, 'HCORROSIONB': 1, 'CORRDED': 2, 'CORRSOION': 3, 'CORROSOION': 2, 'CORRRECT': 1, 'CORROSIONN': 1, '643CORRODED': 2, 'CORROSIN': 7, 'CORROSON': 4, 'CORROS': 25, 'CORRROSION': 5, 'CELLCORRODED': 1, 'TANKCORROSION': 1, 'CORRIOSION': 5, 'CORROSIONDEFECT': 14, 'CORRREPAIR': 3, 'CORROSIONON': 6, 'CORROSIONINSTALL': 2, 'CORRBELOW': 1, 'HCORROSION': 3, 'CORRODRD': 1, 'CORROISION': 4, 'CORROSIONREMOVE': 2, 'CELLCORROSION': 1, 'CORROSIONAROUND': 1, 'CORRIOSIO': 1, 'CORROSIONCLEAN': 2, '90742CORROED': 1, 'CORROSIONAND': 3, 'CORRSOIN': 1, 'HCORRODED': 1, 'CORRECTIVEACTION': 2, 'HASCORR': 1, 'CORROSSIONREMOVED': 1, 'CORRISON': 3, 'CORROION': 1, 'SOMECORR': 1, '1902CORR': 1, '007190CORROSION': 1, 'CORRED': 1, 'CORRRION': 1, 'CORRECTI': 1, 'COROSIONIAW': 1, 'PDMCORROSION': 2, 'CORRODIDED': 1, '1925CORR': 1, 'WASHERSCORRODED': 1, 'CORROSOIN': 1, 'CORROTED': 1, 'CORROSED': 2, 'PDMCORRODED': 1, 'CORROSIONCOMPLETE': 1, 'ACFTCORRODED': 1, 'CORRODEDREQS': 1, 'CORRIONSION': 1, 'CORROSIOIN': 1, 'NOTCORROSION': 1, '229CORRODED': 1, 'ARECORR': 1, 'CORROEDED': 1, 'CORROSIION': 1, 'CORROSIONACCOMPLISH': 1, 'SEVERCORR': 1, 'CORROSIONFROM': 1, 'CORRIOSON': 1, 'CORROSIONIAW': 1, 'CORRODDED': 1, 'REMOVECORROSION': 1, 'ALLCORROSIONS': 1, 'CORROSIONINBETWEEN': 1, 'CORRE': 1, 'CORRIOSN': 1}


## 5.b. Visualization of All Misspellings of the Word 'CORROSION' 

In [None]:
from wordcloud import WordCloud

# Build a word cloud in which each misspelling is sized by its count in
# misspelled_corrosion.
wordcloud = WordCloud(width=800, height=800, background_color='white').generate_from_frequencies(misspelled_corrosion)
# Render the cloud image without axes or padding.
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

## 6. Exploring the Effects of Basic Spell Correction on a Trial Dataframe

In the context of my machine learning research report, I apply different spell correction methods to a trial dataframe that I have created and analyze the resulting outcomes. It's important to note that the trial dataframe used in this report was generated by the author and is not extracted from the original source data

In [None]:
# Hand-written sample sentences seeded with typos, used to compare the
# spell-correction libraries below.
data = {
    'text': [
        'FOUND CORRRODED BLDG 13',
        'CHECK CRASHS IN LEF SIDE WING',
        '2 ENG CORROSSION MARKED IN RED',
    ],
}
df_trial = pd.DataFrame(data)
df_trial

## 6.a. Spell Correction Trial with SpellChecker 

In [None]:
from spellchecker import SpellChecker

def correct_spelling(text):
    """Spell-correct ``text`` word by word with SpellChecker.

    Each whitespace-separated word is lower-cased for the lookup and the
    suggestion is upper-cased again. A word for which the checker returns
    ``None`` is kept exactly as written.
    """
    checker = SpellChecker()
    suggestions = (
        (word, checker.correction(word.lower())) for word in text.split()
    )
    return " ".join(
        word if suggestion is None else suggestion.upper()
        for word, suggestion in suggestions
    )

In [None]:
# Store the SpellChecker output next to the original text for comparison.
df_trial['correct_spell_checker'] = df_trial['text'].apply(correct_spelling)
df_trial

## 6.b. Spell Correction Trial with textblob

TextBlob is a Python library that provides a simple API for natural language processing tasks, including spell checking. It uses a pre-trained model that is trained on a large corpus of text, including technical and industry-specific terms

In [None]:
from textblob import TextBlob
def correct_spelling_textblob(text):
    """Return ``text`` spell-corrected by TextBlob, converted to upper case."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob).upper()

In [None]:
# Store the TextBlob output next to the original text for comparison.
df_trial['correct_textblob'] = df_trial['text'].apply(correct_spelling_textblob)
df_trial

## 6.c. Spell Correction Trial with Gingerit

GingerIt is a Python library that provides a simple API for spell checking and grammar correction. It uses a pre-trained model that is specifically designed for technical and industry-specific terms

In [None]:
from gingerit.gingerit import GingerIt
def correct_spelling_gingerit(text):
    """Return the corrected text GingerIt reports under the ``'result'`` key."""
    return GingerIt().parse(text)['result']

In [None]:
# Store the GingerIt output next to the original text for comparison.
df_trial['correct_gingerit'] = df_trial['text'].apply(correct_spelling_gingerit)
df_trial


## 6.d. Spell Correction Trial with Language Tool

LanguageTool is an open-source language checking tool that provides a REST API for spell checking and grammar correction. It supports more than 20 languages and can be used to check for grammar errors and spelling mistakes in technical and industry-specific texts

In [None]:
import language_tool_python

def correct_text_language_tool(text):
    """Return ``text`` corrected by LanguageTool (American English rules).

    A LanguageTool instance is created for the call and always closed
    afterwards, so repeated calls (e.g. through ``Series.apply``) do not leave
    one unclosed instance behind per processed row.

    Args:
        text: The string to correct.

    Returns:
        The corrected string.
    """
    my_tool = language_tool_python.LanguageTool('en-US')
    try:
        return my_tool.correct(text)
    finally:
        # Release the tool even when correct() raises.
        my_tool.close()

In [None]:
# Store the LanguageTool output next to the original text for comparison.
df_trial['correct_language_tool'] = df_trial['text'].apply(correct_text_language_tool)
df_trial

## Limitations of ML Spelling Correction for Grammar Errors (Need Further Research)

We have explored four distinct methods to correct the spelling in our dataset:

- Approach 1 involved using a spell checker, which unfortunately resulted in the word 'BLDG' being incorrectly changed to 'BLOG'. However, it was able to identify and correct the misspelling of 'CORROSION' and 'CORRODED'.

- Approach 2 utilized Textblob, which was unable to detect the misspelling of 'CORRRODED' and 'CORROSSION'. Additionally, it erroneously changed the word 'IN' to 'OF'. However, unlike the spell checker, it did not change the word 'BLDG'.

- In Approach 3, we employed Gingerit, which, like Textblob, did not identify the misspelling of 'CORRRODED' and 'CORROSSION'. However, it did not alter the word 'IN' or 'BLDG' in the original text.

- Lastly, Approach 4 used Language Tool to successfully correct the misspelling of 'CORRODED', 'CORROSION', and 'CRASH'. Nevertheless, it removed the digit '2' from the original text.

The tests have shown that none of the techniques we employed were able to identify the misspelling 'LEF' or retain the digits from the original text. In general, we found that these methods were not successful in identifying mistakes in our dataset, and sometimes even recommended wrong corrections. As a result, we have chosen to disregard them and plan to conduct further research or combine existing libraries to address typos more effectively.

Reference (future use):

http://www.realworldnlpbook.com/blog/unreasonable-effectiveness-of-transformer-spell-checker.html

## 7. Pickle the preprocessed dataframe into a pickle file for later use (optional*)
The preprocessed DataFrame is saved in a pickle file format called `preprocessed_data.pkl`. Later, when we need to use the preprocessed data, we can easily load it using the `pd.read_pickle` method

(In the future, if we get a larger dataset, this step will facilitate faster loading of preprocessed data. This is because reading a pickle object is significantly faster than loading a CSV or Excel file.)

In [None]:
# Save the preprocessed DataFrame (all columns built so far) as a pickle file
# in the current working directory.
df.to_pickle("preprocessed_data.pkl")

In [None]:
# Example: load the preprocessed DataFrame back from the pickle file.
# Only unpickle files you created yourself; pickle can execute code on load.
df = pd.read_pickle('preprocessed_data.pkl')

In [None]:
# Display the reloaded DataFrame.
df

In [None]:
# download the required NLTK resources
# NOTE(review): nltk.word_tokenize is already called in an earlier cell and
# nltk.pos_tag in a later one; on a fresh environment this cell presumably
# has to run before them — consider moving it to the top of the notebook.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Preview the first rows of the reloaded DataFrame.
df.head()

In [None]:
import pandas as pd
import nltk


# Distinct "<preceding word> <corrosion term>" phrases found so far.
compound_corrosion = set()

# Corrosion-related terms whose preceding word is inspected.
_CORROSION_TERMS = frozenset(
    ('CORROSION', 'CORR', 'CORROSIONS', 'CORROSIVE', 'CORRODING', 'CORRODED')
)

# Penn Treebank tags accepted for the preceding word: adjective, singular
# noun, past-tense verb, adverb.
_MODIFIER_TAGS = frozenset(('JJ', 'NN', 'VBD', 'RB'))


def find_adjacent_adjectives_and_adverbs(text, tagger=None):
    """Find corrosion terms together with the descriptive word before them.

    For every corrosion term in ``text`` whose immediately preceding token is
    tagged JJ, NN, VBD or RB, the phrase ``"<preceding> <term>"`` is added to
    the module-level ``compound_corrosion`` set and to the result.

    A corrosion term that is the first token has no preceding word and is
    skipped. Indexing ``pos_tags[i - 1]`` with ``i == 0`` would otherwise wrap
    around to the last token of the text.

    Args:
        text: Cleaned text of one record.
        tagger: Optional callable mapping ``text`` to a list of
            ``(token, pos_tag)`` pairs. Defaults to NLTK tokenisation followed
            by ``nltk.pos_tag``.

    Returns:
        The phrases found, joined by single spaces ('' when there are none).
    """
    if tagger is None:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    else:
        pos_tags = tagger(text)

    phrases = []
    for i, (word, _tag) in enumerate(pos_tags):
        if i == 0 or word not in _CORROSION_TERMS:
            continue
        previous_word, previous_tag = pos_tags[i - 1]
        if previous_tag in _MODIFIER_TAGS:
            phrase = previous_word + ' ' + word
            compound_corrosion.add(phrase)
            phrases.append(phrase)
    return " ".join(phrases)



# Extract the "<preceding word> <corrosion term>" phrases from every entry of
# the "clean" column and store the joined result in a new column.
df['adjectives_and_corrosion'] = df['clean'].apply(find_adjacent_adjectives_and_adverbs)

In [None]:
# Distinct per-row results of the phrase extraction. Each element is the
# space-joined phrases of one row; rows without a match contribute ''.
cor = (set(df['adjectives_and_corrosion']))
# Create a string from the set of words
word_string = ' '.join(cor)

# Create a WordCloud object and generate the word cloud
wordcloud = WordCloud(background_color='white').generate(word_string)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Display the distinct extracted phrase strings.
cor

In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Stand-alone WordCloud demo on made-up words (unrelated to the dataset).
# Note: it rebinds `words`, `word_string` and `wordcloud`.
# Create a sample set of words
words = {'apple', 'banana', 'cherry', 'date', 'elderberry', 'fig', 'grape', 'honeydew', 'kiwi', 'lemon'}

# Create a string from the set of words
word_string = ' '.join(words)

# Create a WordCloud object and generate the word cloud
wordcloud = WordCloud(background_color='white').generate(word_string)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
import pickle
# Write the collected phrase set to disk in binary mode for later reuse.
with open('compound_corrosions.pickle', 'wb') as pickle_file:
    pickle.dump(compound_corrosion, pickle_file)

In [None]:
# Read the pickled phrase set back (binary mode) and report its contents.
with open('compound_corrosions.pickle', 'rb') as pickle_file:
    corrosion_set = pickle.load(pickle_file)

print(corrosion_set)
print(f"\n have found {len(corrosion_set)}")

In [None]:
# Re-run the phrase extraction on the "clean" column and store the results.
# The function defined in this notebook is find_adjacent_adjectives_and_adverbs;
# the previously used name `find_adjacent_adjectives` is not defined anywhere
# in the notebook as shown and raised a NameError.
df['adjectives_and_corrosion'] = df['clean'].apply(find_adjacent_adjectives_and_adverbs)

In [None]:
# Inspect the first 50 rows, including the extracted phrase column.
df.head(50)

In [None]:
# Show the cleaned text of the row at position 47 (hard-coded spot check;
# raises IndexError when the frame has fewer than 48 rows).
df.iloc[47].clean

### Run the cell below to output the preprocessed dataframe to xlsx if you want to check the result

In [None]:
# temp_df = pd.DataFrame({'clean_text': df['filtered_text']})
# temp_df.to_excel('preprocessed_data.xlsx', index=False)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet-based lemmatizer, tried on a short domain-specific sample.
lemmatizer = WordNetLemmatizer()

text = "CORROSION TREAT APPLY"

# Tokenise, lower-case each token for the lemmatizer, then lemmatize it.
words = nltk.word_tokenize(text)
lemmatized_words = list(map(lemmatizer.lemmatize, map(str.lower, words)))

# Re-assemble the sentence and show it next to the original (upper-cased
# again to match the rest of the dataset).
lemmatized_text = ' '.join(lemmatized_words)
print("Original text:", text)
print("Lemmatized text:", lemmatized_text.upper())
