In [1]:
# This notebook compares the tokenized and lemmatized text of the 'c119' and 'remark' columns.
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from itertools import combinations
import nltk

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
pd.set_option("mode.chained_assignment", None)

In [7]:
# Columns pulled from the CSV: 'c78' plus the two free-text columns 'c119' and 'remark'.
columns = ["c78", "c119", "remark"]
# NOTE(review): presumably the set of values found in 'c78' -- not referenced in the cells shown here.
labels = ['AU', 'ME', 'AF', 'DE', 'II', 'EQ', 'AI']

# The first line of the file is the header row; the file is decoded as latin-1.
Corpus = pd.read_csv(
    "./Subsets/Maintenance_Text_data.csv",
    usecols=columns,
    header=0,
    encoding='latin-1',
)

In [8]:
# Text column processed by this cell; the result is stored in 'text_final_remark'.
row_name = 'remark'

# Step - a : Missing values.
# The previous `Corpus[row_name].dropna(inplace=True)` only modified a temporary Series and
# never removed anything from `Corpus`, so it has been removed. Rows are deliberately kept:
# later cells line this column up with the other text column via Series.compare (which needs
# identically labelled Series) and index back into the raw CSV with .iloc, so every row must
# stay in place. A missing entry therefore still passes through str() below and ends up as
# the single token 'nan', exactly as before.

# Step - b / c : Lower-case each entry (python treats 'dog' and 'DOG' as different words) and
# tokenize it. The column is overwritten with one list of tokens per row.
Corpus[row_name] = [word_tokenize(str(entry).lower()) for entry in Corpus[row_name]]

# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.

# WordNetLemmatizer needs a part of speech; the first letter of the pos_tag tag selects it.
# Anything that is not an adjective (J), verb (V) or adverb (R) falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Built once for the whole column instead of once per token / once per row.
# A set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

text_final = []
for entry in Corpus[row_name]:
    # pos_tag yields (word, tag) pairs; keep alphabetic non-stop-words and lemmatize them.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the list, e.g. "['tailwheel', 'cock', ...]".
    text_final.append(str(Final_words))

# One positional column assignment instead of a label-based .loc write per row: this does not
# depend on the index being 0..n-1 and always creates the column, even for an empty frame.
Corpus['text_final_remark'] = text_final

In [9]:
# Keep the processed 'remark' results under their own name. An explicit copy (instead of a
# second name for the same object) means later in-place writes to `Corpus` cannot change it.
new_df = Corpus.copy()
new_df.head()

Unnamed: 0,c119,c78,remark,text_final_remark
0,TAILWHEEL COCKED RIGHT PRIOR TO takeoff,AU,"[tailwheel, cocked, right, prior, to, takeoff]","['tailwheel', 'cock', 'right', 'prior', 'takeo..."
1,TOW PLANE BECAME AIRBORNE THEN SETTLED STUDENT...,ME,"[tow, plane, became, airborne, then, settled, ...","['tow', 'plane', 'become', 'airborne', 'settle..."
2,"2ND ILS APCH,aircraft'S G/S INOP LOM TUNED TO ...",AU,"[2nd, ils, apch, ,, aircraft, 's, g/s, inop, l...","['il', 'apch', 'aircraft', 'inop', 'lom', 'tun..."
3,pilot NOTED SOFT right BRAKE PEDAL DRG TAXI TO...,AU,"[pilot, noted, soft, right, brake, pedal, drg,...","['pilot', 'note', 'soft', 'right', 'brake', 'p..."
4,TAXI OFF HARD SFC DUE TFC right MAIN GR BROKE ...,AF,"[taxi, off, hard, sfc, due, tfc, right, main, ...","['taxi', 'hard', 'sfc', 'due', 'tfc', 'right',..."


In [10]:
# Reload the raw data so the 'c119' column is processed from an untouched frame.
columns = ["c78", "c119","remark"]
labels = ['AU', 'ME', 'AF', 'DE', 'II', 'EQ', 'AI']
Corpus = pd.read_csv("./Subsets/Maintenance_Text_data.csv",encoding='latin-1', header=0, usecols=columns)
# Text column processed by this cell; the result is stored in 'text_final_c119'.
row_name = 'c119'

# Step - a : Missing values.
# The previous `Corpus[row_name].dropna(inplace=True)` only modified a temporary Series and
# never removed anything from `Corpus`, so it has been removed. Rows are deliberately kept:
# the comparison cell below uses Series.compare (which needs identically labelled Series) and
# the final cell indexes back into the raw CSV with .iloc, so every row must stay in place.
# A missing entry therefore still passes through str() below and ends up as the single
# token 'nan', exactly as before.

# Step - b / c : Lower-case each entry (python treats 'dog' and 'DOG' as different words) and
# tokenize it. The column is overwritten with one list of tokens per row.
Corpus[row_name] = [word_tokenize(str(entry).lower()) for entry in Corpus[row_name]]

# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.

# WordNetLemmatizer needs a part of speech; the first letter of the pos_tag tag selects it.
# Anything that is not an adjective (J), verb (V) or adverb (R) falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Built once for the whole column instead of once per token / once per row.
# A set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

text_final = []
for entry in Corpus[row_name]:
    # pos_tag yields (word, tag) pairs; keep alphabetic non-stop-words and lemmatize them.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the list, e.g. "['tailwheel', 'cock', ...]".
    text_final.append(str(Final_words))

# One positional column assignment instead of a label-based .loc write per row: this does not
# depend on the index being 0..n-1 and always creates the column, even for an empty frame.
Corpus['text_final_c119'] = text_final

In [11]:
# Inspect what kind of object the processed 'c119' column is.
c119_final = Corpus['text_final_c119']
type(c119_final)

pandas.core.series.Series

In [12]:
# Inspect what kind of object the processed 'remark' column is.
remark_final = new_df['text_final_remark']
type(remark_final)

pandas.core.series.Series

In [16]:
# Line the two processed columns up row by row. With keep_shape=True every row is kept in the
# comparison and rows whose values match are filled with NaN ...
c119_vs_remark = Corpus['text_final_c119'].compare(
    new_df['text_final_remark'],
    keep_shape=True,
)
# ... so dropping the NaN rows leaves only the rows where the two columns differ.
new = c119_vs_remark.dropna()

In [22]:
# Index labels of the rows that survived the comparison, as a plain Python list.
lookup_codes = [row_label for row_label in new.index]

In [28]:
# Read the raw file once more (same column selection as before, taken from `columns`).
Corpus = pd.read_csv(
    "./Subsets/Maintenance_Text_data.csv",
    usecols=columns,
    header=0,
    encoding='latin-1',
)
# Pick the 'c78' values at the positions listed in `lookup_codes` and tally them.
c78_at_lookup = Corpus['c78'].iloc[lookup_codes]
c78_at_lookup.value_counts()

II    535
AU    103
ME     48
AF     35
DE     23
AI     11
EQ      1
Name: c78, dtype: int64