In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from itertools import combinations
import nltk

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
# NOTE(review): with the check disabled, writes that land on a temporary slice instead of
# the intended DataFrame go unreported — consider removing this once slices are copied explicitly.
pd.options.mode.chained_assignment = None

In [2]:
def SVM_result(Train_X_Tfidf, Train_Y, Test_X_Tfidf, Test_Y):
    """Train a linear-kernel SVM and score it on the held-out split.

    Parameters
    ----------
    Train_X_Tfidf : feature matrix used to fit the classifier.
    Train_Y : labels matching the rows of ``Train_X_Tfidf``.
    Test_X_Tfidf : feature matrix the fitted classifier predicts on.
    Test_Y : labels the predictions are compared against.

    Returns
    -------
    float
        Accuracy on the test split, expressed as a percentage.
    """
    classifier = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    # fit() returns the estimator itself, so training and prediction can be chained.
    predicted_labels = classifier.fit(Train_X_Tfidf, Train_Y).predict(Test_X_Tfidf)
    return 100 * accuracy_score(predicted_labels, Test_Y)

In [3]:
def NB_result(Train_X_Tfidf, Train_Y, Test_X_Tfidf, Test_Y):
    """Train a multinomial Naive Bayes classifier and score it on the held-out split.

    Parameters
    ----------
    Train_X_Tfidf : feature matrix used to fit the classifier.
    Train_Y : labels matching the rows of ``Train_X_Tfidf``.
    Test_X_Tfidf : feature matrix the fitted classifier predicts on.
    Test_Y : labels the predictions are compared against.

    Returns
    -------
    float
        Accuracy on the test split, expressed as a percentage.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(Train_X_Tfidf, Train_Y)

    predicted_labels = classifier.predict(Test_X_Tfidf)
    fraction_correct = accuracy_score(predicted_labels, Test_Y)
    return 100 * fraction_correct

In [4]:
def _lemmatize_tokens(tokens, lemmatizer, stop_words, tag_map):
    """Return the lemmas of the alphabetic, non-stop-word tokens of one document."""
    return [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]


def run_model(Corpus, row_name, label_col='c78', test_size=0.3, random_state=None):
    """Preprocess one text column and build TF-IDF train/test matrices.

    The text in ``row_name`` is lower-cased, tokenized, stripped of English
    stop words and non-alphabetic tokens, and lemmatized using POS tags. The
    cleaned text and the labels are then split into train and test parts,
    the labels are integer-encoded and the text is TF-IDF vectorized.

    Parameters
    ----------
    Corpus : pandas.DataFrame
        Must contain ``row_name`` and ``label_col``. The frame is not
        modified; all work happens on a filtered copy.
    row_name : str
        Name of the column holding the raw text.
    label_col : str, default 'c78'
        Name of the column holding the class labels.
    test_size : float, default 0.3
        Fraction of rows assigned to the test split.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple
        ``(Train_X_Tfidf, Test_X_Tfidf, Train_Y, Test_Y)`` — the two TF-IDF
        matrices followed by the two encoded label arrays.
    """
    # Step a: drop rows with missing text or label. dropna() on the frame (rather than
    # on a single column) actually removes the rows, and reset_index gives a clean
    # 0..n-1 index on a fresh copy, so nothing below writes into the caller's frame.
    data = Corpus.dropna(subset=[row_name, label_col]).reset_index(drop=True)

    # WordNetLemmatizer needs a WordNet POS; anything that is not an adjective,
    # verb or adverb falls back to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Loop-invariant resources: build them once instead of once per word / per row.
    stop_words = frozenset(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Steps b-d: lower-case, tokenize, filter and lemmatize each document. Building the
    # column from a list keeps every cleaned text on the same row as its label.
    # Tokens are joined with spaces; the vectorizer's default tokenizer splits on
    # non-word characters, so this yields the same tokens as the list repr would.
    data['text_final'] = [
        ' '.join(
            _lemmatize_tokens(word_tokenize(str(entry).lower()), lemmatizer, stop_words, tag_map)
        )
        for entry in data[row_name]
    ]
    labels = data[label_col].astype(str)

    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
        data['text_final'], labels, test_size=test_size, random_state=random_state
    )

    # Fit the encoder once on every label so train and test share one mapping,
    # even when a class happens to be absent from one of the splits.
    Encoder = LabelEncoder().fit(labels)
    Train_Y = Encoder.transform(Train_Y)
    Test_Y = Encoder.transform(Test_Y)

    # Learn the vocabulary and IDF weights from the training text only, so no
    # information from the test documents leaks into the features.
    Tfidf_vect = TfidfVectorizer(max_features=5000)
    Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    return Train_X_Tfidf, Test_X_Tfidf, Train_Y, Test_Y

In [5]:
# Fix the state of NumPy's global random generator for the rest of the notebook.
np.random.seed(seed=500)

# Columns read from the CSV:
#   c78  -> cause label
#   c119 -> report remarks
columns = [
    "c78",
    "c119",
    "remark",
]

In [6]:
# Load only the columns named in `columns` from the maintenance-text CSV.
Corpus = pd.read_csv("./Subsets/Maintenance_Text_data.csv",encoding='latin-1', header=0, usecols=columns)

# Combined text field. Plain `+` yields NaN as soon as either part is missing,
# which would throw away the part that is present — so fall back to whichever
# single field exists. Rows where both are missing stay NaN.
combined_text = Corpus["remark"] + ' ' + Corpus["c119"]
Corpus["concat"] = combined_text.fillna(Corpus["remark"]).fillna(Corpus["c119"])

#List of labels to use
labels = ['AU', 'ME', 'AF', 'DE', 'II', 'EQ', 'AI']
# Every unordered pair of labels, one binary classification task each.
pairs = list(combinations(labels, 2))

In [7]:
# Text column evaluated in this run; also used as the row label when results are saved.
row_name = "remark"

# Accuracy (in percent) per label pair, keyed "<second>:<first>".
nb_scores = {}
svm_scores = {}

for pair in pairs:
    # Keep only the rows of the two classes being compared. reset_index(drop=True)
    # returns an independent frame indexed 0..n-1, so positional writes inside
    # run_model land on the intended rows and never touch `Corpus` itself.
    temp_Corpus = Corpus.loc[Corpus['c78'].isin([pair[0],pair[1]])].reset_index(drop=True)
    Train_X_Tfidf, Test_X_Tfidf, Train_Y, Test_Y = run_model(temp_Corpus, row_name)

    pair_key = f"{pair[1]}:{pair[0]}"
    nb_scores[pair_key] = NB_result(Train_X_Tfidf,Train_Y, Test_X_Tfidf, Test_Y)
    svm_scores[pair_key] = SVM_result(Train_X_Tfidf,Train_Y, Test_X_Tfidf, Test_Y)

# Build each single-row result frame once instead of adding one column per iteration.
temp_nb = pd.DataFrame([nb_scores])
temp_svm = pd.DataFrame([svm_scores])

In [8]:
def _load_scores(csv_path):
    """Read a saved score table, or return an empty frame if the file does not exist yet."""
    try:
        return pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        return pd.DataFrame()


def _append_scores(previous, new_scores, label):
    """Return `previous` with the single-row `new_scores` appended under index `label`."""
    # Rename only the new row (index 0) so rows already stored are left untouched.
    labelled = new_scores.rename(index={0: label})
    if len(previous) == 0 and len(previous.columns) == 0:
        return labelled
    return pd.concat([previous, labelled])


# Append this run's Naive Bayes accuracies to the saved table.
NB = _load_scores('./Pairs/NB.csv')
concated = _append_scores(NB, temp_nb, row_name)
concated.to_csv('./Pairs/NB.csv')

# Append this run's SVM accuracies to the saved table.
SVM = _load_scores('./Pairs/SVM.csv')
concated = _append_scores(SVM, temp_svm, row_name)
concated.to_csv('./Pairs/SVM.csv')

TypeError: sort_values() missing 1 required positional argument: 'by'