# Imports

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string
import re
# import contractions

import os
import json
import shutil
import sys

# Import Dataset

In [None]:
def process_csv(csv_data):
    """Load a CSV file from the ``./data`` directory into a DataFrame.

    Args:
        csv_data: File name of the CSV, relative to ``./data``.

    Returns:
        pandas.DataFrame: The parsed file. The first line of the file is
        skipped (``skiprows=1``), so the second line supplies the header.
    """
    df_source = f"./data/{csv_data}"
    # read_csv already yields a DataFrame, so it is returned directly
    # instead of being re-wrapped and then dropped.
    return pd.read_csv(df_source, skiprows=1)
def excel_to_csv(excel_data):
    """Convert an Excel workbook in ``./data`` to a CSV file next to it.

    Args:
        excel_data: File name of the workbook, relative to ``./data``.

    Returns:
        str: Path of the CSV file that was written.
    """
    xlsx_source = f"./data/{excel_data}"
    # Swap only the trailing extension. str.replace would rewrite every
    # ".xlsx" in the path, and would leave the path untouched (so the CSV
    # would overwrite the workbook) when the extension is not lowercase.
    csv_source = os.path.splitext(xlsx_source)[0] + ".csv"
    workbook = pd.read_excel(xlsx_source)
    workbook.to_csv(csv_source, index=False, header=True)
    return csv_source
    


if __name__ == '__main__':
    cwd = os.getcwd()
    path = os.path.join(cwd, "data")

    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make "the first file" differ between runs and machines.
    data = sorted(os.listdir(path))
    excel_data = [f for f in data if f.endswith('.xlsx')]

    # convert gui data from excel to csv
    print(excel_data)
    if excel_data:
        excel_to_csv(excel_data[0])

    # List the CSV files only after the conversion, so a CSV written just
    # above is visible; listing beforehand misses it on a fresh data folder.
    csv_data = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
    if not csv_data:
        raise FileNotFoundError(f"No .csv file found in {path}")

    # TODO: accept the csv path as a command-line argument (sys.argv[1:])
    # instead of picking the first file in the data folder.
    # Bind the result to `df`, the name the later cells read.
    df = process_csv(csv_data[0])

In [4]:
# sample_data = pd.read_csv("./data/BS39_cause_effect_defect_data.csv", skiprows= 1)
# df = pd.DataFrame(sample_data)
# print(df.head())

     Item Id Submit Date                                              Title  \
0  DEF016683    4/3/2022  INC_00023776 - TA Deferred Refund - error enco...   
1  DEF016763   23/3/2022  PWD/WTCS Concession Pilot Users PWD fare is ch...   
2  DEF016766   24/3/2022  [BS39] - CT-CC - Devices allow to buy concessi...   
3  DEF016770   29/3/2022  INC_00024000 : SMRT : Nets inventory is showin...   
4  DEF016775   31/3/2022  [ABT][SYSS]EOD0000331 job doesn't have depende...   

                                         Description Phase Found      State  \
0  Test Steps:\n \nExpected Result:\n \nActual Re...  Production     Closed   
1  Test Steps:\n \nExpected Result:\n \nActual Re...  Production     Closed   
2  Test Steps:\n1. Set the ticket type 1 to expir...  Production     Closed   
3  Nets inventory is showing inaccuracy informati...  Production     Closed   
4  Test Steps:\n\n\n1. Ensure EOD0000413(CTC Post...  Production  Scheduled   

  Severity Module/Device Type Text Module/Device V

# EDA

In [4]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 33 columns):
 #   Column                                                                                             Non-Null Count  Dtype  
---  ------                                                                                             --------------  -----  
 0   Item Id                                                                                            19 non-null     object 
 1   Submit Date                                                                                        19 non-null     object 
 2   Title                                                                                              19 non-null     object 
 3   Description                                                                                        19 non-null     object 
 4   Phase Found                                                                                        19 non-null     object 
 

In [7]:
# Distribution of the primary root-cause classification.
root_causes = df['Primary Root Cause Classification #3']
print(root_causes.value_counts())
# Number of rows with a root cause recorded. Series.sum() on this text column
# would concatenate the labels into one string instead of counting them.
# NOTE(review): assumes a row count is what `rc_count` is meant to hold -- confirm.
rc_count = root_causes.count()

Test & Production Environment Difference                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     8
Missing Test Case                                                                                                                                                                                                                                                                                                                                                                                                                                                        

# Data Cleaning

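New column `uncaught_reasons` -- a copy of the source column "Why defect was not identifierd during testing?" (the header is misspelled in the dataset), i.e. why the defect was not identified during testing.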

We want to identify the keywords from this column that can allow us to pinpoint the underlying root cause.
Example:

Tester input: use case was not documented, so the feature is unknown to the testers, so they didn’t design with it in mind.

map above to:

Root cause: ‘Missing requirement’

To do this, we first clean the text in the column (lowercasing, replacing punctuation with spaces, dropping stop words and lemmatizing) so that equivalent keywords end up in the same form, which gives the highest chance of mapping them correctly.

In [11]:
# Expose the long source column under a shorter name for the cleaning steps.
# The header string is spelled exactly as the dataset spells it.
reason_source_column = 'Why defect was not identifierd during testing?'
df['uncaught_reasons'] = df[reason_source_column]
df['uncaught_reasons'].head()

0    Test cases was not chosen in all tiers for con...
1    Test cases was not chosen based on the impact ...
2    Test cases was not chosen based on the impact ...
3                      Issue not observed in Test Lab.
4                      Issue not observed in Test Lab.
Name: uncaught_reasons, dtype: object

In [9]:
# define lemmatize functions

# Stop-word set: NLTK's English list, a handful of extra common words, and
# every character in string.punctuation.
punctuation = list(string.punctuation)
stop = {
    *stopwords.words('english'),
    'and', 'to', 'the', 'of', 'a', 'in', 'with', 'for', 'our', 'is', 'we',
    'you', 'are', 'as', 'be', 'on', 'that', 'or',
    *punctuation,
}

def get_simple_pos(tag):
    """Map a POS tag string to the matching ``wordnet`` part-of-speech constant.

    The first letter of ``tag`` decides the result: ``J`` -> ADJ, ``V`` -> VERB,
    ``N`` -> NOUN, ``R`` -> ADV. Any other tag falls back to NOUN.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Look the constant up only for the matching prefix.
            return getattr(wordnet, attr)
    return wordnet.NOUN

# Single lemmatizer instance, used by lemmatize_words for every token.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Drop stop words from ``text`` and lemmatize the remaining tokens.

    ``text`` is split on whitespace. Tokens whose lowercase form is in ``stop``
    are discarded; each remaining token is POS-tagged, lemmatized with the
    matching wordnet part of speech, and lowercased.

    Returns:
        str: The lemmas joined by single spaces ("" when nothing remains).
    """
    def _lemma(token):
        # NOTE(review): each token is tagged on its own, without the
        # surrounding words as context.
        tag = pos_tag([token])[0][1]
        return lemmatizer.lemmatize(token, get_simple_pos(tag)).lower()

    # split() with no separator already yields whitespace-free tokens.
    return " ".join(
        _lemma(token) for token in text.split() if token.lower() not in stop
    )

In [18]:
def remove_punctuations(text):
    """Return ``text`` with every non-word character replaced by a space.

    Word characters (letters, digits and the underscore) are kept, so the
    result has the same length as the input.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

In [24]:
def clean_text(text):
    """Normalize one free-text entry for keyword matching.

    Steps, in order: lowercase, replace punctuation with spaces, collapse runs
    of whitespace to single spaces, then remove stop words and lemmatize.
    """
    lowered = text.lower()
    # Contraction expansion (contractions.fix) is currently disabled.
    depunctuated = remove_punctuations(lowered)
    collapsed = ' '.join(depunctuated.split())
    return lemmatize_words(collapsed)

# Clean every entry, store the result back in the same column, and preview it.
cleaned_reasons = df['uncaught_reasons'].apply(clean_text)
df['uncaught_reasons'] = cleaned_reasons
df['uncaught_reasons'].head()

0    test case chosen tier concession period pa bas...
1                test case chosen base impact analysis
2                test case chosen base impact analysis
3                               issue observe test lab
4                               issue observe test lab
Name: uncaught_reasons, dtype: object

# Analysis on Root Causes

# NLP Model for Textual Classification