# **Data Preprocessing**

In [1]:
import pandas as pd
import nltk
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
import pickle
nltk.download('stopwords')
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\niranjans3ln\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\niranjans3ln\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#mounting the google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
#importing the CSV file of the dataset from google drive into a dataframe
data=pd.read_csv('dataset/eclipse_platform.csv')
data

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time
0,1.0,P3,Team,,Usability issue with external editors (1GE6IRL),- Setup a project that contains a *.gif resour...,CLOSED,FIXED,2.0,10/10/2001 21:34,2/9/2012 15:57
1,2.0,P5,Team,,Opening repository resources doesnt honor type...,Opening repository resource always open the de...,RESOLVED,FIXED,2.0,10/10/2001 21:34,5/7/2002 10:33
2,3.0,P5,Team,,Sync does not indicate deletion (1GIEN83),KM (10/2/2001 5:55:18 PM); \tThis PR about the...,RESOLVED,FIXED,2.0,10/10/2001 21:34,5/7/2010 10:28
3,4.0,P5,Team,,need better error message if catching up over ...,- become synchronized with some project in a r...,RESOLVED,FIXED,2.0,10/10/2001 21:34,3/1/2002 16:27
4,5.0,P3,Team,,ISharingManager sharing API inconsistent (1GAU...,For getting/setting the managed state of a res...,RESOLVED,WONTFIX,2.0,10/10/2001 21:34,8/15/2008 8:04
...,...,...,...,...,...,...,...,...,...,...,...
10007,,,,,,,,,,,
10008,,,,,,,,,,,
10009,,,,,,,,,,,
10010,,,,,,,,,,,


In [4]:
#Removing the unuseful coloumns from the dataset
data = data.drop(columns=['Priority', 'Component', 'Status', 'Resolution', 'Version', 'Created_time', 'Resolved_time'], axis=1)
data

Unnamed: 0,Issue_id,Duplicated_issue,Title,Description
0,1.0,,Usability issue with external editors (1GE6IRL),- Setup a project that contains a *.gif resour...
1,2.0,,Opening repository resources doesnt honor type...,Opening repository resource always open the de...
2,3.0,,Sync does not indicate deletion (1GIEN83),KM (10/2/2001 5:55:18 PM); \tThis PR about the...
3,4.0,,need better error message if catching up over ...,- become synchronized with some project in a r...
4,5.0,,ISharingManager sharing API inconsistent (1GAU...,For getting/setting the managed state of a res...
...,...,...,...,...
10007,,,,
10008,,,,
10009,,,,
10010,,,,


In [5]:
#Printing the Coloumn names
data.keys()

Index(['Issue_id', 'Duplicated_issue', 'Title', 'Description'], dtype='object')

In [6]:
#Checking the shape of the data
data.shape

(10012, 4)


## **1. Text Cleaning**

  *   Removing Invalid Reports
  *   Removing Punctuations






In [7]:
#Number of empty Description reports
data['Description'].isnull().sum()

8907

In [8]:
#Number of empty Title reports
data['Title'].isnull().sum()

8907

In [9]:
#removing the rows with empty Description
data = data.dropna(axis=0, subset=['Description'])

In [10]:
pd.options.display.max_colwidth = 500

In [11]:
#Showing the fixed pattern used in many reports which doesn't have to do anything with the bug itself
# Eg: fixed in head, has been marked as read only
data.loc[[4]]

Unnamed: 0,Issue_id,Duplicated_issue,Title,Description
4,5.0,,ISharingManager sharing API inconsistent (1GAUL8H),For getting/setting the managed state of a resource; the methods are:; ; isManaged(resource) and; manage(resource); ; for getting/setting the ignore state; the methods are:; ; getIgnored(resource); setIgnored(resource); ; These should be made more consistent. I suggest renaming ignore methods:; ; isIgnored(resource) and; ignore(resource).; ; I think its good practice to not use get and set in method names unless its really; just a field accessor.; ; NOTES:


In [12]:
#Removing the fixed patterns by replacing them with " "
data["Description"]= data["Description"].str.replace("fixed in HEAD", "", case = False)
data["Description"]= data["Description"].str.replace("has been marked as readonly", " ", case = False)

In [13]:
#Checking if it is been removed
data.loc[[4]]

Unnamed: 0,Issue_id,Duplicated_issue,Title,Description
4,5.0,,ISharingManager sharing API inconsistent (1GAUL8H),For getting/setting the managed state of a resource; the methods are:; ; isManaged(resource) and; manage(resource); ; for getting/setting the ignore state; the methods are:; ; getIgnored(resource); setIgnored(resource); ; These should be made more consistent. I suggest renaming ignore methods:; ; isIgnored(resource) and; ignore(resource).; ; I think its good practice to not use get and set in method names unless its really; just a field accessor.; ; NOTES:


In [14]:
pd.options.display.max_colwidth = 50

In [15]:
#Text Cleaning round 1 (removing punctutions)
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\w*\f\w*', '', text)
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*]\)', '', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [16]:
data['Description'] = data['Description'].apply(clean_text_round1)


In [17]:
# Apply a second round of cleaning (removing punctuations)
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\t', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [18]:
data['Title'] = data['Title'].apply(clean_text_round2)
data['Description'] = data['Description'].apply(clean_text_round2)

## **2. Stop Word Removal, Tokenization and Lemmatization**

In [19]:

def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [20]:
# nltk.download('omw-1.4') # one-time download be run in case the library is not present on your local environment.

In [21]:
data['Title'] = data['Title'].map(preprocess)
data['Description'] = data['Description'].map(preprocess)

In [22]:
data.to_csv("dataset/preprocessed_data.csv")

## **Seperation of Master Reports and Duplicate Reports**

In [23]:
#Number of master_reports
data['Duplicated_issue'].isnull().sum()

1037

In [24]:
#Saving all the duplicate reports into a csv file as a testing set
duplicate_reports = data.dropna(axis=0, subset=['Duplicated_issue'])
duplicate_reports.reset_index(drop=True)
duplicate_reports.to_csv('dataset/duplicate_reports.csv')

In [25]:
#Seperating all the master reports into a dataframe
master_reports = data[data.isnull().any(axis=1)]
master_reports.reset_index(drop=True)

Unnamed: 0,Issue_id,Duplicated_issue,Title,Description
0,1.0,,"[usability, external, editors]","[project, contain, resource, release, project,..."
1,2.0,,"[open, repository, resources, doesnt]","[open, repository, resource, default, editor, ..."
2,3.0,,"[indicate, deletion]","[deletion, indicator, viewer, subtle, vision, ..."
3,4.0,,"[better, message, catch, resource]","[synchronize, project, repository, different, ..."
4,5.0,,"[isharingmanager, share, inconsistent]","[gettingsetting, manage, resource, methods, is..."
...,...,...,...,...
1032,2545.0,,"[editor, accelerators]","[editor, singleuse, accelerators, associate, e..."
1033,2546.0,,"[workbench, drag, gfmokf]","[perspective, perspective, windows, desktop, w..."
1034,2547.0,,"[editor, refresh, gfmqwj]","[create, project, create, leave, editor, delet..."
1035,2548.0,,"[project, compare, view, select]","[compare, project, folder, stream, resource, c..."


In [26]:
#print('MASTER_REPORTS at INDEX #      ', master_reports[22])

# master_reports.to_csv(r'dataset\my_data_master.csv', index=False)

# master_reports['Issue_id'].replace('', np.nan, inplace=True)

# master_reports.dropna(subset=['Issue_id'], inplace=True)

# master_reports.to_csv(r'dataset\my_data_master_new.csv', index=False)

#from tabulate import tabulate
#display(master_reports.Description)
#print(tabulate(master_reports, headers = 'keys', tablefmt = 'psql'))


#dataframe = pd.DataFrame(master_reports, columns=master_reports.Description)
# printing data frame
#print(dataframe)



#print('LENGTH of MASTER_REPORTS      ', len(master_reports))
for i in range(len(master_reports)):
    print('INDEX      ', i)
    try:
        index = master_reports.Issue_id[i]
        print('Issue_id:           ', index)
        print('Description:           ', master_reports.Description[i])
    except KeyError as ke:
        print('Key Not Found in master_reports Dictionary:     ', ke)
        
    #if(i != 24):
    #print(master_reports.Duplicated_issue[i])

In [27]:
#Removing empty master_reports after data cleaning process
mr = pd.DataFrame()
count = 0
exception_count = 0
# print('LENGTH of MASTER_REPORTS      ', len(master_reports))
# print('')
# #print('MASTER_REPORTS at INDEX #      ', master_reports[2])
# print('')
# print('')
# print('')
for i in range(len(master_reports)):
    #print('COUNTER Value:         ', count)
    #print('Dictionary INDEX - I:     ', i)
    #print('master_reports Description:    ', master_reports.Description[i])
    
    try:
        if(len(master_reports.Description[i]) > 2):        
            #print('LEN of master_reports Description:    ', len(master_reports.Description[i]))
            mr = mr.append(master_reports.loc[[i]]) # use pandas.concat instead of pandas.append
            count+=1
    except KeyError as ke:
            #print('Key Not Found in master_reports Dictionary:     ', ke)
            exception_count+=1
        
        #print('REVISED MR     ', mr)
        

In [28]:
print('Total # Indices missing:       ', exception_count)

Total # Indices missing:        63


In [29]:
#Saving all the master reports into a seperate csv file
master_reports.to_csv('dataset/master_reports.csv')