In [1]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw WWW dataset (presumably a combined set of ~500 documents — confirm)
# from its pickle file into a DataFrame.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(filepath_or_buffer='data/WWW.pkl')

In [3]:
# Keep a copy of the first raw abstract now: the 'Abstract' column is overwritten
# by the cleaning cell further down, and this sample is printed there for comparison.
abs_sample = data.loc[0, 'Abstract']
data

Unnamed: 0,Doc_no,Abstract,Keywords
0,10984461,Opportunities at the Intersection of Bioinform...,"Computational Biology,Cooperative Behavior,Dec..."
1,10984462,The Interactions Between Clinical Informatics ...,"Academic Medical Centers,California,Computatio..."
2,10984463,Electronic Health Record Meets Digital Library...,"Internet,Libraries,Medical Records Systems, Co..."
3,10984464,Improving Clinical Communication --A View from...,"Attitude of Health Personnel,Cognitive Science..."
4,10984465,Comparative Evaluation of Three Continuous Spe...,"Comparative Study,Evaluation Studies,Humans,Me..."
...,...,...,...
490,12915583,"Protective Efficacy of an AIDS Vaccine, a Sing...","AIDS Vaccines,Acquired Immunodeficiency Syndro..."
491,12915584,High Circulating Frequencies of Tumor Necrosis...,"CD4 Lymphocyte Count,CD4-Positive T-Lymphocyte..."
492,12915585,Early- and Intermediate-Stage Variants of Simi...,"Animals,Cell Line,Gene Products, gag,HIV-1,Hum..."
493,12915586,Susceptibility of Human Hepatitis Delta Virus ...,"Animals,Base Sequence,Cell Line,Genome, Viral,..."


### Checking for duplicates in the dataset

In [4]:
# Report the frame's shape, then collect every row whose 'Doc_no' repeats an
# earlier row (the first occurrence is not flagged) and report that subset's shape.
print(f'Shape of the Dataset before removing duplicates : {data.shape}')
is_repeat = data.duplicated(subset='Doc_no')
duplicate = data.loc[is_repeat]
print(f'Duplicates found in the: {duplicate.shape}')

Shape of the Dataset before removing duplicates : (495, 3)
Duplicates found in the: (0, 3)


### Average length of the Abstracts before Pre-Processing

In [5]:
# Word count (whitespace-separated tokens) of every raw abstract.
total_abstracts = [len(text.split()) for text in data['Abstract']]

In [6]:
# Mean number of whitespace-separated tokens per abstract, before any cleaning.
avg_before_proc = sum(total_abstracts) / len(total_abstracts)
print(f"Average length of abstracts : {avg_before_proc}")

Average length of abstracts : 4748.078787878788


### Text Pre-Processing

In [7]:
# Text pre-processing helpers (applied to the Abstract column below)
def cleanHtml(sentence):
    """Replace every HTML-style tag in ``sentence`` with a single space.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. A tag is the shortest span from ``<`` to the next ``>``; because
    ``.`` does not match newlines, a span broken by a line break is left alone.

    Returns the resulting string.
    """
    return re.sub(r'<.*?>', ' ', str(sentence))

def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    Steps, in order:
      1. Delete ``?``, ``!``, ``'``, ``"``, ``#`` and ``|`` outright.
      2. Replace ``.``, ``,``, ``(``, ``)``, ``\\`` and ``/`` with a space so the
         words on either side stay separated.
      3. Strip leading/trailing whitespace.
      4. Turn any remaining newline into a space.

    Args:
        sentence: The text to clean (must be a ``str``).

    Returns:
        The cleaned string.
    """
    # Inside a character class '|' is a literal, not an alternation separator,
    # so the classes list their members directly. '|' stays in the "delete"
    # class because the previous pattern removed it as well.
    cleaned = re.sub(r'[?!\'"#|]', '', sentence)
    # The earlier pattern's `\|` was an escaped pipe, so backslashes slipped
    # through; `\\` here matches a literal backslash.
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned.replace("\n", " ")

def keepAlpha(sentence):
    """Keep only ASCII letters in ``sentence``.

    The text is split on whitespace; within each piece every run of characters
    other than ``a-z``/``A-Z`` is replaced by one space. The pieces are then
    re-joined with single spaces and the result is stripped at both ends.
    Interior double spaces can remain where a piece started or ended with
    non-letters.
    """
    pieces = [re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split()]
    return " ".join(pieces).strip()

def removeStopWords(sentence):
    """Drop stop words from ``sentence``.

    The stop-word set is the union of gensim's ``STOPWORDS`` and NLTK's English
    stop-word list. The sentence is tokenized with ``word_tokenize``; tokens
    that appear in the set (exact, case-sensitive match) are discarded and the
    rest are joined back together with single spaces.
    """
    blocked = STOPWORDS.union(set(stopwords.words('english')))

    kept = []
    for token in word_tokenize(sentence):
        if token in blocked:
            continue
        kept.append(token)
    return " ".join(kept)

def lemmatize(sentence):
    """Lemmatize each token of ``sentence``.

    The sentence is tokenized with ``word_tokenize``; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments. The results are
    joined with single spaces and stripped at both ends.
    """
    wordnet = WordNetLemmatizer()
    lemmas = (wordnet.lemmatize(token) for token in word_tokenize(sentence))
    return " ".join(lemmas).strip()
    
def getStrings(sentence):
    """Return ``sentence`` with every comma replaced by a single space."""
    return sentence.replace(",", " ")

In [None]:
# Clean the abstracts: lower-case first, then run each helper in pipeline order.
# The order matters (e.g. tags are stripped before punctuation is removed).
abstracts = data['Abstract'].str.lower()
for step in (cleanHtml, cleanPunc, keepAlpha, removeStopWords, lemmatize):
    abstracts = abstracts.apply(step)
data['Abstract'] = abstracts

# Keywords are only lower-cased.
data['Keywords'] = data['Keywords'].str.lower()

data.head()

### Comparing a row from the abstract column before and after text-processing

In [None]:
# Show the first abstract as captured before cleaning, preceded by its word count.
raw_word_count = len(abs_sample.split())
print(f'Length before pre-processing : {raw_word_count}\n')
print(abs_sample)

In [None]:
# Same row as the raw sample, read back after the cleaning pipeline has run.
procs_abs_sample = data.loc[0, 'Abstract']
clean_word_count = len(procs_abs_sample.split())
print(f'Length after pre-processing : {clean_word_count}\n')
print(procs_abs_sample)

### Average length of the abstracts after pre-processing

In [None]:
# Word count (whitespace-separated tokens) of every abstract in its current state;
# this overwrites the list computed before pre-processing.
total_abstracts = [len(text.split()) for text in data['Abstract']]

In [None]:
# Mean number of whitespace-separated tokens per abstract, after cleaning.
avg_after_proc = sum(total_abstracts) / len(total_abstracts)
print(f"Average length of abstracts : {avg_after_proc}")

In [None]:
# Bar chart contrasting the mean abstract length before and after cleaning.
desc = ['Avg before Pre-Processing', 'Avg after Pre-Processing']
average_score = [avg_before_proc, avg_after_proc]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes rectangle spans the whole figure
ax.bar(desc, average_score)
ax.set_title('COMPARING THE AVERAGE LENGTH OF THE ABSTRACTS BEFORE AND AFTER DATA PRE-PROCESSING')
plt.show()

In [None]:
# Preview the first five rows of the processed frame before saving it.
data.head(5)

In [None]:
# Persist the processed frame as a pickle.
# NOTE(review): the raw file is read from 'data/' but this writes to 'Data/';
# on a case-sensitive filesystem those are different directories — confirm
# which location is intended and that it exists.
data.to_pickle('Data/Processed_WWW.pkl')