# Feature Engineering

## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textstat
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

## Read CSV Files

In [2]:
total = pd.read_csv('./total_data_plos_only_cleaned.csv')
total.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,doi,year,month,day,volume,issue,journal,title,text,abstract,keywords,retraction_binary,unpacked_keywords,clean_text,clean_text_lem
0,0,0,0,10.1208/s12249-016-0596-x,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,‘Polymorphism’ generally referred as the abili...,This study elucidates the physical properties ...,"['monoclinic', 'nano-sized crystals', 'orthorh...",1,"['monoclinic', 'nano-sized', 'crystals', 'orth...",Polymorphism generally referred as the ability...,Polymorphism generally referred a the ability ...
1,1,1,1,10.1021/acscentsci.9b00224,2019.0,5.0,9.0,5.0,6.0,ACS central science,Targeted Protein Internalization and Degradati...,Traditional\ndrug development efforts are focu...,Targeted,[],1,[],Traditional drug development efforts are focus...,Traditional drug development effort are focuse...
2,2,2,2,10.1021/acsomega.8b00488,2018.0,6.0,27.0,3.0,6.0,ACS omega,Regulating the Microstructure of Intumescent F...,Intumescent flame retardants\nare now being us...,A compatibilizer,[],1,[],Intumescent flame retardants are now being use...,Intumescent flame retardant are now being used...
3,3,3,3,10.1021/acsomega.8b00153,2018.0,6.0,25.0,3.0,6.0,ACS omega,Solid-to-Solid Crystallization of Organic Thin...,Crystal growth process is basic and essential ...,The solid-to-solid crystallization processes o...,[],1,[],Crystal growth process is basic and essential ...,Crystal growth process is basic and essential ...
4,4,4,4,10.1107/S1600536811022574,2011.0,6.0,18.0,67.0,,"Acta crystallographica. Section E, Structure r...",Oxonium picrate.,For general background to organic salts of pic...,"The title compound, H3O+·C6H2N3O7",[],1,[],For general background to organic salts of pic...,For general background to organic salt of picr...


In [3]:
total = total.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])

In [5]:
no_retract = pd.read_csv('./no_retraction_data_plos_only_cleaned.csv')
no_retract = no_retract.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])

In [6]:
retract = pd.read_csv('./retraction_data_plos_only_cleaned.csv')
retract = retract.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])

In [7]:
total['year'] = total['year'].astype(str)
total['month'] = total['month'].astype(str)
total['day'] = total['day'].astype(str)
total['volume'] = total['volume'].astype(str)
total['issue'] = total['issue'].astype(str)

retract['year'] = retract['year'].astype(float)
retract['month'] = retract['month'].astype(float)
retract['day'] = retract['day'].astype(float)
retract['volume'] = retract['volume'].astype(float)
retract['issue'] = retract['issue'].astype(float)

no_retract['year'] = no_retract['year'].astype(float)
no_retract['month'] = no_retract['month'].astype(float)
no_retract['day'] = no_retract['day'].astype(float)
no_retract['volume'] = no_retract['volume'].astype(float)
no_retract['issue'] = no_retract['issue'].astype(float)

## Feature Engineering

### Keywords

#### Retraction

In [10]:
#https://stackoverflow.com/questions/40950791/remove-quotes-from-string-in-python
keywords_list = []
count = 0
for i in retract['keywords']:
    if i == []:
        pass
    else:
        for j in i.split():
            keywords_list.append(j.replace("'",'').replace('[','').replace(',','').replace(']','').replace('(','').replace(')','').replace('\\n', '').replace('\\n','').lower())

In [11]:
#cells -> cell
lemmatizer = WordNetLemmatizer()
ls_keywords = []
for i in keywords_list:
    ls_keywords.append(lemmatizer.lemmatize(i))
pd.Series(ls_keywords).value_counts().head(11)

            663
cell        186
antibody    116
cancer      112
response     89
disease      84
health       79
factor       71
theory       67
gene         66
heat         61
dtype: int64

#### Keywords Binary

In [13]:
keywords_binary = []
for i in total['keywords']:
    if len(i) != 2:
        keywords_binary.append(1)
    else:
        keywords_binary.append(0)
print(len(keywords_binary))

10619


In [14]:
total['keywords_binary'] = keywords_binary

In [15]:
keywords_binary = []
for i in retract['keywords']:
    if len(i) != 2:
        keywords_binary.append(1)
    else:
        keywords_binary.append(0)
print(len(keywords_binary))

retract['keywords_binary'] = keywords_binary

1537


### Word Count

In [16]:
list_words = []
for i in range(0, len(total['clean_text'])):
    list_words.append(len(total['clean_text'][i].split()))
total['num_words'] = list_words 

### Character Length

In [17]:
list_words = []
for i in range(0, len(total['clean_text'])):
    list_words.append(len(total['clean_text'][i]))
total['character_length'] = list_words 

### Animal Studies

In [19]:
animal_terms = ['IACUC', 'mouse', 'mice', 'rats', 'rat', 'hamster', 'hamsters', 'pigs', 'rabbits', 'rabbit', 
                'cat', 'cats', 'dog', 'dogs', 'ungulate', 'ungulates', 'pig', 'horse', 'donkey', 'goat',
               'bovine', 'porcine', 'murine', 'chicken', 'sheep', 'cow', 'cows', 'horses', 'goats']

#http://vetmed.tamu.edu/media/2005639/vadnais%20protein%20therapeutics%202017.pdf
#https://www.ncbi.nlm.nih.gov/books/NBK218261/

In [20]:
def animal_binary(dataframe):
    list_articles = []
    iacuc = []
    for i in range(0, len(dataframe['clean_text'])):
        count = 0
        for j in dataframe['clean_text'][i].split():
            for k in animal_terms:
                if j == k:
                    if i not in list_articles:
                        list_articles.append(i)
                        iacuc.append(1)
                        count = 1
                else:
                    pass
        if count == 0:
            iacuc.append(0)
        else:
            pass
                   
    print(len(list_articles))
    print(len(iacuc))

    dataframe['animal_binary'] = iacuc
    return

In [21]:
animal_binary(total)
animal_binary(retract)
animal_binary(no_retract)

4289
10619
791
1537
3498
9082


In [22]:
def list_of_animal_words(dataframe):
    list_articles = []
    list_words = []
    for i in range(0, len(dataframe['clean_text'])):
        count = 0
        iacuc = []
        for j in dataframe['clean_text'][i].split():
            for k in animal_terms:
                if j == k:
                    if i not in list_articles:
                        list_articles.append(i)
                    if j not in iacuc:
                        iacuc.append(k)
                    count = 1
                else:
                    pass
        if count == 0:
            list_words.append([])
        else:
            list_words.append(iacuc)
                    
    print(len(list_articles))
    print(len(list_words))
    dataframe['animal_words'] = list_words
    return

In [23]:
list_of_animal_words(total)
list_of_animal_words(retract)
list_of_animal_words(no_retract)

4289
10619
791
1537
3498
9082


In [24]:
def animal_dummy(word_list, column_name, dataframe):
    column_list = []
    for i in dataframe['animal_words']:
        count = 0
        for j in i:
            for k in word_list:
                if j == k:
                    count = 1
        column_list.append(count)
    
    dataframe[column_name] = column_list
    return 

In [25]:
iacuc = ['IACUC']
mouse = ['mouse', 'mice']
rat = ['rat', 'rats']
murine = ['murine']
hamster = ['hamster', 'hamsters']
rabbit = ['rabbit', 'rabbits']
cat = ['cat', 'cats']
pig = ['pig', 'pigs', 'porcine']
dog = ['dog', 'dogs']
ungulate = ['ungulate', 'ungulates']
horse = ['horse', 'horses']
donkey = ['donkey']
goat = ['goat', 'goats']
cow = ['cow', 'cows', 'bovine']
chicken = ['chicken']
sheep = ['sheep']

In [26]:
animal_dummy(iacuc, 'iacuc', retract)
animal_dummy(mouse, 'mouse', retract)
animal_dummy(rat, 'rat', retract)
animal_dummy(murine, 'murine', retract)
animal_dummy(hamster, 'hamster', retract)
animal_dummy(rabbit, 'rabbit', retract)
animal_dummy(cat, 'cat', retract)
animal_dummy(pig, 'pig', retract)
animal_dummy(dog, 'dog', retract)
animal_dummy(ungulate, 'ungulate', retract)
animal_dummy(horse, 'horse', retract)
animal_dummy(donkey, 'donkey', retract)
animal_dummy(goat, 'goat', retract)
animal_dummy(cow, 'cow', retract)
animal_dummy(chicken, 'chicken', retract)
animal_dummy(sheep, 'sheep', retract)

animal_dummy(iacuc, 'iacuc', no_retract)
animal_dummy(mouse, 'mouse', no_retract)
animal_dummy(rat, 'rat', no_retract)
animal_dummy(murine, 'murine', no_retract)
animal_dummy(hamster, 'hamster', no_retract)
animal_dummy(rabbit, 'rabbit', no_retract)
animal_dummy(cat, 'cat', no_retract)
animal_dummy(pig, 'pig', no_retract)
animal_dummy(dog, 'dog', no_retract)
animal_dummy(ungulate, 'ungulate', no_retract)
animal_dummy(horse, 'horse', no_retract)
animal_dummy(donkey, 'donkey', no_retract)
animal_dummy(goat, 'goat', no_retract)
animal_dummy(cow, 'cow', no_retract)
animal_dummy(chicken, 'chicken', no_retract)
animal_dummy(sheep, 'sheep', no_retract)

animal_dummy(iacuc, 'iacuc', total)
animal_dummy(mouse, 'mouse', total)
animal_dummy(rat, 'rat', total)
animal_dummy(murine, 'murine', total)
animal_dummy(hamster, 'hamster', total)
animal_dummy(rabbit, 'rabbit', total)
animal_dummy(cat, 'cat', total)
animal_dummy(pig, 'pig', total)
animal_dummy(dog, 'dog', total)
animal_dummy(ungulate, 'ungulate', total)
animal_dummy(horse, 'horse', total)
animal_dummy(donkey, 'donkey', total)
animal_dummy(goat, 'goat', total)
animal_dummy(cow, 'cow', total)
animal_dummy(chicken, 'chicken', total)
animal_dummy(sheep, 'sheep', total)

In [27]:
total['animal_binary'][:1537].value_counts(normalize=True)

1    0.514639
0    0.485361
Name: animal_binary, dtype: float64

In [28]:
total['animal_binary'][1537:].value_counts(normalize=True)

0    0.614843
1    0.385157
Name: animal_binary, dtype: float64

### Human Studies

In [30]:
list_articles = []
irb = []
for i in range(0, len(total['clean_text'])):
    count = 0
    word_count = 0
    patient_count = 0
    for j in total['clean_text'][i].split():
        if j == 'IRB' or j == 'case' or j == 'participants':
            if j =='IRB' or j == 'participants':
                if i not in list_articles:
                    list_articles.append(i)
                    irb.append(1)
                    count = 1
            else:
                try:
                    if total['clean_text'][i].split()[word_count+1] == 'study':
                        if i not in list_articles:
                            list_articles.append(i)
                            irb.append(1)
                            count = 1
                    else:
                        pass
                except:
                    pass            
        else:
            pass
        word_count += 1
    if count == 0:
        irb.append(0)
    else:
        pass
                  
print(len(list_articles))
print(len(irb))
total['irb_binary'] = irb

3214
10619


In [31]:
total['irb_binary'][:1537].value_counts(normalize=True)

0    0.829538
1    0.170462
Name: irb_binary, dtype: float64

In [32]:
total['irb_binary'][1537:].value_counts(normalize=True)

0    0.674961
1    0.325039
Name: irb_binary, dtype: float64

### Regulatory Binary

In [33]:
regulatory = []
for i in range(0, len(total['irb_binary'])):
    if total['irb_binary'][i] == 1 or total['animal_binary'][i] == 1:
        if total['irb_binary'][i] == 1 and total['animal_binary'][i] == 1:
            regulatory.append(2)
        else:
            regulatory.append(1)
    else:
        regulatory.append(0)
len(regulatory)

10619

In [34]:
total['reg_binary'] = regulatory
total = total.rename(columns={'reg_binary':'regulatory'})
total['regulatory'][:1537].value_counts(normalize=True)

1    0.545869
0    0.384515
2    0.069616
Name: regulatory, dtype: float64

In [35]:
total['regulatory'][1537:].value_counts(normalize=True)

1    0.581150
0    0.354327
2    0.064523
Name: regulatory, dtype: float64

### Review Binary

In [36]:
list_articles = []
review = []
for i in range(0, len(total['clean_text'])):
    count = 0
    word_count = 0
    for j in total['clean_text'][i].split():
        if j == 'review':
            if total['clean_text'][i].split()[word_count-1] == 'this':
                if i not in list_articles:
                    list_articles.append(i)
                    review.append(1)
                    count = 1
        else:
            pass
        word_count += 1
    if count == 0:
        review.append(0)
    else:
        pass
                  
print(len(list_articles))
print(len(review))

total['review_binary'] = review

234
10619


In [37]:
total['review_binary'][:1537].value_counts(normalize=True)

0    0.960963
1    0.039037
Name: review_binary, dtype: float64

In [38]:
total['review_binary'][1537:].value_counts(normalize=True)

0    0.980841
1    0.019159
Name: review_binary, dtype: float64

### Novel Ideas

In [39]:
list_articles = []
novel_idea = []
for i in range(0, len(total['clean_text'])):
    count = 0
    novel_count = 0
    for j in total['clean_text'][i].split():
        if j == 'novel':
            for k in total['clean_text'][i].split():
                if k == 'novel':
                    novel_count += 1
            if novel_count > 1:
                if i not in list_articles:
                    list_articles.append(i)
                    novel_idea.append(1)
                    count = 1
        else:
            pass
    if count == 0:
        novel_idea.append(0)
    else:
        pass
            
print(len(list_articles))
print(len(novel_idea))
total['novel_idea'] = novel_idea

1565
10619


In [40]:
total['novel_idea'][:1537].value_counts(normalize=True)

0    0.826936
1    0.173064
Name: novel_idea, dtype: float64

In [41]:
total['novel_idea'][1537:].value_counts(normalize=True)

0    0.85697
1    0.14303
Name: novel_idea, dtype: float64

### Text Readability

In [44]:
def readability(dataframe):
    flesch_reading_ease_value = []
    flesch_kincaid_grade_value = []
    
    for i in dataframe['clean_text']:
        flesch_reading_ease_value.append(textstat.flesch_reading_ease(i))
        flesch_kincaid_grade_value.append(textstat.flesch_kincaid_grade(i))

    dataframe['flesch_reading_ease'] = flesch_reading_ease_value
    dataframe['flesch_kincaid_grade'] = flesch_kincaid_grade_value
    print(len(dataframe['flesch_reading_ease']))
    return

In [45]:
readability(retract)
readability(no_retract)
readability(total)

1537
9082
10619


In [46]:
retract.to_csv('./retract_feature_engineered_data.csv')
no_retract.to_csv('./no_retract_feature_engineered_data.csv')
total.to_csv('./total_feature_engineered_data.csv')