In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

In [71]:
# Load the non-retracted and retracted paper sets, discarding the three
# 'Unnamed: 0*' columns that each CSV carries.
no_retract = (
    pd.read_csv('./no_retraction_data_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])
)

retract = (
    pd.read_csv('./retraction_data_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])
)

In [72]:
# Load the full dataset (presumably retracted + non-retracted papers combined)
# and discard the three 'Unnamed: 0*' columns.
total = (
    pd.read_csv('./total_data_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])
)
total.head()

Unnamed: 0,doi,year,month,day,volume,issue,journal,title,text,abstract,keywords,publisher,retraction_binary,unpacked_keywords,clean_text,clean_text_lem
0,10.1208/s12249-016-0596-x,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,‘Polymorphism’ generally referred as the abili...,This study elucidates the physical properties ...,"['monoclinic', 'nano-sized crystals', 'orthorh...",Springer International Publishing,1,"['monoclinic', 'nano-sized', 'crystals', 'orth...",Polymorphism generally referred as the ability...,Polymorphism generally referred a the ability ...
1,10.1021/acscentsci.9b00224,2019.0,5.0,9.0,5.0,6.0,ACS central science,Targeted Protein Internalization and Degradati...,Traditional\ndrug development efforts are focu...,Targeted,[],American Chemical Society,1,[],Traditional drug development efforts are focus...,Traditional drug development effort are focuse...
2,10.1021/acsomega.8b00488,2018.0,6.0,27.0,3.0,6.0,ACS omega,Regulating the Microstructure of Intumescent F...,Intumescent flame retardants\nare now being us...,A compatibilizer,[],American Chemical Society,1,[],Intumescent flame retardants are now being use...,Intumescent flame retardant are now being used...
3,10.1021/acsomega.8b00153,2018.0,6.0,25.0,3.0,6.0,ACS omega,Solid-to-Solid Crystallization of Organic Thin...,Crystal growth process is basic and essential ...,The solid-to-solid crystallization processes o...,[],American Chemical Society,1,[],Crystal growth process is basic and essential ...,Crystal growth process is basic and essential ...
4,10.1107/S1600536811022574,2011.0,6.0,18.0,67.0,,"Acta crystallographica. Section E, Structure r...",Oxonium picrate.,For general background to organic salts of pic...,"The title compound, H3O+·C6H2N3O7",[],International Union of Crystallography,1,[],For general background to organic salts of pic...,For general background to organic salt of picr...


In [73]:
# Bag-of-words vectorizer for the word-frequency exploration: word-level tokens,
# NLTK English stop words removed, vocabulary capped at 10,000 terms.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=stopwords.words("english"),
    max_features=10000,
)

In [74]:
# Learn the vocabulary from the retracted papers and count tokens in one pass,
# then expose the counts as a DataFrame with one column per vocabulary term.
vect = vectorizer.fit_transform(retract['clean_text_lem'])
vect_retract = pd.DataFrame(vect.toarray(), columns=vectorizer.get_feature_names())

In [75]:
# Preview the first rows of the retracted-paper token-count matrix.
vect_retract.head()

Unnamed: 0,0d0,10a,10b,10e8,10x,11a,11m088,125a,125b,126a,...,μmol,νmax,χ2,χn,χγ,χζ,ψb,ψh,ψp,ϵ0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# Observation: the vocabulary contains equation symbols (e.g. 'χ2', 'μmol') and number/letter combinations (e.g. '10a', '125b').

In [77]:
# First 1,000 column names (vocabulary terms) of the retracted-paper count matrix.
vect_retract.columns[:1000]

Index(['0d0', '10a', '10b', '10e8', '10x', '11a', '11m088', '125a', '125b',
       '126a',
       ...
       'behavioural', 'behind', 'beijing', 'bela', 'belfast', 'belgium',
       'belief', 'believe', 'believed', 'belong'],
      dtype='object', length=1000)

In [78]:
# Column names from position 9,000 onward (the tail of the 10,000-term vocabulary).
vect_retract.columns[9000:]

Index(['tendency', 'tendon', 'tends', 'tenofovir', 'tensile', 'tension',
       'term', 'termed', 'terminal', 'terminally',
       ...
       'μmol', 'νmax', 'χ2', 'χn', 'χγ', 'χζ', 'ψb', 'ψh', 'ψp', 'ϵ0'],
      dtype='object', length=1000)

In [79]:
# Total number of times each token occurs across all retracted papers.
# DataFrame.sum() adds up every column at once, replacing the manual per-column loop.
sum_row = vect_retract.sum()

# Add the totals as a new final row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so a one-row frame is concatenated instead (works on every version).
# NOTE: this cell is not idempotent -- re-running it appends another totals row that
# also includes the previous totals.
vect_retract = pd.concat([vect_retract, sum_row.to_frame().T], ignore_index=True)
vect_retract

Unnamed: 0,0d0,10a,10b,10e8,10x,11a,11m088,125a,125b,126a,...,μmol,νmax,χ2,χn,χγ,χζ,ψb,ψh,ψp,ϵ0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1534,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1535,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1536,0,0,0,0,0,0,0,0,0,0,...,0,0,8,0,0,0,0,0,0,0


In [80]:
# 50 most frequent tokens in the retracted papers. The totals row is the last row of
# the frame, so it is addressed from the end rather than by a hard-coded row number
# that is only valid for one particular dataset size.
vect_retract.iloc[-1, :].sort_values(ascending=False).head(50)

wa               157152
cell             128810
study             42884
group             40348
using             39680
level             39027
patient           37508
expression        35759
figure            35648
control           35270
usepackage        33089
used              31573
protein           31496
al                30500
et                29665
data              28523
analysis          27986
fig               27421
gene              26404
time              26017
also              25222
treatment         24966
effect            24135
ml                23430
result            22334
ha                21947
antibody          21716
two               21521
vitamin           20472
mouse             19963
sample            19096
compared          18733
different         17579
significant       17245
value             17214
concentration     17061
one               17035
model             17035
well              17003
day               16819
high              16804
disease         

NO RETRACTIONS

In [None]:
# Re-fit the same vectorizer on the non-retracted papers (its vocabulary is replaced)
# and expose the token counts as a DataFrame with one column per vocabulary term.
vect = vectorizer.fit_transform(no_retract['clean_text_lem'])
vect_no_retract = pd.DataFrame(vect.toarray(), columns=vectorizer.get_feature_names())

In [None]:
# Preview the first rows of the non-retracted-paper token-count matrix.
vect_no_retract.head()

In [None]:
# First 1,000 column names (vocabulary terms) of the non-retracted-paper count matrix.
vect_no_retract.columns[:1000]

In [None]:
# Column names from position 9,000 onward (the tail of the 10,000-term vocabulary).
vect_no_retract.columns[9000:]

In [None]:
# Total number of times each token occurs across all non-retracted papers.
# DataFrame.sum() adds up every column at once, replacing the manual per-column loop.
sum_row = vect_no_retract.sum()

# Add the totals as a new final row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so a one-row frame is concatenated instead (works on every version).
# NOTE: this cell is not idempotent -- re-running it appends another totals row that
# also includes the previous totals.
vect_no_retract = pd.concat([vect_no_retract, sum_row.to_frame().T], ignore_index=True)
vect_no_retract

In [None]:
# 50 most frequent tokens in the non-retracted papers. The totals row is the last row
# of the frame, so it is addressed from the end rather than by a hard-coded row number
# that is only valid for one particular dataset size.
vect_no_retract.iloc[-1, :].sort_values(ascending=False).head(50)

TOTAL

In [81]:
# Re-display the first rows of the full dataset before building the model inputs.
total.head()

Unnamed: 0,doi,year,month,day,volume,issue,journal,title,text,abstract,keywords,publisher,retraction_binary,unpacked_keywords,clean_text,clean_text_lem
0,10.1208/s12249-016-0596-x,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,‘Polymorphism’ generally referred as the abili...,This study elucidates the physical properties ...,"['monoclinic', 'nano-sized crystals', 'orthorh...",Springer International Publishing,1,"['monoclinic', 'nano-sized', 'crystals', 'orth...",Polymorphism generally referred as the ability...,Polymorphism generally referred a the ability ...
1,10.1021/acscentsci.9b00224,2019.0,5.0,9.0,5.0,6.0,ACS central science,Targeted Protein Internalization and Degradati...,Traditional\ndrug development efforts are focu...,Targeted,[],American Chemical Society,1,[],Traditional drug development efforts are focus...,Traditional drug development effort are focuse...
2,10.1021/acsomega.8b00488,2018.0,6.0,27.0,3.0,6.0,ACS omega,Regulating the Microstructure of Intumescent F...,Intumescent flame retardants\nare now being us...,A compatibilizer,[],American Chemical Society,1,[],Intumescent flame retardants are now being use...,Intumescent flame retardant are now being used...
3,10.1021/acsomega.8b00153,2018.0,6.0,25.0,3.0,6.0,ACS omega,Solid-to-Solid Crystallization of Organic Thin...,Crystal growth process is basic and essential ...,The solid-to-solid crystallization processes o...,[],American Chemical Society,1,[],Crystal growth process is basic and essential ...,Crystal growth process is basic and essential ...
4,10.1107/S1600536811022574,2011.0,6.0,18.0,67.0,,"Acta crystallographica. Section E, Structure r...",Oxonium picrate.,For general background to organic salts of pic...,"The title compound, H3O+·C6H2N3O7",[],International Union of Crystallography,1,[],For general background to organic salts of pic...,For general background to organic salt of picr...


In [82]:
# Model input is the lemmatized text column; the target is the retraction_binary column.
X, y = total['clean_text_lem'], total['retraction_binary']

In [83]:
# Stratified split with a fixed seed. test_size is not passed, so scikit-learn's
# default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [84]:
# Class balance of the test split, as percentages rounded to two decimals.
(y_test.value_counts(normalize=True) * 100).round(2)

0    69.06
1    30.94
Name: retraction_binary, dtype: float64

In [None]:
# Bag-of-words vectorizer for the first model: word-level tokens, NLTK English
# stop words removed, vocabulary capped at 5,000 terms.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=stopwords.words("english"),
    max_features=5000,
)

In [None]:
# Learn the vocabulary from the training split only.
vectorizer.fit(X_train)
# Notes:
# - The default stop words are not very helpful because the papers are written
#   almost entirely in the third-person passive voice.
# - The stop words should be lemmatized as well (the text being vectorized is lemmatized).
# - TODO: build a science-specific stop-word list.

In [None]:
# Replace the raw-text splits with their token-count matrices.
# NOTE: after this cell X_train / X_test no longer hold text, so re-running it
# requires redoing the train/test split first.
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
# Densify both count matrices into DataFrames that share the vocabulary as column labels.
feature_names = vectorizer.get_feature_names()
X_train_df = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [None]:
# Multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Fit on the training counts, then print the score on the train and test splits.
mnb.fit(X_train_df, y_train)
train_score = mnb.score(X_train_df, y_train)
test_score = mnb.score(X_test_df, y_test)
print(train_score)
print(test_score)

In [None]:
# Summary of the per-token log-probabilities for class 1 (retraction_binary == 1).
# For a two-class model `coef_[0]` was just a view of `feature_log_prob_[1]`; `coef_`
# was deprecated in scikit-learn 0.24 and removed in 1.1, so the underlying attribute
# is read directly (available on every version).
positive_log_prob = mnb.feature_log_prob_[1]
print(positive_log_prob.max())
print(positive_log_prob.min())
print(positive_log_prob.mean())

In [None]:
# Confusion matrix on the test split, annotated with integer counts.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is the replacement when upgrading.
plot_confusion_matrix(mnb, X_test_df, y_test, cmap='Blues', values_format='d');

In [None]:
#Results log (CountVectorizer + MultinomialNB, defaults). TP = true 1/predicted 1, FN = true 1/predicted 0.
#5000 features, default everything, 71.9% train, 69.5% test, 202 TP, 182 FN
#10000 features, default everything, 75.0% train, 69.9% test, 198 TP, 186 FN
#20000 features, default everything, 80.0% train, 70.6% test, 200 TP, 184 FN
#50000 features, default everything, 85.7% train, 72.4% test, 195 TP, 189 FN
#100000 features, default everything, 87.5% train, 72.6% test, 197 TP, 187 FN

Adding to Stopwords List

In [None]:
# Lemmatize the NLTK English stop words so they match the lemmatized article text.
lemmatizer = WordNetLemmatizer()
lem_stopwords = list(map(lemmatizer.lemmatize, stopwords.words("english")))
lem_stopwords

In [None]:
# Extra stop words for this corpus: high-frequency tokens that carry little signal
# (e.g. 'wa'/'ha', 'et'/'al', 'fig'/'figure', 'usepackage').
retraction_stopwords = (
    "wa using et al figure usepackage used fig also "
    "ml ha two one may based table however data mm ms"
).split()

In [None]:
# Generic scientific-writing stop words, taken from:
#https://www.quora.com/Is-there-any-list-of-stopwords-related-to-scientific-papers
science_stopwords = """
    any apply applying reapplying given papers paper about
    results result real world page article present takes
    account previous work propose proposes proposed simply simple
    demonstrate demonstrated demonstrates realworld datasets dataset
    provide important research researchers experiments experiment unexpected
    discovering using recent collected solve columns existing traditional
    final consider presented provides automatically extracting including help
    helps explore illustrate achieve better
""".split()

In [None]:
# Start again from the raw columns: lemmatized text as input, retraction_binary as target.
X, y = total['clean_text_lem'], total['retraction_binary']

In [None]:
# Fresh stratified split (fixed seed, default split proportion) so that
# X_train / X_test hold raw text again.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Bag-of-words vectorizer using the extended stop-word list (NLTK + lemmatized NLTK +
# corpus-specific + science-specific) and a 20,000-term vocabulary.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=(stopwords.words("english") + lem_stopwords
                + retraction_stopwords + science_stopwords),
    max_features=20000,
)

In [None]:
# Learn the vocabulary from the training split only.
vectorizer.fit(X_train)

In [None]:
# Replace the raw-text splits with their token-count matrices.
# NOTE: after this cell X_train / X_test no longer hold text, so re-running it
# requires redoing the train/test split first.
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
# Densify both count matrices into DataFrames that share the vocabulary as column labels.
feature_names = vectorizer.get_feature_names()
X_train_df = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [None]:
# Fresh Multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Fit on the training counts, then print the score on the train and test splits.
mnb.fit(X_train_df, y_train)
train_score = mnb.score(X_train_df, y_train)
test_score = mnb.score(X_test_df, y_test)
print(train_score)
print(test_score)

In [None]:
# Summary of the per-token log-probabilities for class 1 (retraction_binary == 1).
# For a two-class model `coef_[0]` was just a view of `feature_log_prob_[1]`; `coef_`
# was deprecated in scikit-learn 0.24 and removed in 1.1, so the underlying attribute
# is read directly (available on every version).
positive_log_prob = mnb.feature_log_prob_[1]
print(positive_log_prob.max())
print(positive_log_prob.min())
print(positive_log_prob.mean())

In [None]:
# Count how many tokens share the smallest class-1 log-probability.
# The minimum is taken from the fitted model instead of a float literal pasted from an
# earlier run: the literal only matched one specific fit, so any change to the data,
# stop words or max_features silently produced a count of 0.
# `feature_log_prob_[1]` holds the values the removed `coef_[0]` used to expose.
positive_log_prob = mnb.feature_log_prob_[1]
count = int((positive_log_prob == positive_log_prob.min()).sum())
print(count)

In [None]:
# Confusion matrix on the test split, annotated with integer counts.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is the replacement when upgrading.
plot_confusion_matrix(mnb, X_test_df, y_test, cmap='Blues', values_format='d');

In [None]:
#Results log (CountVectorizer + extended stop words). TP = true 1/predicted 1, FN = true 1/predicted 0.
#5000 features, more stopwords, 71.7% train, 69.3% test, 199 TP, 185 FN
#20000 features, more stopwords, 80.1% train, 70.8% test, 200 TP, 184 FN
#100000 features, more stopwords, 87.6% train, 72.6% test, 196 TP, 188 FN

In [None]:
# Vectorizer for repeating the word-frequency exploration with the extended
# stop-word list; vocabulary capped at 10,000 terms.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=(stopwords.words("english") + lem_stopwords
                + retraction_stopwords + science_stopwords),
    max_features=10000,
)

In [None]:
# Learn the vocabulary from the retracted papers and count tokens in one pass,
# then expose the counts as a DataFrame with one column per vocabulary term.
vect = vectorizer.fit_transform(retract['clean_text_lem'])
vect_retract = pd.DataFrame(vect.toarray(), columns=vectorizer.get_feature_names())

In [None]:
# First 1,000 column names (vocabulary terms) after applying the extended stop words.
vect_retract.columns[:1000]

In [None]:
# Column names from position 9,000 onward (the tail of the 10,000-term vocabulary).
vect_retract.columns[9000:]

In [None]:
# Total number of times each token occurs across all retracted papers.
# DataFrame.sum() adds up every column at once, replacing the manual per-column loop.
sum_row = vect_retract.sum()

# Add the totals as a new final row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so a one-row frame is concatenated instead (works on every version).
# NOTE: this cell is not idempotent -- re-running it appends another totals row that
# also includes the previous totals.
vect_retract = pd.concat([vect_retract, sum_row.to_frame().T], ignore_index=True)
vect_retract

In [None]:
# 50 most frequent tokens in the retracted papers. The totals row is the last row of
# the frame, so it is addressed from the end rather than by a hard-coded row number
# that is only valid for one particular dataset size.
vect_retract.iloc[-1, :].sort_values(ascending=False).head(50)

In [None]:
# Re-fit the same vectorizer on the non-retracted papers (its vocabulary is replaced)
# and expose the token counts as a DataFrame with one column per vocabulary term.
vect = vectorizer.fit_transform(no_retract['clean_text_lem'])
vect_no_retract = pd.DataFrame(vect.toarray(), columns=vectorizer.get_feature_names())

In [None]:
# First 1,000 column names (vocabulary terms) after applying the extended stop words.
vect_no_retract.columns[:1000]

In [None]:
# Column names from position 9,000 onward (the tail of the 10,000-term vocabulary).
vect_no_retract.columns[9000:]

In [None]:
# Total number of times each token occurs across all non-retracted papers.
# DataFrame.sum() adds up every column at once, replacing the manual per-column loop.
sum_row = vect_no_retract.sum()

# Add the totals as a new final row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so a one-row frame is concatenated instead (works on every version).
# NOTE: this cell is not idempotent -- re-running it appends another totals row that
# also includes the previous totals.
vect_no_retract = pd.concat([vect_no_retract, sum_row.to_frame().T], ignore_index=True)
vect_no_retract

In [None]:
# 50 most frequent tokens in the non-retracted papers. The totals row is the last row
# of the frame, so it is addressed from the end rather than by a hard-coded row number
# that is only valid for one particular dataset size.
vect_no_retract.iloc[-1, :].sort_values(ascending=False).head(50)

TF-IDF Vectorizer

In [None]:
# Start again from the raw columns: lemmatized text as input, retraction_binary as target.
X, y = total['clean_text_lem'], total['retraction_binary']

In [None]:
# Fresh stratified split (fixed seed, default split proportion) so that
# X_train / X_test hold raw text again.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# TF-IDF vectorizer over unigrams and bigrams, limited to 5,000 features,
# using the extended stop-word list.
tvec = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words=(stopwords.words("english") + lem_stopwords
                + retraction_stopwords + science_stopwords),
)

In [None]:
# Fit TF-IDF on the training text only, apply it to both splits, and wrap the dense
# matrices in DataFrames labelled with the learned terms.
train_tfidf = tvec.fit_transform(X_train)
test_tfidf = tvec.transform(X_test)
tfidf_terms = tvec.get_feature_names()
X_train_df = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_terms)
X_test_df = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_terms)

In [None]:
# Fresh Multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Fit on the TF-IDF training features, then print the score on the train and test splits.
mnb.fit(X_train_df, y_train)
train_score = mnb.score(X_train_df, y_train)
test_score = mnb.score(X_test_df, y_test)
print(train_score)
print(test_score)

In [None]:
# Confusion matrix on the test split, annotated with integer counts.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is the replacement when upgrading.
plot_confusion_matrix(mnb, X_test_df, y_test, cmap='Blues', values_format='d');

In [None]:
#Results log (TF-IDF + extended stop words). TP = predicted 1/true 1, FN = predicted 0/true 1.
#5000 features, more stopwords/tfidf bigrams, 75.2% train, 73.2% test, 183 TP, 201 FN
#20000 features, more stopwords/tfidf bigrams, 78.5% train, 74.5% test, 135 TP, 249 FN
#100000 features, more stopwords/tfidf bigrams, 75.7% train, 73.2% test, 57 TP, 327 FN

#5000 features, more stopwords/tfidf trigrams, 75.1% train, 73.2% test, 182 TP, 202 FN
#20000 features, more stopwords/tfidf trigrams, 78.2% train, 74.2% test, 135 TP, 249 FN
#100000 features, more stopwords/tfidf trigrams, 75.8% train, 73.8% test, 67 TP, 317 FN

Looked at the top 30 features for the run "5000, more stopwords/tfidf bigrams, 75.2% train, 73.2% test, 183 predict/true 1, 201 predict 0/true 1": all were single words (no bigrams).

Changing Test Size

In [None]:
# Start again from the raw columns: lemmatized text as input, retraction_binary as target.
X, y = total['clean_text_lem'], total['retraction_binary']

In [None]:
# Stratified split with a fixed seed, holding out 20% of the papers for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# CountVectorizer + MultinomialNB run for the current train/test split
# (extended stop words, 20,000-term vocabulary).
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=(stopwords.words("english") + lem_stopwords
                + retraction_stopwords + science_stopwords),
    max_features=20000,
)

# The count matrices get their own names so that X_train / X_test keep holding the raw
# text. The TF-IDF cell that follows vectorizes X_train / X_test itself; overwriting them
# here handed it sparse count matrices instead of documents on a top-to-bottom run.
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

feature_names = vectorizer.get_feature_names()
X_train_df = pd.DataFrame(X_train_counts.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test_counts.toarray(), columns=feature_names)

mnb = MultinomialNB()

# Fit on the training counts, then print the score on the train and test splits.
mnb.fit(X_train_df, y_train)
print(mnb.score(X_train_df, y_train))
print(mnb.score(X_test_df, y_test))

plot_confusion_matrix(mnb, X_test_df, y_test, cmap='Blues', values_format='d');

In [None]:
# TF-IDF + MultinomialNB run for the current train/test split (unigrams + bigrams,
# 5,000 features, extended stop words).
# Expects X_train / X_test to hold the raw-text splits.
tvec = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words=(stopwords.words("english") + lem_stopwords
                + retraction_stopwords + science_stopwords),
)

# Fit on the training text only, apply to both splits, and label columns with the terms.
train_tfidf = tvec.fit_transform(X_train)
test_tfidf = tvec.transform(X_test)
tfidf_terms = tvec.get_feature_names()
X_train_df = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_terms)
X_test_df = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_terms)

mnb = MultinomialNB()

mnb.fit(X_train_df, y_train)
train_score = mnb.score(X_train_df, y_train)
test_score = mnb.score(X_test_df, y_test)
print(train_score)
print(test_score)

plot_confusion_matrix(mnb, X_test_df, y_test, cmap='Blues', values_format='d');

In [69]:
def class1_recall(hits, misses):
    """Fraction of true class-1 papers that were also predicted as class 1."""
    return hits / (hits + misses)


# CountVectorizer runs: class-1 recall for each test size.
#20000 features, more stopwords/test size 0.2, 79.7% train, 70.7% test, 144 true/predict 1, 163 true 1/predict 0
print(class1_recall(144, 163))
#20000 features, more stopwords, 80.1% train, 70.8% test, 200 true/predict 1, 184 true 1/predict 0
print(class1_recall(200, 184))
#20000 features, more stopwords/test size 0.3, 81.1% train, 71.6% test, 237 true/predict 1, 224 true 1/predict 0
print(class1_recall(237, 224))

print(" ")

# TF-IDF runs: class-1 recall for each test size.
#5000, more stopwords/tfidf bigrams/test size 0.2, 74.9% train, 73.3% test, 144 predict/true 1, 163 predict 0/true 1
print(class1_recall(144, 163))
#5000, more stopwords/tfidf bigrams, 75.2% train, 73.2% test, 183 predict/true 1, 201 predict 0/true 1
print(class1_recall(183, 201))
#5000, more stopwords/tfidf bigrams/test size 0.3, 75.1% train, 73.8% test, 219 predict/true 1, 242 predict 0/true 1
print(class1_recall(219, 242))

0.46905537459283386
0.5208333333333334
0.5140997830802603
 
0.46905537459283386
0.4765625
0.4750542299349241
