In [1]:
import re
import string
from ast import literal_eval

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE #-> oversampling technique
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# seed NumPy's global random number generator for the rest of the script
np.random.seed(42)


# get interim data
df = pd.read_csv("../data/interim/data.csv")



# fill nan-values for references with the string form of an empty list.
# The result is assigned back to the column instead of calling
# `df.references.fillna(..., inplace=True)`: an inplace method on a column
# selected from the frame is chained assignment, which is not guaranteed to
# update `df` (it never does under pandas Copy-on-Write) and would leave NaN
# values for literal_eval below to fail on.
df["references"] = df["references"].fillna("[]")



# change references type from string into array
df["references"] = df["references"].apply(literal_eval)



# keyword-matching features

def addKeywordFeature(df, keyword, column, col_name):
    """Append a keyword-match feature column to ``df`` (modified in place).

    Each row of the new column holds:

    * ``2`` if the value in ``column`` is missing (NaN/None),
    * ``1`` if ``keyword`` occurs in the lower-cased text of ``column``,
    * ``0`` otherwise.

    Args:
        df: DataFrame to extend; the new column is inserted as the last one.
        keyword: Substring to look for. The text is lower-cased before the
            comparison, the keyword is not, so pass it in lower case.
        column: Name of the text column to search (e.g. "title" or
            "abstract"). Any text column is accepted.
        col_name: Name of the feature column to create.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
        ValueError: If ``col_name`` already exists (raised by ``df.insert``).
    """
    flags = []
    # Iterate over the column's values directly rather than looking each one
    # up by index label, so a non-unique index cannot break the NaN check.
    for value in df[column]:
        if pd.isna(value):
            flags.append(2)
        elif keyword in value.lower():
            flags.append(1)
        else:
            flags.append(0)
    df.insert(loc=len(df.columns), column=col_name, value=flags)

# One feature column is added per (keyword, field) pair, title first and then
# abstract, named "<field>_<keyword with spaces removed>".
for keyword in (
    "literature review",
    "review",
    "survey",
    "experiment",
    "interview",
    "case study",
    "questionnaire",
    "design science",
):
    for field in ("title", "abstract"):
        addKeywordFeature(df, keyword, field, f"{field}_{keyword.replace(' ', '')}")



# method paper matching feature

# read in list of literature review method papers and extract dois
df_method_papers = pd.read_csv("../data/external/lr-method-papers.csv", usecols=["doi"])
df_method_papers.dropna(inplace = True)
method_papers = df_method_papers['doi'].tolist()

# Set of the same DOIs so each membership test is a hash lookup instead of a
# scan over the whole list (assumes the referenced DOIs are hashable values
# such as strings -- TODO confirm against the interim data).
method_paper_dois = set(method_papers)

# For every paper, count how many of its references are method papers.
# Repeated DOIs inside one reference list are counted each time they occur.
# NOTE(review): the comparison is an exact string match, so DOIs that differ
# only in letter case will not match -- confirm both sources are normalized.
reference_count = [
    sum(doi in method_paper_dois for doi in references)
    for references in df["references"]
]

df.insert(loc=len(df.columns), column="references_count", value=reference_count)




# text mining feature with bag of words

# Replace missing abstracts/titles with empty strings. The filled column is
# assigned back instead of calling `.fillna(..., inplace=True)` on a column
# selected from the frame: that is chained assignment and is not guaranteed to
# update `df` (it never does under pandas Copy-on-Write).
df["abstract"] = df["abstract"].fillna("")
df["title"] = df["title"].fillna("")

# clean text helper function
# Translation table used by clean_text: every ASCII punctuation character is
# mapped to a space and every ASCII digit is deleted, in a single pass.
_CLEAN_TABLE = str.maketrans(
    {**dict.fromkeys(string.punctuation, " "), **dict.fromkeys(string.digits)}
)


def clean_text(text):
    """Return a normalized copy of ``text`` for bag-of-words tokenization.

    The text is lower-cased, ASCII punctuation is replaced by spaces, ASCII
    digits are removed, and finally runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Whitespace is collapsed *after* the digits are removed, so a number that
    stood between two words does not leave a double space behind and a
    purely numeric input yields an empty string.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower().translate(_CLEAN_TABLE)
    return " ".join(text.split())

# normalized copies of the abstracts and titles (missing values are skipped)
df["clean_abstracts"] = df["abstract"].map(clean_text, na_action="ignore")
df["clean_titles"] = df["title"].map(clean_text, na_action="ignore")

# bag of words: unigram + bigram counts with English stop words removed,
# using a separate vocabulary for abstracts and for titles
print("starting count vectorizer")
bow_abstracts = CountVectorizer(stop_words="english", ngram_range=(1, 2))
bow_titles = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# learn each vocabulary and build the sparse document-term matrices
abstract_matrix = bow_abstracts.fit_transform(df["clean_abstracts"])
title_matrix = bow_titles.fit_transform(df["clean_titles"])

# NOTE: a VarianceThreshold feature-selection step over both matrices was
# tried at this point and is currently disabled.

# wrap the sparse matrices in dataframes; columns are the term indices, given
# a suffix so title and abstract columns stay distinct after concatenation
df_abstracts = pd.DataFrame.sparse.from_spmatrix(abstract_matrix).rename(
    columns=lambda term_idx: f"{term_idx}_abstracts"
)
df_titles = pd.DataFrame.sparse.from_spmatrix(title_matrix).rename(
    columns=lambda term_idx: f"{term_idx}_title"
)

# side-by-side join: original columns, then title terms, then abstract terms
df_final = pd.concat([df, df_titles, df_abstracts], axis=1)



# the raw text/reference columns and their cleaned copies are removed now
# that the features derived from them are in place
df_final.drop(
    columns=['title', 'abstract', 'references', 'clean_titles', 'clean_abstracts'],
    inplace=True,
)

df = df_final

# target is the literature_review column; every remaining column is a feature
y = df['literature_review']
X = df.drop(columns=['literature_review'])

# hold out 25% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# oversampled copy of the training data produced with SMOTE
oversample = SMOTE(k_neighbors=2)
over_X, over_y = oversample.fit_resample(X_train, y_train)


# candidate classifiers keyed by display name; estimators that accept it use
# class_weight='balanced'
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    'Support Vector Machines': SVC(class_weight='balanced'),
    'Naive Bayes': BernoulliNB(),
    'Decision Trees': DecisionTreeClassifier(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'K-Nearest Neighbor': KNeighborsClassifier(),
    'BRF': BalancedRandomForestClassifier(),
}

# per-model results, keyed by the model's display name
accuracy = {}
precision = {}
recall = {}
conf_mat = {}
f1 = {}

# metrics reported for every model
_SCORING = ("accuracy", "precision", "recall", "f1")


def _evaluate_model(name, model, features, target):
    """Cross-validate ``model`` and record its results under ``name``.

    All four metrics come from a single ``cross_validate`` run, so they
    describe the same fitted models (this matters for randomized estimators)
    and the model is not refitted once per metric. The mean over the folds is
    stored in the module-level ``accuracy``, ``precision``, ``recall`` and
    ``f1`` dicts; the confusion matrix of the out-of-fold predictions is
    stored in ``conf_mat``. A summary line is printed.

    Args:
        name: Key under which the results are stored and printed.
        model: Unfitted estimator to evaluate.
        features: Feature matrix passed to the cross-validation helpers.
        target: Labels matching ``features``.

    Returns:
        The out-of-fold predictions from ``cross_val_predict``.
    """
    scores = cross_validate(model, features, target, scoring=_SCORING)
    accuracy[name] = np.mean(scores["test_accuracy"])
    precision[name] = np.mean(scores["test_precision"])
    recall[name] = np.mean(scores["test_recall"])
    f1[name] = np.mean(scores["test_f1"])

    predictions = cross_val_predict(model, features, target)
    conf_mat[name] = confusion_matrix(target, predictions)

    print(name)
    print(f"f1: {f1[name]}, precision: {precision[name]}, recall: {recall[name]}, accuracy: {accuracy[name]}")
    return predictions


for key, model in models.items():
    y_pred = _evaluate_model(key, model, X_train, y_train)

# Random forest evaluated on the SMOTE-oversampled training data.
# NOTE(review): over_X/over_y were resampled before cross-validation, so the
# validation folds used here include synthetic samples and these scores are
# likely optimistic compared with the ones above -- consider resampling inside
# each fold (e.g. an imblearn Pipeline) and confirm k_neighbors still fits the
# per-fold minority class size.
models['SMOTE'] = RandomForestClassifier()
y_pred = _evaluate_model('SMOTE', models['SMOTE'], over_X, over_y)


starting count vectorizer


