## Build features with extended dataset and references feature

In [1]:
from ast import literal_eval
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from sklearn.feature_selection import VarianceThreshold

# Load the interim dataset.
df = pd.read_csv("../data/interim/data_extended.csv")

# Missing reference cells are read from the CSV as NaN. Replace them with the
# string form of an empty list so that every cell can be parsed below.
#
# The column is reassigned instead of calling ``fillna(..., inplace=True)`` on
# ``df.references``: that chained in-place call works on a temporary Series,
# triggers a chained-assignment warning in recent pandas versions and no
# longer updates ``df`` once Copy-on-Write is active, which would leave NaN
# values behind and make ``literal_eval`` fail.
#
# ``literal_eval`` then turns each cell from its string representation
# (e.g. "['a', 'b']") into the Python object it spells out.
df["references"] = df["references"].fillna("[]").apply(literal_eval)



# keyword-matching features

def addKeywordFeature(df, keyword, column, col_name):
    """Append a keyword-presence feature column to ``df`` (in place).

    Every cell of ``df[column]`` is encoded as:

    * ``2`` - the cell is missing (``None`` / NaN),
    * ``1`` - ``keyword`` occurs in the lower-cased cell text,
    * ``0`` - otherwise.

    Args:
        df: DataFrame to extend; the new column is inserted as the last column.
        keyword: Substring to look for. It is compared against lower-cased
            text and is not lower-cased itself, so it should be lower-case.
        column: Text column to search; only ``"abstract"`` and ``"title"``
            are supported.
        col_name: Name of the feature column to create.

    Returns:
        None. ``df`` is modified in place. For an unsupported ``column`` a
        ``UserWarning`` is emitted and ``df`` is left unchanged.
    """
    if column not in ("abstract", "title"):
        # Unsupported columns were previously skipped without any feedback,
        # which hides mistakes such as swapped arguments. Keep the no-op so
        # existing pipelines do not start failing, but make it visible.
        import warnings

        warnings.warn(
            "addKeywordFeature: unsupported column %r; feature %r was not added"
            % (column, col_name),
            stacklevel=2,
        )
        return

    def encode(text):
        # Missing text gets its own code so it stays distinguishable from
        # "keyword not present".
        if pd.isna(text):
            return 2
        return 1 if keyword in text.lower() else 0

    # Iterate over the cell values directly instead of looking each one up by
    # index label: label lookups return a Series (not a scalar) when the index
    # contains duplicates, which breaks the missing-value check.
    flags = [encode(text) for text in df[column]]
    df.insert(loc=len(df.columns), column=col_name, value=flags)

# (keyword to search for, suffix used in the generated column names).
# Keywords are written in lower case because addKeywordFeature compares them
# against lower-cased text.
keyword_features = [
    ("literature review", "literaturereview"),
    ("review", "review"),
    ("survey", "survey"),
    ("experiment", "experiment"),
    ("interview", "interview"),
    ("case study", "casestudy"),
    ("questionnaire", "questionnaire"),
    ("design science", "designscience"),
    # The meta-analysis features must search for "meta-analysis" in the title
    # and abstract columns; passing "meta-analysis" as the column name matches
    # no supported column and creates no feature at all.
    ("meta-analysis", "metaanalysis"),
]

# For every keyword add the title_* column followed by the abstract_* column,
# so the resulting column order is title/abstract pairs in list order.
for keyword, suffix in keyword_features:
    addKeywordFeature(df, keyword, "title", "title_" + suffix)
    addKeywordFeature(df, keyword, "abstract", "abstract_" + suffix)



# method paper matching feature

# Read the list of literature review method papers and extract their DOIs;
# rows without a DOI are dropped.
df_method_papers = pd.read_csv("../data/external/lr-method-papers.csv", usecols=["doi"])
df_method_papers.dropna(inplace=True)
method_papers = df_method_papers['doi'].tolist()

# Set copy of the DOIs so each membership test is a hash lookup instead of a
# scan over the whole list.
# NOTE(review): assumes every reference entry is hashable (e.g. a DOI string)
# -- confirm against the interim data.
method_paper_dois = set(method_papers)

# For each row, count how many of its references are method papers. Repeated
# entries in a reference list are counted each time they occur. The column
# values are iterated directly rather than looked up by index label, which
# would return a Series instead of a list if the index had duplicate labels.
reference_count = [
    sum(1 for doi in references if doi in method_paper_dois)
    for references in df["references"]
]

df.insert(loc=len(df.columns), column="references_count", value=reference_count)



# Remove the text and reference columns from the frame before exporting.
df.drop(columns=['title', 'abstract', 'references'], inplace=True)

# Write the resulting table to the processed-data folder, leaving out the
# DataFrame index.
output_path = '../data/processed/extended_dataset/data_ref.csv'
df.to_csv(output_path, index=False)