## First round of feature engineering - keyword matching feature
The following keywords are searched for within titles and abstracts:
* literature review
* review
* survey
* experiment
* interview
* case study
* questionnaire
* design science
* meta-analysis

If the keyword is found, the value is set to 1. If not, it is set to 0. If there is no title or abstract given, the value is set to 2.

In [1]:
from ast import literal_eval
import pandas as pd

# method to build the features
# both params have to be .csv files
def build_features(source_path,store_path):
    
    # get preprocessed data
    df = pd.read_csv(source_path)

    # fill nan-values for references with empty list
    df.references.fillna("[]", inplace=True)

    # change references column type from string into array
    df.references = df.references.apply(literal_eval)

    # add new features to dataframe
    addKeywordFeature(df, "literature review", "title", "title_literaturereview")
    addKeywordFeature(df, "literature review", "abstract", "abstract_literaturereview")
    addKeywordFeature(df, "review", "title", "title_review")
    addKeywordFeature(df, "review", "abstract", "abstract_review")
    addKeywordFeature(df, "survey", "title", "title_survey")
    addKeywordFeature(df, "survey", "abstract", "abstract_survey")
    addKeywordFeature(df, "experiment", "title", "title_experiment")
    addKeywordFeature(df, "experiment", "abstract", "abstract_experiment")
    addKeywordFeature(df, "interview", "title", "title_interview")
    addKeywordFeature(df, "interview", "abstract", "abstract_interview")
    addKeywordFeature(df, "case study", "title", "title_casestudy")
    addKeywordFeature(df, "case study", "abstract", "abstract_casestudy")
    addKeywordFeature(df, "questionnaire", "title", "title_questionnaire")
    addKeywordFeature(df, "questionnaire", "abstract", "abstract_questionnaire")
    addKeywordFeature(df, "design science", "title", "title_designscience")
    addKeywordFeature(df, "design science", "abstract", "abstract_designscience")
    addKeywordFeature(df, "meta-analysis", "title", "title_metaanalysis")
    addKeywordFeature(df, "meta-analysis", "abstract", "abstract_metaanalysis")

    # drop columns that are not needed anymore
    df.drop(['title', 'abstract', 'references'], axis = 1, inplace = True)

    # store resulting dataframe as csv
    df.to_csv(store_path, index=False)
   
   
    
# helper function to add keyword-matching features
def addKeywordFeature(df, keyword, column, col_name):
    toAdd = []
    if column == "abstract":
        for index, row in df.loc[:, [column]].iterrows():
            if pd.isna(df[column][index]):
                toAdd.append(2)
            elif keyword in row.abstract.lower():
                toAdd.append(1)
            else:
                toAdd.append(0)
        df.insert(loc=len(df.columns), column=col_name, value=toAdd)
    elif column == "title":
        for index, row in df.loc[:, [column]].iterrows():
            if pd.isna(df[column][index]):
                toAdd.append(2)
            elif keyword in row.title.lower():
                toAdd.append(1)
            else:
                toAdd.append(0)
        df.insert(loc=len(df.columns), column=col_name, value=toAdd)

In [2]:
# build features for original dataset
build_features('../data/interim/data_original.csv','../data/processed/original_dataset/data_key.csv')

# build features for extended dataset
build_features('../data/interim/data_extended.csv','../data/processed/extended_dataset/data_key.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.references.fillna("[]", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.references.fillna("[]", inplace=True)


KeyboardInterrupt: 