# Feature Engineering

### abstract_literaturereview
1 if the abstract contains "literature review" (case-insensitive), otherwise 0

### abstract_review
1 if the abstract contains "review" (case-insensitive), otherwise 0

### abstract_survey
1 if abstract contains "survey"

### title_literaturereview
1 if title contains "literature review"

### title_review
1 if title contains "review"

### title_survey
1 if title contains "survey"

In [5]:
import pandas as pd
from ast import literal_eval

# Load the interim Crossref export (path is relative to the notebook's directory).
df = pd.read_csv('../data/interim/dataCrossref.csv')

# Missing "references" cells are replaced with the literal "[]" so that every
# cell can be parsed by literal_eval (presumably each cell holds the repr of a
# Python list -- verify against the script that writes the CSV).
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column accessed through the frame is chained assignment, which recent pandas
# versions warn about and which does not modify the frame under copy-on-write.
df["references"] = df["references"].fillna("[]").apply(literal_eval)

# Remove the bibliographic columns that are not used below.
df.drop(
    columns=[
        'language', 'url', 'pages', 'number', 'volume',
        'year', 'journal', 'author', 'ENTRYTYPE', 'doi',
    ],
    inplace=True,
)

def addKeywordFeature(df, keyword, column, col_name):
    """Append a 0/1 keyword-indicator column to ``df`` in place.

    For every row, the new column holds 1 when the lower-cased text in
    ``column`` contains ``keyword`` as a substring, and 0 otherwise.
    Missing or non-string values yield 0.

    Args:
        df: DataFrame to extend; it is modified in place.
        keyword: Substring to look for. It is compared against lower-cased
            text, so it must itself be lower case to ever match.
        column: Name of the text column to search. Any existing column is
            accepted (not only "abstract" and "title").
        col_name: Name of the indicator column, appended as the last column.

    Raises:
        KeyError: If ``column`` does not exist in ``df``.
        ValueError: If ``col_name`` already exists in ``df`` (raised by
            ``DataFrame.insert``).
    """
    # Iterate over the column values directly instead of looking each row up
    # by index label; this also works when the index contains duplicates.
    flags = [
        int(isinstance(text, str) and keyword in text.lower())
        for text in df[column]
    ]
    df.insert(loc=len(df.columns), column=col_name, value=flags)

# Create one title indicator and one abstract indicator per keyword.
# The resulting columns are named "<column>_<keyword without spaces>" and are
# appended keyword by keyword, title first and abstract second.
for feature_keyword in ("literature review", "review", "survey", "experiment", "interview"):
    keyword_suffix = feature_keyword.replace(" ", "")
    for text_column in ("title", "abstract"):
        addKeywordFeature(df, feature_keyword, text_column, text_column + "_" + keyword_suffix)

# DOIs of the methodological papers whose citation is taken as an indicator of
# a literature review.
lr_dois = [
    "10.2307/4132319",
    "10.28945/479",
    "10.1057/ejis.2011.51",
    "10.1057/ejis.2014.7",
    "10.17705/1CAIS.03412",
    "10.1057/jit.2014.26",
    "10.17705/1CAIS.03743",
    "10.17705/1CAIS.03712",
    "10.17705/1CAIS.03706",
    "10.17705/1CAIS.03708",
    "10.1016/j.im.2014.08.008",
    "10.17705/1jais.00501",
]

# DOIs are case-insensitive, so both sides are lower-cased before comparing;
# a set gives constant-time membership tests.
lr_dois_lower = {doi.lower() for doi in lr_dois}

# For each row, count how many of its references are one of the DOIs above.
# NOTE(review): this assumes each entry of df["references"] is a list of DOI
# strings; entries that are not strings are never counted -- confirm the
# format produced by the upstream step.
reference_count = [
    sum(isinstance(doi, str) and doi.lower() in lr_dois_lower for doi in refs)
    for refs in df["references"]
]

print(reference_count)
df.insert(loc=len(df.columns), column="references_count", value=reference_count)
df.value_counts("references_count")




[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

references_count
0    1475
dtype: int64

## Citations indicating literature reviews, with DOIs (based on Bai et al. 2019)
* Webster and Watson (2002): 10.2307/4132319
* Levy and Ellis (2006): 10.28945/479
* Wolfswinkel et al. (2013): 10.1057/ejis.2011.51
* Rowe (2014): 10.1057/ejis.2014.7
* Boell and Cecez-Kecmanovic (2014): 10.17705/1CAIS.03412
* Boell and Cecez-Kecmanovic (2015): 10.1057/jit.2014.26
* Okoli (2015): 10.17705/1CAIS.03743
* Schryen (2015): 10.17705/1CAIS.03712
* Templier and Paré (2015): 10.17705/1CAIS.03706
* Bandara et al. (2015): 10.17705/1CAIS.03708
* Paré et al. (2015): 10.1016/j.im.2014.08.008
* Leidner (2018): 10.17705/1jais.00501