# Feature Engineering - text mining feature

In [5]:
import re
import string
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse    


df = pd.read_csv('../data/interim/data.csv')
df.references.fillna("[]", inplace=True)
df.references = df.references.apply(literal_eval)


# text mining with bag of words

df.abstract.fillna("", inplace=True)
df.title.fillna("", inplace=True)

# clean abstracts & titles
def clean_text(text):
    
    # lowercase
    text = text.lower()
    
    # remove punctuation and multiple spaces
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", " ", text
    )
    text = " ".join(text.split())
    
    remove_digits = str.maketrans('', '', string.digits)
    text = text.translate(remove_digits)
    
    return text

df["clean_abstracts"] = df.abstract.map(clean_text, na_action="ignore")
df["clean_titles"] = df.title.map(clean_text, na_action="ignore")

bow_abstracts = CountVectorizer(ngram_range=(1,2), stop_words="english")
bow_titles = CountVectorizer(ngram_range=(1,2), stop_words="english")

abstract_matrix = bow_abstracts.fit_transform(df.clean_abstracts)
title_matrix = bow_titles.fit_transform(df.clean_titles)

#bow_abstracts.get_feature_names_out()

abstract_matrix


#df.to_csv("../data/processed/data.csv", index=False)


df_abstracts = pd.DataFrame.sparse.from_spmatrix(abstract_matrix)
df_titles = pd.DataFrame.sparse.from_spmatrix(title_matrix)

df_titles.rename(columns=lambda x: str(x) + "_title", inplace=True)
df_abstracts.rename(columns=lambda x: str(x) + "_abstracts", inplace=True)

df_final = pd.concat([df, df_titles, df_abstracts], axis=1)

df_final




Unnamed: 0,literature_review,abstract,title,references,clean_abstracts,clean_titles,0_title,1_title,2_title,3_title,...,29873_abstracts,29874_abstracts,29875_abstracts,29876_abstracts,29877_abstracts,29878_abstracts,29879_abstracts,29880_abstracts,29881_abstracts,29882_abstracts
0,1,,Women in the information technology profession...,"[10.1177/0146167290163002, 10.2307/800806, 10....",,women in the information technology profession...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,"More than 10 years ago, Orlikowski and Iacono ...",The ongoing quest for the IT artifact - Lookin...,"[10.2307/256820, 10.1177/001316446002000104, 1...",more than years ago orlikowski and iacono ex...,the ongoing quest for the it artifact looking ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,,Review: Knowledge Management and Knowledge Man...,[],,review knowledge management and knowledge mana...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,In this paper I review the Information Systems...,Information systems in developing countries - ...,"[10.1080/02681102.1996.9627212, 10.1080/026811...",in this paper i review the information systems...,information systems in developing countries a ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,,Privacy in the Digital Age - A Review of Infor...,[],,privacy in the digital age a review of informa...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1470,0,,Talking about Technology - The Emergence of a ...,[],,talking about technology the emergence of a ne...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1471,0,,Evaluating Journal Quality and the Association...,[],,evaluating journal quality and the association...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1472,0,,Differential Influence of Blogs Across Differe...,[],,differential influence of blogs across differe...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1473,0,,A Longitudinal Study of Herd Behavior in the A...,[],,a longitudinal study of herd behavior in the a...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## citations indicating literature reviews with doi (based on Bai et al. 2019)
* Webster and Watson (2002): 10.2307/4132319
* Levy and Ellis (2006): 10.28945/479
* Wolfswinkel et al. (2013): 10.1057/ejis.2011.51
* Rowe (2014): 10.1057/ejis.2014.7
* Boell and Cecez-Kecmanovic (2014): 10.17705/1CAIS.03412
* Boell and Cecez-Kecmanovic (2015): 10.1057/jit.2014.26
* Okoli (2015): 10.17705/1CAIS.03743
* Schryen (2015): 10.17705/1CAIS.03712
* Templier and Paré (2015): 10.17705/1CAIS.03706
* Bandara et al. (2015): 10.17705/1CAIS.03708
* Paré et al. (2015): 10.1016/j.im.2014.08.008
* Leidner (2018): 10.17705/1jais.00501