### How? With whom?

In [1]:
import pandas as pd
import re

In [2]:
# Load the cleaned paper summaries; later cells read its PaperID and PaperSummary columns.
df = pd.read_csv('../../data/papersummaries_cleaned.csv')

NB: The Science category is deliberately not split into validation and evaluation. These terms are not very intuitive, and respondents are unlikely to have made such a distinction.

### The Rules

In [3]:
def assign_tag(level_1, level_2, level_3, summary):
    """Collect the two-level tags whose keyword patterns occur in a summary.

    Parameters
    ----------
    level_1 : str
        Prefix placed in front of every matching second-level tag.
    level_2 : iterable of str
        Second-level tag names, checked in the given order.
    level_3 : mapping
        Maps each second-level tag to a list of regular expressions.
    summary : str
        Text that is searched with ``re.search`` (no flags).

    Returns
    -------
    list of str
        ``level_1 + tag`` for every tag with at least one matching pattern,
        in the order of ``level_2``.
    """
    def has_match(tag):
        # Every pattern of the tag is tried before the verdict is taken.
        hits = [re.search(pattern, summary) for pattern in level_3[tag]]
        return any(hits)

    return [level_1 + tag for tag in level_2 if has_match(tag)]

In [4]:
def assign_engineering(summary):
    """Return the ':engineering' tags whose keyword patterns match ``summary``.

    The patterns are lower-case and are matched case-sensitively, so the
    caller is expected to pass an already lower-cased summary.

    Parameters
    ----------
    summary : str
        Paper summary to classify.

    Returns
    -------
    list of str
        Tags of the form ``':engineering:<subcategory>'`` as produced by
        ``assign_tag``; empty when nothing matches.
    """
    level_1 = ':engineering'

    level_2 = [':reference', ':technology', ':methodology']

    # Raw strings keep regex escapes such as \w intact without relying on
    # Python's handling of unknown string escapes; (?:...) is the
    # non-capturing group the original '(:?...)' spelling was meant to be.
    level_3 = {':reference':   [r'^a set of metrics', r'^a taxonomy', r'^an ontology',
                                r'^a template', r'(?<!\w)a blueprint'],
               ':technology':  [r'^a tool', r'^a solution'],
               ':methodology': [r'^a method', r'^a process', r'^a.{,15}technique', r'training program',
                                r'^a model', r'^a (?:modell?ing |specification )language',
                                r'^a (formal )?framework']
               }
    return assign_tag(level_1, level_2, level_3, summary)

def assign_science(summary):
    """Return the ':science' tags whose keyword patterns match ``summary``.

    The patterns are lower-case and are matched case-sensitively, so the
    caller is expected to pass an already lower-cased summary.

    Parameters
    ----------
    summary : str
        Paper summary to classify.

    Returns
    -------
    list of str
        Tags of the form ``':science:<subcategory>'`` as produced by
        ``assign_tag``; empty when nothing matches.
    """
    level_1 = ':science'

    level_2 = [':observation', ':intervention', ':interrogation']

    # Raw strings keep \s intact; (?:...) replaces the mistyped '(:?...)'
    # groups. The negative lookbehind keeps "literature survey" out of the
    # interrogation category.
    level_3 = {':observation':   [r'(?:(?:multi.)?case|field) study', r'(?:data.|document.)driven study',
                                  r'industrial evaluation', r'^an analysis'],
               ':intervention':  [r'experiment(?:s|\s)', r'project-based study',
                                  r'workshop-based industrial study', r'action research'],
               ':interrogation': [r'interview-based study|study based on.{,30}interviews',
                                  r'questionnaire', r'(?<!literature )(?:online.)?survey']
               }
    return assign_tag(level_1, level_2, level_3, summary)

def assign_perspective(summary):
    """Return the ':perspective' tags whose keyword patterns match ``summary``.

    The patterns are lower-case and are matched case-sensitively, so the
    caller is expected to pass an already lower-cased summary.

    Parameters
    ----------
    summary : str
        Paper summary to classify.

    Returns
    -------
    list of str
        Tags of the form ``':perspective:<subcategory>'`` as produced by
        ``assign_tag``; empty when nothing matches.
    """
    level_1 = ':perspective'

    level_2 = [':philosophy', ':opinion', ':experience', ':review']

    # Raw strings keep \s intact; (?:...) replaces the mistyped '(:?...)' group.
    level_3 = {':philosophy': [r'conceptual framework'],
               ':opinion':    [r'^a discussion', r'\svision', r'roadmap\s'],
               ':experience': [r'experience report'],
               ':review':     [r'literature (?:survey|study|review)', r'state of the art report']
               }
    return assign_tag(level_1, level_2, level_3, summary)


def assign_all_how(summary):
    """Return every 'how' tag for a summary.

    The summary is lower-cased once and handed to the engineering, science
    and perspective assigners (in that order); each resulting tag is
    prefixed with ``'how'``.

    Parameters
    ----------
    summary : str
        Paper summary to classify.

    Returns
    -------
    list of str
        Tags of the form ``'how:<category>:<subcategory>'``.
    """
    lowered = summary.lower()
    collected = []
    for assigner in (assign_engineering, assign_science, assign_perspective):
        collected.extend(assigner(lowered))
    return ['how' + tag for tag in collected]

In [5]:
def assign_all_withwhom(summary):
    """Return every 'withwhom' tag for a summary.

    The summary is lower-cased and searched for participant phrases; each
    participant group whose phrases occur yields one tag.

    Parameters
    ----------
    summary : str
        Paper summary to classify.

    Returns
    -------
    list of str
        Tags of the form ``'withwhom:<group>:<subgroup>'``, ordered by the
        group and subgroup listings below.
    """
    level_1 = [':laypeople', ':professionals']
    level_2 = {':laypeople': [':students', ':others'],
               ':professionals': [':academics', ':practitioners']
              }
    level_3 = {':students': ['with students', 'with practitioners and students'],
               ':others': ['with crowd.?workers'],
               ':academics': ['with academics', 'with researchers',
                              'with students and academics'],
               ':practitioners': ['with practitioners', 'with students and practitioners']
              }
    lowered = summary.lower()
    return ['withwhom' + group + subgroup
            for group in level_1
            for subgroup in level_2[group]
            if any([re.search(phrase, lowered) for phrase in level_3[subgroup]])]

In [6]:
def create_longform_tags(df, column_numbers):
    """Unpack list-valued tag columns into a long-form (PaperID, Tag) table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (position 0) holds the paper id and whose
        columns at ``column_numbers`` hold a list of tags per row.
    column_numbers : iterable of int
        Positions of the tag columns to unpack.

    Returns
    -------
    pandas.DataFrame
        Columns ``'PaperID'`` and ``'Tag'`` with a fresh 0..n-1 index and one
        row per tag, ordered by input row, then by ``column_numbers``, then
        by position within the tag list. Empty (but with both columns) when
        there are no tags.
    """
    # Gather all records first and build the frame once: DataFrame.append is
    # gone from current pandas, and growing a frame inside a loop is quadratic.
    # Plain tuples give purely positional access, independent of the labels.
    records = [
        (row[0], tag)
        for row in df.itertuples(index=False, name=None)
        for colno in column_numbers
        for tag in row[colno]
    ]
    return pd.DataFrame(records, columns=['PaperID', 'Tag'])

In [7]:
def split_tag_levels(df):
    """Return a copy of a tag table with the tag split into three level columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'Tag'`` column holding colon-separated tags such as
        ``'how:science:observation'``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with added columns ``'level_1'``, ``'level_2'`` and
        ``'level_3'`` holding the first three colon-separated parts of each
        tag. The input frame is left untouched. An empty input yields an
        empty result that still carries the three level columns.

    Raises
    ------
    IndexError
        If a tag has fewer than three colon-separated parts.
    """
    longform_tags = pd.DataFrame(df, copy=True)
    parts = [tag.split(':') for tag in longform_tags['Tag']]
    # Index each split tag directly instead of transposing with zip(*...),
    # which yields nothing to index into when the table has no rows.
    for position, column in enumerate(('level_1', 'level_2', 'level_3')):
        longform_tags[column] = [levels[position] for levels in parts]
    return longform_tags

NB: As of now, the third level isn't explicitly represented in the tags. That's unfortunate especially for the distinction experiment/survey (which I'd expect to be evaluated differently). I'd suggest we refine tags using a separate function (after the initial tag assignment).

In [8]:
# One list of "how" tags per paper summary.
df['how'] = list(map(assign_all_how, df['PaperSummary']))

In [9]:
# One list of "withwhom" tags per paper summary.
df['withwhom'] = list(map(assign_all_withwhom, df['PaperSummary']))

In [10]:
# Preview the first rows, now including the 'how' and 'withwhom' tag lists.
df.head()

Unnamed: 0,PaperID,PaperSummary,how,withwhom
0,1,A method for automatically recovering software...,[how:engineering:methodology],[]
1,2,A set of two techniques for improving the qual...,[how:engineering:methodology],[]
2,3,A case study on evaluating a given technique f...,[how:science:observation],[]
3,4,An experience report on the development of a m...,[how:perspective:experience],[]
4,5,A document-driven study on the relevancy of cl...,[how:science:observation],[]


In [11]:
# Column position 2 holds the 'how' tag lists; unpack them to one row per tag
# and split each tag into its three levels.
longform_how = split_tag_levels(create_longform_tags(df, column_numbers=[2]))
longform_how.head(n=2)

Unnamed: 0,PaperID,Tag,level_1,level_2,level_3
0,1,how:engineering:methodology,how,engineering,methodology
1,2,how:engineering:methodology,how,engineering,methodology


In [12]:
# (rows, columns) of the long-form 'how' table; it has one row per assigned tag.
longform_how.shape

(440, 5)

In [13]:
# Number of 'how' tag rows per (level_1, level_2, level_3) combination.
longform_how.groupby(['level_1', 'level_2', 'level_3'])[['PaperID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperID
level_1,level_2,level_3,Unnamed: 3_level_1
how,engineering,methodology,177
how,engineering,reference,5
how,engineering,technology,33
how,perspective,experience,38
how,perspective,opinion,11
how,perspective,philosophy,1
how,perspective,review,14
how,science,interrogation,43
how,science,intervention,37
how,science,observation,81


Summaries without tags?

In [14]:
# Rows whose 'how' tag list is empty, as (index, summary, tags). Columns are
# addressed by label because integer keys on a label-indexed row are not a
# reliable way to select by position.
[(idx, row['PaperSummary'], row['how']) for idx, row in df.iterrows() if len(row['how']) < 1]

[]

Summaries with multiple tags? (to check whether they're okay)

In [15]:
# Rows that received more than one 'how' tag, as (index, summary, tags).
# Columns are addressed by label because integer keys on a label-indexed row
# are not a reliable way to select by position.
[(idx, row['PaperSummary'], row['how']) for idx, row in df.iterrows() if len(row['how']) > 1]

[(71,
  'A study based on experiments with students and a case study on the possible benefits of considering existing services and their alignment with requirements at a very early stage in order to exploit the desired benefits of reuse in service-oriented architectures',
  ['how:science:observation', 'how:science:intervention']),
 (207,
  'A literature survey on requirements elicitation techniques and a roadmap of research in order to improve the elicitation of tacit knowledge',
  ['how:perspective:opinion', 'how:perspective:review']),
 (368,
  'A set of two empirical studies (online survey to practitioners and experiment with students) on the creation and use of software requirement specifications in companies and the impact of their quality in subsequent development activities.',
  ['how:science:intervention', 'how:science:interrogation']),
 (395,
  'A literature study on specific threats to validity in controlled experiments with student participants and on mitigation strategies fo

In [16]:
# Column position 3 holds the 'withwhom' tag lists; unpack them to one row per
# tag and split each tag into its three levels.
longform_withwhom = split_tag_levels(create_longform_tags(df, column_numbers=[3]))
longform_withwhom.head(n=2)

Unnamed: 0,PaperID,Tag,level_1,level_2,level_3
0,8,withwhom:professionals:practitioners,withwhom,professionals,practitioners
1,10,withwhom:professionals:practitioners,withwhom,professionals,practitioners


In [17]:
# (rows, columns) of the long-form 'withwhom' table; it has one row per assigned tag.
longform_withwhom.shape

(61, 5)

In [18]:
# Number of 'withwhom' tag rows per (level_1, level_2, level_3) combination.
longform_withwhom.groupby(['level_1', 'level_2', 'level_3'])[['PaperID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperID
level_1,level_2,level_3,Unnamed: 3_level_1
withwhom,laypeople,others,1
withwhom,laypeople,students,28
withwhom,professionals,academics,2
withwhom,professionals,practitioners,30


NB: some papers appear multiple times in the counts since some papers receive multiple tags.

In [19]:
# Combined long-form table: 'how' tags (column 2) and 'withwhom' tags (column 3).
longform_tags = create_longform_tags(df, column_numbers=[2, 3])
longform_tags.head(n=5)

Unnamed: 0,PaperID,Tag
0,1,how:engineering:methodology
1,2,how:engineering:methodology
2,3,how:science:observation
3,4,how:perspective:experience
4,5,how:science:observation


This file is the version for human correction (no redundant tag-level columns added). Note that the default inner merge works here only because every PaperID in df is also present in longform_tags, i.e. every paper has at least one tag. Otherwise, how='outer' would have to be specified, and some record shuffling would be needed to ensure that the NaN papers end up at the end of the file.

In [20]:
# Attach every tag to its paper's summary (inner join on the shared PaperID
# column) and write the result as a semicolon-separated file.
(
    df[['PaperID', 'PaperSummary']]
    .merge(longform_tags, on='PaperID', how='inner')
    .to_csv('../../analysis/papermapping/papertags_how_withwhom_forcorrection.csv',
            sep=';', index=False)
)

In [21]:
# Add level_1/level_2/level_3 columns to the combined how + withwhom tag table.
longform_tags_split = split_tag_levels(longform_tags)

In [22]:
# Number of tag rows per (level_1, level_2, level_3) combination in the combined table.
longform_tags_split.groupby(['level_1', 'level_2', 'level_3'])[['PaperID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperID
level_1,level_2,level_3,Unnamed: 3_level_1
how,engineering,methodology,177
how,engineering,reference,5
how,engineering,technology,33
how,perspective,experience,38
how,perspective,opinion,11
how,perspective,philosophy,1
how,perspective,review,14
how,science,interrogation,43
how,science,intervention,37
how,science,observation,81


The End.