In [1]:
# Relative path to the CSV with the paper summaries (loaded into `df` below).
file = '../../data/papersummaries_cleaned.csv'

In [2]:
import pandas as pd
import re

In [3]:
# Read the summaries into a DataFrame; the cells below rely on its PaperSummary column.
df = pd.read_csv(file)

### The Scheme

![](tagging_how.png)

NB: The Science category is deliberately not split into validation and evaluation. These terms are not very intuitive, and respondents are unlikely to have made any such distinction.

### The Rules

In [234]:
def assign_tag(level_1, level_2, level_3, summary):
    """Collect the tags whose keyword patterns occur in a summary.

    Parameters
    ----------
    level_1 : str
        Prefix put in front of every returned tag (e.g. ``':science'``).
    level_2 : iterable of str
        Candidate tag suffixes, checked in the given order.
    level_3 : mapping of str -> list of str
        For each entry of ``level_2``, the regular expressions that trigger it.
    summary : str
        Text that is searched with ``re.search``.

    Returns
    -------
    list of str
        ``level_1 + tag`` for every tag with at least one matching pattern,
        in the order of ``level_2``.
    """
    def is_triggered(suffix):
        # A tag fires as soon as any one of its patterns is found in the text.
        return any([re.search(pattern, summary) for pattern in level_3[suffix]])

    return [level_1 + suffix for suffix in level_2 if is_triggered(suffix)]


def assign_engineering(summary):
    """Return the ``:engineering:*`` tags that apply to a paper summary.

    Parameters
    ----------
    summary : str
        The paper summary to classify.

    Returns
    -------
    list of str
        Zero or more of ``':engineering:analysis'``,
        ``':engineering:technology'`` and ``':engineering:method'``, as
        produced by ``assign_tag``.
    """
    level_1 = ':engineering'

    level_2 = [':analysis', ':technology', ':method']

    # Regular expressions (matched with re.search) that trigger each sub-tag.
    level_3 = {
        ':analysis': [
            'A set of metrics',
        ],
        ':technology': [
            'A tool', 'A solution', 'A model',
            'A taxonomy', 'An ontology',
            # `(?:...)` is a non-capturing group. Writing it as `(:?...)`
            # would instead be a capturing group that starts with an
            # optional literal colon.
            'A (?:modell?ing |specification )language',
            'A template', '[Aa] blueprint', 'A (formal )?framework',
        ],
        ':method': [
            'A method', 'A process', 'A.{,15}technique', 'training program',
        ],
    }

    return assign_tag(level_1, level_2, level_3, summary)


def assign_science(summary):
    """Return the ``:science:*`` tags that apply to a paper summary.

    Parameters
    ----------
    summary : str
        The paper summary to classify.

    Returns
    -------
    list of str
        Zero or more of ``':science:observation'``,
        ``':science:intervention'`` and ``':science:interrogation'``, as
        produced by ``assign_tag``.
    """
    level_1 = ':science'

    level_2 = [':observation', ':intervention', ':interrogation']

    # Regular expressions (matched with re.search) that trigger each sub-tag.
    # Raw strings keep `\s` from being read as an (invalid) string escape, and
    # `(?:...)` is the non-capturing group syntax (not `(:?...)`, which would
    # match an optional literal colon inside a capturing group).
    level_3 = {
        ':observation': [
            r'(?:(?:multi.)?case|field) study',
            r'(?:data.|document.)driven study',
            r'industrial evaluation', r'An analysis',
        ],
        ':intervention': [
            r'experiment(?:s|\s)',
            r'project-based study', r'workshop-based industrial study',
            r'action research',
        ],
        ':interrogation': [
            r'interview-based study|study based on.{,30}interviews',
            # The negative lookbehind keeps "literature survey" out of this tag.
            r'questionnaire', r'(?<!literature )(?:online.)?survey',
        ],
    }

    return assign_tag(level_1, level_2, level_3, summary)


def assign_perspective(summary):
    """Return the ``:perspective:*`` tags that apply to a paper summary.

    Parameters
    ----------
    summary : str
        The paper summary to classify.

    Returns
    -------
    list of str
        Zero or more of ``':perspective:philosophy'``,
        ``':perspective:opinion'``, ``':perspective:experience'`` and
        ``':perspective:review'``, as produced by ``assign_tag``.
    """
    level_1 = ':perspective'

    level_2 = [':philosophy', ':opinion', ':experience', ':review']

    # Regular expressions (matched with re.search) that trigger each sub-tag.
    # Raw strings keep `\s` from being read as an (invalid) string escape, and
    # `(?:...)` is the non-capturing group syntax (not `(:?...)`, which would
    # match an optional literal colon inside a capturing group).
    level_3 = {
        ':philosophy': [
            r'conceptual framework',
        ],
        ':opinion': [
            r'A discussion', r'\svision', r'roadmap\s',
        ],
        ':experience': [
            r'experience report',
        ],
        ':review': [
            r'literature (?:survey|study|review)',
            r'state of the art report',
        ],
    }

    return assign_tag(level_1, level_2, level_3, summary)


def assign_all(summary):
    """Return every 'how' tag for a summary.

    Runs the engineering, science and perspective taggers (in that order),
    concatenates their results and prefixes each tag with ``'how'``.
    """
    combined = assign_engineering(summary)
    combined = combined + assign_science(summary)
    combined = combined + assign_perspective(summary)
    return ['how' + tag for tag in combined]

NB: As of now, the third level isn't explicitly represented in the tags. That's especially unfortunate for the experiment/survey distinction (which I'd expect to be evaluated differently). I'd suggest we refine the tags using a separate function (after the initial tag assignment).

In [235]:
# Tag every summary; each entry of the new `how` column is a list of tags.
df['how'] = list(map(assign_all, df.PaperSummary))

In [289]:
# Peek at the first rows to eyeball the assigned `how` tags.
df.head()

Unnamed: 0,PaperID,PaperSummary,how
0,1,A method for automatically recovering software...,[how:engineering:method]
1,2,A set of two techniques for improving the qual...,[how:engineering:method]
2,3,A case study on evaluating a given technique f...,[how:science:observation]
3,4,An experience report on the development of a m...,[how:perspective:experience]
4,5,A document-driven study on the relevancy of cl...,[how:science:observation]


Summaries without tags?

In [236]:
# Rows whose tag list is empty, i.e. summaries that no rule matched.
# Columns are addressed by label: integer keys on a label-indexed row Series
# only work through a deprecated positional fallback.
[(idx, row['PaperSummary'], row['how'])
 for idx, row in df.iterrows()
 if len(row['how']) < 1]

[]

Summaries with multiple tags? (to check whether they're okay)

In [237]:
# Rows that received more than one tag, listed for manual inspection.
# Columns are addressed by label: integer keys on a label-indexed row Series
# only work through a deprecated positional fallback.
[(idx, row['PaperSummary'], row['how'])
 for idx, row in df.iterrows()
 if len(row['how']) > 1]

[(71,
  'A study based on experiments with students and a case study on the possible benefits of considering existing services and their alignment with requirements at a very early stage in order to exploit the desired benefits of reuse in service-oriented architectures',
  ['how:science:observation', 'how:science:intervention']),
 (207,
  'A literature survey on requirements elicitation techniques and a roadmap of research in order to improve the elicitation of tacit knowledge',
  ['how:perspective:opinion', 'how:perspective:review']),
 (368,
  'A set of two empirical studies (online survey to practitioners and experiment with students) on the creation and use of software requirement specifications in companies and the impact of their quality in subsequent development activities.',
  ['how:science:intervention', 'how:science:interrogation']),
 (395,
  'A literature study on specific threats to validity in controlled experiments with student participants and on mitigation strategies fo

In [267]:
# Long-form table: one row per (paper, tag) pair. Papers with several tags
# yield several rows; papers with an empty tag list yield none.
# All records are collected first and the frame is built once, instead of
# growing it row by row (DataFrame.append no longer exists in pandas >= 2.0).
longform_tags = pd.DataFrame(
    [(paper_id, tag)
     for paper_id, tags in zip(df['PaperID'], df['how'])
     for tag in tags],
    columns=['PaperID', 'Tag'],
)
longform_tags.head()

Unnamed: 0,PaperID,Tag
0,1,how:engineering:method
1,2,how:engineering:method
2,3,how:science:observation
3,4,how:perspective:experience
4,5,how:science:observation


In [286]:
# Split each 'a:b:c' tag at the colons and transpose, so that
# tags_three_levels[i] holds the i-th component of every tag.
split_tags = [tag.split(':') for tag in longform_tags.Tag]
tags_three_levels = list(zip(*split_tags))
for position, column in enumerate(('level_1', 'level_2', 'level_3')):
    longform_tags[column] = tags_three_levels[position]
longform_tags.head()

Unnamed: 0,PaperID,Tag,level_1,level_2,level_3
0,1,how:engineering:method,how,engineering,method
1,2,how:engineering:method,how,engineering,method
2,3,how:science:observation,how,science,observation
3,4,how:perspective:experience,how,perspective,experience
4,5,how:science:observation,how,science,observation


NB: Some papers appear multiple times in the counts (here: at most twice), since some papers receive multiple tags.

In [288]:
# Number of rows per combination of the three tag components.
tag_levels = ['level_1', 'level_2', 'level_3']
longform_tags.groupby(tag_levels).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperID,Tag
level_1,level_2,level_3,Unnamed: 3_level_1,Unnamed: 4_level_1
how,engineering,analysis,1,1
how,engineering,method,162,162
how,engineering,technology,52,52
how,perspective,experience,38,38
how,perspective,opinion,11,11
how,perspective,philosophy,1,1
how,perspective,review,14,14
how,science,interrogation,43,43
how,science,intervention,37,37
how,science,observation,81,81


In [292]:
#longform_tags.merge(df[['PaperID', 'PaperSummary']]
#                   ).to_csv('../../analysis/papersummaries_tagged_how.csv', index=False)

### With whom?

In [312]:
# Number of summaries that mention students and practitioners, in this order.
sum(1 for summary in df.PaperSummary
    if re.search('with students and practitioners', summary))

2

In [317]:
# Number of summaries that mention practitioners and students, in this order.
sum(1 for summary in df.PaperSummary
    if re.search('with practitioners and students', summary))

1

In [319]:
# Number of summaries containing 'with students'.
sum(1 for summary in df.PaperSummary
    if re.search('with students', summary))

27

In [314]:
# Number of summaries containing 'with practitioners'.
sum(1 for summary in df.PaperSummary
    if re.search('with practitioners', summary))

28

In [320]:
# Number of summaries containing 'with crowd workers'.
sum(1 for summary in df.PaperSummary
    if re.search('with crowd workers', summary))

1

In [310]:
# Number of summaries containing 'academics' anywhere in the text.
sum(1 for summary in df.PaperSummary
    if re.search('academics', summary))

4

In [311]:
# Number of summaries containing 'researchers' anywhere in the text.
sum(1 for summary in df.PaperSummary
    if re.search('researchers', summary))

3

The End.