### This is a service notebook that creates CSV files to be loaded as DataFrames in other notebooks.

### NB:

At the very end of this notebook, we generate a votelist with coded respondent metadata.
This votelist needs to be re-generated if the coding changes.

In [1]:
import pandas as pd

In [2]:
# Export identifier (looks like a YYYYMMDD date stamp); it is the prefix of the
# export-specific CSV file names under ../data and ../analysis used below.
exportdate = 20180327
# Project identifier; it follows `exportdate` in those file names.
projectname = 'repract'

In [3]:
# Number of papers; it determines how many consecutive vote columns
# (starting at v_7039) are read from the raw export when the votelist is built.
number_of_papers = 435

#### DataFrame with Evaluation Counts and Paper Metadata

In [4]:
# Evaluation counts; merged with the paper metadata on 'PaperID' further down.
evaluation_counts_csv = f'../analysis/{exportdate}{projectname}_evaluation_counts.csv'
evaluation_counts = pd.read_csv(evaluation_counts_csv)

In [5]:
# Paper metadata from the conference spreadsheet export (semicolon-separated).
# One list fixes both WHICH columns are loaded and their final ORDER: `usecols`
# ignores the order of its entries, hence the explicit re-selection afterwards.
paper_columns = ['Unipark ID', 'Venue', 'Year', 'Nb. pages', 'Industry track',
                 'Academic vs. industry', 'Title', 'Authors', 'Summary']
xls = pd.read_csv('../data/all_conferences_from_googledrive.csv', sep=';',
                  usecols=paper_columns,
                  # same value as the previously hard-coded 435
                  nrows=number_of_papers)
xls = xls[paper_columns]

In [6]:
# Attach the paper metadata to the evaluation counts (default inner join,
# 'PaperID' on the left against 'Unipark ID' on the right) and drop the
# spreadsheet key, which is redundant after the join.
papermeta = (
    evaluation_counts
    .merge(xls, left_on='PaperID', right_on='Unipark ID')
    .drop(columns=['Unipark ID'])
)
# To avoid malformed csvs (due to ONE carriage return...).
# The vectorised string method keeps a missing title as NaN instead of raising
# AttributeError, which calling .replace() on every raw value would do.
papermeta['Title'] = papermeta['Title'].str.replace('\r', '', regex=False)
# Renaming for more comfortable coding
papermeta = papermeta.rename(columns={
    'Nb. pages': 'TotalPages',
    'Industry track': 'IndTrack',
    'Academic vs. industry': 'AcadVsInd',
})

In [7]:
#papermeta.to_csv(
#    f'../analysis/{exportdate}{projectname}_evaluation_counts_with_papermeta.csv', index=False)

#### Long-Form df with List of Votes

In [8]:
# Raw survey export; its row index serves as the evaluation id (EvID) below.
raw_export_csv = f'../data/{exportdate}{projectname}.csv'
df = pd.read_csv(raw_export_csv)

In [9]:
# Build the long-form votelist: one row per (evaluation, paper) pair that
# carries a valid rating.
#   EvID    - row index of the evaluation in the raw export `df`
#   PaperID - 1-based position of the paper among the vote columns
#   Vote    - the rating given
ratings = ['Essential', 'Worthwhile', 'Unimportant', 'Unwise']

# The vote columns are consecutive: v_7039 belongs to paper 1, v_7040 to paper 2, ...
first_vote_column = 7039
vote_columns = [f'v_{first_vote_column + offset}' for offset in range(number_of_papers)]

# Collect plain records and create the frame once. Growing a DataFrame row by
# row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
# Cells holding anything other than one of the four ratings (e.g. NaN for
# papers the respondent did not rate) are skipped.
vote_records = [
    {'EvID': ev_id, 'PaperID': paper_id, 'Vote': vote}
    for ev_id, answers in df[vote_columns].iterrows()
    for paper_id, vote in enumerate(answers, start=1)
    if vote in ratings
]
# Passing `columns` keeps the expected header even when no vote was found.
votelist = pd.DataFrame(vote_records, columns=['EvID', 'PaperID', 'Vote'])

In [10]:
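# Export disabled by default. Note that the "coded" section further down reads
# this file back in, so it has to be written once (and again whenever the raw
# export changes) before that section can run.
#votelist.to_csv(f'../analysis/{exportdate}repract_votelist.csv', index=False)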

#### DataFrame with One Row per Evaluation and Participant Metadata

* v_5: primary working area
* v_6: free text in case of "other" in v_5
* v_11: free text, years of experience in the primary working area
* v_118-v_121: intensity of involvement in RE (v_118 most intense)
* v_12: CS degree
* v_14: team size
* v_15: class of systems in project scope
* v_16: free text in case of "other" in v_15
* v_19: free text, industry sector
* v_124: country

In [11]:
# Respondent metadata columns (see the legend above for their meaning).
demographics = [f'v_{number}' for number in (5, 6, 11, 12, 14, 15, 16, 19, 124)]
# v_118 .. v_121: intensity of involvement in RE.
reqinvolvement = [f'v_{number}' for number in range(118, 122)]

In [12]:
# Give every vote the metadata of the respondent who cast it. EvID is the row
# index of the raw export, so that index is turned into a column to join on.
respondentmeta = (
    df[demographics + reqinvolvement]
    .reset_index()
    .rename(columns={'index': 'EvID'})
)
# Name the join key explicitly: without `on`, merge joins on every column name
# the two frames happen to share. `validate` makes pandas raise if a respondent
# ever had more than one metadata row, instead of silently duplicating votes.
votelist_with_respondentmeta = votelist.merge(
    respondentmeta, on='EvID', validate='many_to_one')
votelist_with_respondentmeta.head(5)

Unnamed: 0,EvID,PaperID,Vote,v_5,v_6,v_11,v_12,v_14,v_15,v_16,v_19,v_124,v_118,v_119,v_120,v_121
0,2,10,Worthwhile,Other (please specify),Product Management Coach,10,No,Medium (5-10),Other (please specify),Customer facing software products,Wide range (from automotive supplier to insura...,Germany,quoted,quoted,quoted,not quoted
1,2,23,Unwise,Other (please specify),Product Management Coach,10,No,Medium (5-10),Other (please specify),Customer facing software products,Wide range (from automotive supplier to insura...,Germany,quoted,quoted,quoted,not quoted
2,2,49,Essential,Other (please specify),Product Management Coach,10,No,Medium (5-10),Other (please specify),Customer facing software products,Wide range (from automotive supplier to insura...,Germany,quoted,quoted,quoted,not quoted
3,2,64,Unimportant,Other (please specify),Product Management Coach,10,No,Medium (5-10),Other (please specify),Customer facing software products,Wide range (from automotive supplier to insura...,Germany,quoted,quoted,quoted,not quoted
4,2,68,Unwise,Other (please specify),Product Management Coach,10,No,Medium (5-10),Other (please specify),Customer facing software products,Wide range (from automotive supplier to insura...,Germany,quoted,quoted,quoted,not quoted


In [13]:
#votelist_with_respondentmeta.to_csv(
#    f'../analysis/{exportdate}{projectname}_votelist_with_respondentmeta.csv', 
#                                    index=False)

### Here is a version with coded short free-text answers:

In [12]:
# Reload the long-form votelist from disk (presumably so this section can run
# without rebuilding it). The file is the one written by the commented-out
# votelist export cell above and has to exist already.
votelist_csv = f'../analysis/{exportdate}repract_votelist.csv'
votelist = pd.read_csv(votelist_csv)

In [7]:
# Survey export extended with coded versions of the short free-text answers.
coded_export_csv = f'../analysis/{exportdate}{projectname}_with_shorttext_integration.csv'
df_coded = pd.read_csv(coded_export_csv)
df_coded.shape

(154, 1327)

In [10]:
# The last six columns are the additions that the original data does not have
df_coded.columns[-6:].values

array(['v_11_coded', 'v_19_coded', 'v_6_coded', 'v_5_6_merged',
       'v_16_coded', 'v_15_16_merged'], dtype=object)

In [11]:
# Respondent metadata columns including the coded/merged variants of the
# free-text answers, grouped by the survey question they belong to.
demographics_coded = [
    # primary working area: choice, free text, coded free text, merged
    'v_5', 'v_6', 'v_6_coded', 'v_5_6_merged',
    # years of experience: free text and coded value
    'v_11', 'v_11_coded',
    # CS degree, team size
    'v_12', 'v_14',
    # class of systems: choice, free text, coded free text, merged
    'v_15', 'v_16', 'v_16_coded', 'v_15_16_merged',
    # industry sector: free text and coded value
    'v_19', 'v_19_coded',
    # country
    'v_124',
]
# v_118 .. v_121: intensity of involvement in RE.
reqinvolvement = [f'v_{number}' for number in range(118, 122)]

In [14]:
# Same join as for the uncoded metadata, but against the export that carries
# the coded free-text columns. EvID is the row index of that export.
respondentmeta_coded = (
    df_coded[demographics_coded + reqinvolvement]
    .reset_index()
    .rename(columns={'index': 'EvID'})
)
# Name the join key explicitly: without `on`, merge joins on every column name
# the two frames happen to share. `validate` makes pandas raise if a respondent
# ever had more than one metadata row, instead of silently duplicating votes.
votelist_with_respondentmeta_coded = votelist.merge(
    respondentmeta_coded, on='EvID', validate='many_to_one')
votelist_with_respondentmeta_coded.head(5)

Unnamed: 0,EvID,PaperID,Vote,v_5,v_6,v_6_coded,v_5_6_merged,v_11,v_11_coded,v_12,...,v_16,v_16_coded,v_15_16_merged,v_19,v_19_coded,v_124,v_118,v_119,v_120,v_121
0,2,10,Worthwhile,Other (please specify),Product Management Coach,Manager,Manager,10,10.0,No,...,Customer facing software products,(Business) information systems,(Business) information systems,Wide range (from automotive supplier to insura...,Automotive,Germany,quoted,quoted,quoted,not quoted
1,2,23,Unwise,Other (please specify),Product Management Coach,Manager,Manager,10,10.0,No,...,Customer facing software products,(Business) information systems,(Business) information systems,Wide range (from automotive supplier to insura...,Automotive,Germany,quoted,quoted,quoted,not quoted
2,2,49,Essential,Other (please specify),Product Management Coach,Manager,Manager,10,10.0,No,...,Customer facing software products,(Business) information systems,(Business) information systems,Wide range (from automotive supplier to insura...,Automotive,Germany,quoted,quoted,quoted,not quoted
3,2,64,Unimportant,Other (please specify),Product Management Coach,Manager,Manager,10,10.0,No,...,Customer facing software products,(Business) information systems,(Business) information systems,Wide range (from automotive supplier to insura...,Automotive,Germany,quoted,quoted,quoted,not quoted
4,2,68,Unwise,Other (please specify),Product Management Coach,Manager,Manager,10,10.0,No,...,Customer facing software products,(Business) information systems,(Business) information systems,Wide range (from automotive supplier to insura...,Automotive,Germany,quoted,quoted,quoted,not quoted


In [15]:
#votelist_with_respondentmeta_coded.to_csv(
#    f'../analysis/{exportdate}{projectname}_votelist_with_respondentmeta_coded.csv', 
#                                    index=False)

The End.