#### Variables coded in this notebook: 6, 11, 16, 19

In [16]:
import os, pandas as pd, re

In [123]:
exportdate = 20180327
projectname = 'repract'

In [127]:
df = pd.read_csv(f'../../data/{exportdate}{projectname}.csv')
df.head(2)

Unnamed: 0,lfdn,external_lfdn,tester,dispcode,lastpage,quality,duration,v_7039,v_7040,v_7041,...,output_mode,javascript,flash,session_id,language,cleaned,ats,datetime,date_of_last_access,date_of_first_mail
0,106,0,no tester,Completed after break (32),2138658,NotShown,-1,NotShown,NotShown,0,...,HTML,NotShown,NotShown,3bb21c1b318e2f6b87557566bdd6b4d9,English,Not cleaned,1515411510,2018-01-08 11:38:30,2018-01-08 13:07:14,0000-00-00 00:00:00
1,131,0,no tester,Completed (31),2138658,NotShown,3805,NotShown,NotShown,NotShown,...,HTML,NotShown,NotShown,fc38f6556787a459c2cc604abf799448,English,Not cleaned,1515667019,2018-01-11 10:36:59,2018-01-11 11:40:24,0000-00-00 00:00:00


In [77]:
basedir = '../../data/freetext'
freetextfiles = os.listdir(basedir)
dfs = {file[:-4]:pd.read_csv(f'{basedir}/{file}') for file in freetextfiles}

In [78]:
dfs.keys()

dict_keys(['.DS_S', 'v_11', 'v_1373', 'v_16', 'v_18', 'v_19', 'v_6', 'v_8345etseq', 'v_8780etseq'])

In [120]:
def add_unique_code(df, func, varname):
    df[f'{varname}_coded'] = func(df[varname])
    return df

In [79]:
codedir = '../../analysis/freetext'
def write_coded(df, varname):
    filepath = f'{codedir}/{varname}_coded.csv'
    df.to_csv(f'{codedir}/{varname}_coded.csv', index=False)
    print(f'File stored at {filepath}.')

#### Variable 6 (_Other_ for variable 5) - primary working area

Options shown were:
* Requirements Engineer
* Business Analyst
* Architect
* Tester / Test Manager
* Project Manager
* Developer
* Product Owner
* Designer
* Other (please specify)

In [218]:
def code_var_6(series):
    coded_series = []
    for value in series:
        val = None
        if re.search('Lecturer|PhD\sCandidate|[Rr]esearcher|R&D', value):
            val = 'Researcher'
        elif re.search('Consultant', value):
            val = 'Consultant'
        elif re.search('[Mm]anag|CTO', value):
            val = 'Manager'
        elif re.search('Systems?\sEngineer', value):
            val = 'Architect'
        elif re.search('[Dd]esign', value):
            val = 'Designer'
        elif (re.search('[Dd]ifferent|[Cc]hanging|[Bb]oth|[Dd]epend(?:s|ing)|[Cc]ombin', 
                        value) or (len(re.findall(',', value)) > 1)):
            val = 'Multiple Roles'
        elif re.search('processes', value):
            val = 'Process Designer'
        elif re.search('[Mm]arketing|ISO\s\d+', value):
            val = 'Context Roles' # this was: 1 Marketing, 1 Regulator
        else:
            raise Exception(f'Difficulty Coding Entry: {value}')
        coded_series.append(val)
    return coded_series

In [220]:
coded_v6 = add_unique_code(dfs['v_6'], code_var_6, 'v_6')
coded_v6.head()

Unnamed: 0,lfdn,v_6,v_6_coded
0,106,Researcher,Researcher
1,139,Product Management Coach,Manager
2,173,Functional Safety Consultant focussing of the ...,Consultant
3,216,Enterprise Systems of Systems researcher (auto...,Researcher
4,240,"Changing with different jobs, RE, Architect, D...",Multiple Roles


In [371]:
coded_v6.groupby('v_6_coded').count()

Unnamed: 0_level_0,lfdn,v_6
v_6_coded,Unnamed: 1_level_1,Unnamed: 2_level_1
Architect,2,2
Consultant,3,3
Context Roles,2,2
Designer,1,1
Manager,6,6
Multiple Roles,6,6
Process Designer,2,2
Researcher,7,7


In [221]:
write_coded(coded_v6, 'v_6')

File stored at ../../analysis/freetext/v_6_coded.csv.


#### Variable 11 - years of experience

In [159]:
def code_var_11(series):
    coded_series = []
    replace_dict = {'[Yy]ears?\.?|y(?!\w)|[Aa]bout': '',
                    '[Ss]ix':'6',
                    '[Oo]ne':'1',
                    ',':'.',
                    '\+|>':''}
    this_year = 2018
    for value in series:
        val = None
        try:
            val = float(value)
        except:
            val = value
            for k, v in replace_dict.items():
                val = re.sub(k, v, val)
            try:
                val = float(val)
            except:
                if re.search('[Ss]ince\s(\d{4})', val):
                    val = 2018 - float(re.search('[Ss]ince\s(\d{4})', val).group(1))
                elif re.search('\d+', val):
                    val = sum([float(x) for x in re.findall('\d+', val)])
                else:
                    raise Exception(f'Difficulty Coding Entry: {value}')
        coded_series.append(val)
    return coded_series

In [160]:
coded_v11 = add_unique_code(dfs['v_11'], code_var_11, 'v_11')
coded_v11.head()

Unnamed: 0,lfdn,v_11,v_11_coded
0,106,4,4.0
1,131,1,1.0
2,139,10,10.0
3,153,10,10.0
4,156,15,15.0


In [367]:
%matplotlib notebook
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt

In [370]:
coded_v11.v_11_coded.plot.hist(bins=50, color='k', alpha=0.5)
plt.xlabel('Years of Experience');

<IPython.core.display.Javascript object>

In [83]:
#write_coded(coded_v11, 'v_11')

File stored at ../../analysis/freetext/v_11_coded.csv.


#### Variable 16 (_Other_ for Variable 15) - class of system

Options shown were:
- Software-intensive embedded systems
- (Business) information systems
- Hybrid / mix of embedded systems and information systems
- Other (please specify)

NB: I feel the coding is somewhat arbitrary - but at least it's transparently arbitrary...

In [161]:
def code_var_16(series):
    coded_series = []
    for value in series:
        val = None
        if re.search('[Aa]ll.*?above', value):
            val = 'Hybrid / mix of embedded systems and information systems'
        elif (re.search('[Cc](?:ustomer|onsumer)|online|information', value) # infosys
            or re.search('ERP', value)): # infosys, special (and doubtful ;-))
            val = '(Business) information systems'
        elif re.search('[Mm]achine|infrastructure|processor', value): 
            # or would you want to class these as hybrid?
            val = 'Hardware'
        elif re.search('[Aa]eronautics|Railway', value): # guessing this one
            val = 'Hybrid / mix of embedded systems and information systems'
        else:
            raise Exception(f'Difficulty Coding Entry: {value}')
        coded_series.append(val)
    return coded_series

In [162]:
coded_v16 = add_unique_code(dfs['v_16'], code_var_16, 'v_16')
coded_v16.head()

Unnamed: 0,lfdn,v_16,v_16_coded
0,139,Customer facing software products,(Business) information systems
1,240,"Different, consumer platforms, mobile Apps etc.",(Business) information systems
2,245,Aeronautics,Hybrid / mix of embedded systems and informati...
3,310,"Requirements Engineering tools, compilers and ...",(Business) information systems
4,341,Railway systems,Hybrid / mix of embedded systems and informati...


In [356]:
coded_v16.groupby('v_16_coded').count()

Unnamed: 0_level_0,lfdn,v_16
v_16_coded,Unnamed: 1_level_1,Unnamed: 2_level_1
(Business) information systems,5,5
Hardware,3,3
Hybrid / mix of embedded systems and information systems,4,4


In [163]:
#write_coded(coded_v16, 'v_16')

File stored at ../../analysis/freetext/v_16_coded.csv.


#### Variable 19 - Industry Sector (not standardized as in NaPiRE!)

NB: This is - of course - drastically overfitting, but there's hardly another option (the NaPiRE categories are very problematic and thus should not be reused).

In [349]:
def code_var_19(series):
    coded_series = []
    for value in series:
        val = None
        # Sorry, this is ugly.
        if (re.search('[Mm]ixed|[Vv]aries(?!\w)|[Ss]everal|[Ee]cosystem|(?<!\s)services(?!\s)', 
                      value)
            or len(re.findall(',', value)) > 1
            or re.search('and', value) 
            and not re.search('[Oo]il.*?[Gg]as|[Bb]ank.*?[Ff]in|[Ii]ns.*?[Bb]ank|[Aa]ero.*?[Dd]efen|[Ww]ell.*[Hh]eal', value)
           ):
            val = 'Multiple Sectors'
        elif re.search('[Uu]niversity|[Rr]esearch|[Aa]cademi', value):
            val = 'Academia'
        elif re.search('[Aa]ero|[Aa]vi(?:on|at)', value):
            val = 'Aeronautics'
        elif re.search('[Aa]utomation', value):
            val = 'Automation'
        elif re.search('[Aa]utomotive', value):
            val = 'Automotive'
        elif re.search('[Cc]onsult', value):
            val = 'Consulting'
        elif re.search('[Ee]\-?commerc|[Oo]nline', value):
            val = 'E-Commerce'
        elif re.search('[Ee]ducati', value):
            val = 'Education'
        elif re.search('[Ee]nergy|(?:[Oo]il|[Gg]as)(?!\w)', value):
            val = 'Energy'
        elif re.search('[Ff]inanc|[Bb]anki|[Ii]nsuran', value):
            val = 'Financial Services'
        elif re.search('[Ss]emiconductor|[Rr]obotics|[Cc]omputer\s[Ee]ngin|[Ii]ndustrial\s[Ss]ys', 
                       value): 
            # is this okay?
            val = 'Hardware' 
        elif re.search('[Mm]edic(?:al|ine)|[Hh]eal?th|[Ww]ellness', value):
            val = 'Healthcare'
        elif re.search('[Cc]ommun|[Tt]elecom|ICT|IT|IoT|intranet|[Ee]lectron|[Nn]etwork|[Ii]nformation', 
                       value): # is this okay?
            val = 'ICT' 
        elif re.search('[Rr]ailway|[Bb]uilding|[Pp]ipelines', value): # is this okay?
            val = 'Infrastructure'
        elif re.search('[Gg]overnment|[Pp]ublic\s(?![Tt]ransport)|Defen[cs]e', value):
            val = 'Public Sector'
        elif re.search('[Ss]oftware|[Ss]aa[Ss]', value): # is this okay?
            val = 'Software'
        elif re.search('[Tt]ransport|[Ll]ogis\w|[Mm]arine', value): # is this okay?
            val = 'Transportation'
        elif re.search('[Tt]ourism', value): 
            val = 'Tourism'
        else:
            raise Exception(f'Difficulty Coding Entry: {value}')
        coded_series.append(val)
    return coded_series

In [351]:
coded_v19 = add_unique_code(dfs['v_19'], code_var_19, 'v_19')
coded_v19.head()

Unnamed: 0,lfdn,v_19,v_19_coded
0,106,Automotive,Automotive
1,131,education,Education
2,139,Wide range (from automotive supplier to insura...,Automotive
3,153,Public administration,Public Sector
4,156,Telecommunications,ICT


In [353]:
write_coded(coded_v19, 'v_19')

File stored at ../../analysis/freetext/v_19_coded.csv.


In [354]:
coded_v19.groupby('v_19_coded').count()

Unnamed: 0_level_0,lfdn,v_19
v_19_coded,Unnamed: 1_level_1,Unnamed: 2_level_1
Academia,2,2
Aeronautics,7,7
Automation,3,3
Automotive,21,21
Consulting,2,2
E-Commerce,3,3
Education,7,7
Energy,4,4
Financial Services,20,20
Hardware,5,5


The End.