#  Extract dataset and attribute definitions
##  from supportPrim - STATA files

In [1]:
# Notebook setup: greedy tab-completion suggestions, 'retina' figure format
# for the inline backend, and matplotlib figures rendered inline.
%config IPCompleter.greedy=True
%config InlineBackend.figure_format = 'retina'
%matplotlib inline 

In [2]:
import pandas as pd
import numpy as np
import os
os.getcwd()

'/lhome/amardj/ISM/python_code/es_scripts/case_base_patient_scripts'

### Define the date format to be appended with file names

#### System defined current date

In [3]:
# Today's date as a compact YYYYMMDD string; it is used below to build the
# name of the data directory.
import datetime
sys_date = format(datetime.datetime.today(), '%Y%m%d')
sys_date

'20201023'

### Location of raw data & generated data

In [4]:
# Dated folder the STATA file is read from and the generated CSVs are written to.
# To work on an earlier export, point this at that folder instead,
# e.g. './data/20201021/'.
data_dir = './data/' + sys_date + '/'
data_dir

'./data/20201023/'

In [5]:
# Name intended for the DataFrame index column.
# NOTE(review): the cells in this notebook spell out 'id' literally instead of
# reading this variable — confirm whether it is still needed.
index_name = 'id'

## Import stata files 'xxx.dta' for creating python pandas DataFrames

In [6]:
# Read the merged STATA export from the dated data folder (no index column is
# set here, so the default integer index is used), then show (rows, columns).
df_labeled = pd.read_stata(data_dir + 'supportprim_merged_datafile.dta')
df_labeled.shape

(63, 350)

In [7]:
df_labeled.body_main_1.unique()

[neck, multisite/complex, back, hip, shoulder, knee]
Categories (6, object): [neck < shoulder < back < hip < knee < multisite/complex]

In [None]:
# Sanity check: straight after the STATA import, body_main_1 must be a pandas
# categorical and not a plain 'object' column.
assert df_labeled['body_main_1'].dtype.name == 'category'
assert not df_labeled['body_main_1'].dtype.name == 'object'

df_labeled['body_main_1'].unique()

In [None]:
# Sanity check: straight after the STATA import, employ_1 must be a pandas
# categorical and not a plain 'object' column.
assert df_labeled['employ_1'].dtype.name == 'category'
assert not df_labeled['employ_1'].dtype.name == 'object'

df_labeled['employ_1'].unique()

## Remove the '_merge' column from the dataframe, if present

In [8]:
# Remove the '_merge' column when it is present and report which case applied.
drop_col_name = '_merge'
if drop_col_name not in df_labeled.columns:
    print(' The DataFrame does not contain any column named as : "',drop_col_name,'"')
else:
    df_labeled = df_labeled.drop(columns=[drop_col_name])
    print(' The DataFrame contains column named as : ',drop_col_name, ' - Action : Removed')

 The DataFrame does not contain any column named as : " _merge "


## Sort the dataframe column names in alphabetical order

In [9]:
# Reorder the columns by sorted column name, report the frame's shape and keep
# the ordered column labels in features_labels.
df_labeled = df_labeled.reindex(columns=sorted(df_labeled.columns))
print(' Shape of old Dataframe : ', df_labeled.shape)
features_labels = df_labeled.columns

 Shape of old Dataframe :  (63, 350)


In [10]:
df_labeled.head()

Unnamed: 0,activity_1,age_1,back_nevro_pow_1,back_nevro_reflex_1,back_nevro_sens_1,back_slr_1,birth_year,bmi_1,body_main_1,childhood_1,...,treat_setting_self_some_3,treatmentfactor_patient_3,walk_aid_1,weight_1,work_ability_1,work_ability_3,work_ability_w2,work_ability_w4,work_ability_w8,work_type_1
0,slightly reduced,57.0,,,,,1963,24.772097,neck,very good,...,,Håndleddsbrudd for 6 uker siden. Derfor reduse...,no walkaid,75,8,0,6.0,6.0,1.0,work demand much walking
1,slightly reduced,55.0,,,,,1965,26.573132,multisite/complex,very good,...,,"Øvelser, tiden, aktivitet, lengre periode hjem...",no walkaid,75,8,8,7.0,8.0,7.0,work demand much walking
2,slightly reduced,37.0,,,,,1983,27.281748,multisite/complex,good,...,,"Trenings senteret ble stengt, derfor mindte je...",no walkaid,77,8,8,5.0,,,mostly seated
3,slightly reduced,62.0,,,,,1958,30.487806,multisite/complex,good,...,,,no walkaid,82,8,9,9.0,,9.0,work demands much walking and lifting
4,quite reduced,64.0,,,,no,1956,27.428572,back,very good,...,,Ingen trening/ behandling hos fysioterapeut si...,no walkaid,84,3,2,3.0,3.0,3.0,


In [11]:
def replace_all_with(old, new, tempdf=None):
    """Return a copy of ``tempdf`` with every match of ``old`` replaced by ``new``.

    Parameters
    ----------
    old : str
        Pattern to search for. It is interpreted as a regular expression
        because the replacement is done with ``regex=True``.
    new : str
        Replacement text.
    tempdf : pandas.DataFrame, optional
        Frame to process. When omitted, the notebook-level ``df_labeled`` is
        looked up at call time, so the frame that is current at that moment is
        used rather than the one that existed when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``tempdf`` itself is not modified.
    """
    if tempdf is None:
        # Late binding: a default of `tempdf=df_labeled` would be frozen at
        # definition time and go stale once df_labeled is reassigned.
        tempdf = df_labeled
    return tempdf.replace({old: new}, regex=True)

In [12]:
df_labeled.physio_id.head()

0    support10
1    support10
2    support10
3    support10
4    support10
Name: physio_id, dtype: object

In [13]:
df_labeled.treat_desc1_3.tolist()

['Pasienten brøt håndleddet midt i behandlingsforløpet samt stopp i behandlingen pga covid 19. Hun er også midt i overgangsalderen. ',
 'Grundig anamnese og skape en god relasjon og trygghet til denne pasienten. Samarbeid med fastlege',
 'Motivering til å starte fysisk aktivitet. Tett samtale med mor med tanke på trening på treningssenter. Bevisstgjøring av livsstilsendring mot smerter. ',
 'Har vært plaget med denne problemstillingen i flere perioder tidligere. Har vært til meg for flere år siden. Corona pandemien kom og forstyrret behandlingsforløpet. ',
 'Har gått til meg tidligere og har hatt en svært god tone da. Ble påvirket av Corona pandemien og ble mindre oppfølging en tenkt ',
 'Ble veldig lite oppfølging/tilbud pga Corona. Hadde en del frafall før nedstegningen og har ennå ikke kommet igang med individuell behandling',
 'Pasient som har vært til meg tidligere. Kom såvidt igang før Corona pandemien. Har fått beskjed via lege til å oppsøke meg, er under behandling for 50% ufør

In [14]:
# Normalise physio_id (spaces -> underscores, lower case) and create clinic_id
# as a copy of the normalised physio_id values.
df_labeled = df_labeled.assign(
    physio_id=lambda frame: frame['physio_id'].replace(' ', '_', regex=True).str.lower(),
    clinic_id=lambda frame: frame['physio_id'],
)

In [15]:
df_labeled.physio_id

0     support10
1     support10
2     support10
3     support10
4     support10
        ...    
58     support9
59     support9
60     support9
61     support9
62     support9
Name: physio_id, Length: 63, dtype: object

In [16]:
# Build patient_id by joining physio_id, journal_id and birth_year with '_'.
cols = ['physio_id', 'journal_id', 'birth_year']
df_labeled['patient_id'] = df_labeled[cols].apply(
    lambda rec: '_'.join(map(str, rec)),
    axis=1,
)
df_labeled['patient_id'].tolist()

['support10_1_1963',
 'support10_2_1965',
 'support10_5_1983',
 'support10_6_1958',
 'support10_7_1956',
 'support10_8_1967',
 'support10_9_1966',
 'support10_10_1964',
 'support10_11_1966',
 'support10_12_1950',
 'support10_13_1977',
 'support11_1808_1969',
 'support11_1810_1989',
 'support12_31903_1972',
 'support12_31910_1968',
 'support12_31992_1961',
 'support12_32035_1969',
 'support13_29310_2002',
 'support13_31894_1956',
 'support13_31909_1995',
 'support13_32177_1961',
 'support15_6245_1969',
 'support19_5191_1968',
 'support23_3_1950',
 'support24_101_1984',
 'support24_102_1978',
 'support26_1001_1950',
 'support27_1002_1990',
 'support28_2000_1973',
 'support28_2002_1981',
 'support28_2005_1956',
 'support28_2006_1957',
 'support31_32359_1979',
 'support31_32542_1988',
 'support31_32581_1985',
 'support31_32753_1994',
 'support4_38888_1998',
 'support4_39151_1997',
 'support4_39178_1999',
 'support4_39185_1993',
 'support4_39442_1989',
 'support5_11785_1956',
 'support5_175

In [17]:
# Name the row index 'id', then preview the first rows.
df_labeled = df_labeled.rename_axis('id')
df_labeled.head()

Unnamed: 0_level_0,activity_1,age_1,back_nevro_pow_1,back_nevro_reflex_1,back_nevro_sens_1,back_slr_1,birth_year,bmi_1,body_main_1,childhood_1,...,walk_aid_1,weight_1,work_ability_1,work_ability_3,work_ability_w2,work_ability_w4,work_ability_w8,work_type_1,clinic_id,patient_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,slightly reduced,57.0,,,,,1963,24.772097,neck,very good,...,no walkaid,75,8,0,6.0,6.0,1.0,work demand much walking,support10,support10_1_1963
1,slightly reduced,55.0,,,,,1965,26.573132,multisite/complex,very good,...,no walkaid,75,8,8,7.0,8.0,7.0,work demand much walking,support10,support10_2_1965
2,slightly reduced,37.0,,,,,1983,27.281748,multisite/complex,good,...,no walkaid,77,8,8,5.0,,,mostly seated,support10,support10_5_1983
3,slightly reduced,62.0,,,,,1958,30.487806,multisite/complex,good,...,no walkaid,82,8,9,9.0,,9.0,work demands much walking and lifting,support10,support10_6_1958
4,quite reduced,64.0,,,,no,1956,27.428572,back,very good,...,no walkaid,84,3,2,3.0,3.0,3.0,,support10,support10_7_1956


# Write the complete updated dataset to a CSV file
## for: labeled and numeric datasets

In [18]:
# Write the labeled dataset, index column included, to the dated data folder.
df_labeled.to_csv(data_dir + '1_sp_dataset_labeled.csv')

In [19]:
print(df_labeled.columns.tolist())

['activity_1', 'age_1', 'back_nevro_pow_1', 'back_nevro_reflex_1', 'back_nevro_sens_1', 'back_slr_1', 'birth_year', 'bmi_1', 'body_main_1', 'childhood_1', 'como_asthma_1', 'como_cancer_1', 'como_count_1', 'como_dementhia_1', 'como_diabetes_1', 'como_fatigue_1', 'como_headache_1', 'como_heart_1', 'como_neuro_1', 'como_osteoporosis_1', 'como_other_1', 'como_other_spec_1', 'como_psych_1', 'como_ra_1', 'como_stomach_pain_1', 'compliance_3', 'consultation_3', 'date_baseline_patient', 'diagnose_1', 'discussed_employer_3', 'discussed_family_3', 'discussed_nav_3', 'discussed_occupational_3', 'education_1', 'education_othersp_1', 'employ_1', 'employ_assess_1', 'employ_assess_3', 'employ_disability_1', 'employ_disability_3', 'employ_disabilityp_1', 'employ_disabilityp_3', 'employ_paid_1', 'employ_paid_3', 'employ_paidp_1', 'employ_paidp_3', 'employ_parentleave_1', 'employ_parentleave_3', 'employ_retired_1', 'employ_retired_3', 'employ_sick_1', 'employ_sick_3', 'employ_sickp_1', 'employ_sickp_3',

In [20]:
# Load the labeled dataset back from its CSV file, using 'id' as the index.
df_labeled = pd.read_csv(data_dir + '1_sp_dataset_labeled.csv', index_col='id')

In [None]:
# After the CSV round-trip body_main_1 must no longer be categorical; it is
# expected to be a plain 'object' column.
assert not df_labeled['body_main_1'].dtype.name == 'category'
assert df_labeled['body_main_1'].dtype.name == 'object'

df_labeled['body_main_1'].unique()

In [None]:
# After the CSV round-trip employ_1 must no longer be categorical; it is
# expected to be a plain 'object' column.
assert not df_labeled['employ_1'].dtype.name == 'category'
assert df_labeled['employ_1'].dtype.name == 'object'

df_labeled['employ_1'].unique()

In [22]:
# Outcome columns to look at; 'out02' is currently left out.
outcome_list = ['out01']

In [23]:
df_labeled[outcome_list][:10]

Unnamed: 0_level_0,out01
id,Unnamed: 1_level_1
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,1.0


In [24]:
# Independent (deep) working copy of the labeled frame; the cleaning steps
# below are applied to both frames.
df = df_labeled.copy()

In [25]:
df.body_main_1.unique()

array(['neck', 'multisite/complex', 'back', 'hip', 'shoulder', 'knee'],
      dtype=object)

In [26]:
df.shape

(63, 352)

In [27]:
df.education_1.head()

id
0       high school
1       high school
2       high school
3    primary school
4       high school
Name: education_1, dtype: object

In [28]:
df.employ_1.head()

id
0                         working or other
1                         working or other
2                         working or other
3                         working or other
4    disability pension or work assessment
Name: employ_1, dtype: object

# 
### Round the float variables to 2 decimal places

In [30]:
# Float columns that are rounded to 2 decimal places in both frames.
# Commented-out entries are candidates that are currently not rounded.
round_float_list = ['bmi_1',
                    'hscl_score_1',

                    #'ndi_total_1',
                    #'odi_total_1',
                    #'spadi_total_1',
                    #'outcome_allpercent',

                    'hscl_score_3',
                    #'ndi_total_3',
                    #'odi_total_3',
                    #'spadi_total_3'
                   ]

for col_name in round_float_list:
    print(col_name)
    df[col_name] = df[col_name].round(2)
    # Round df_labeled from its own column (not from df) so the two frames
    # stay independent even if their contents diverge.
    df_labeled[col_name] = df_labeled[col_name].round(2)
    #df_numeric[col_name] = df_numeric[col_name].round(2)
    
# df_labeled.hscl10_mean_1 = df_labeled.hscl10_mean_1.round(2)
# df_labeled.ndi_total_1   = df_labeled.ndi_total_1.round(2)
# df_labeled.odi_total_1   = df_labeled.odi_total_1.round(2)
# df_labeled.spadi_total_1 = df_labeled.spadi_total_1.round(2)
# 
# df_numeric.hscl10_mean_3 = df_numeric.hscl10_mean_3.round(2)
# df_numeric.spadi_total_3 = df_numeric.spadi_total_3.round(2)
# df_numeric.ndi_total_3   = df_numeric.ndi_total_3.round(2)
# df_numeric.odi_total_3   = df_numeric.odi_total_3.round(2)

bmi_1
hscl_score_1
hscl_score_3


In [31]:
# for testing
#print(df_labeled[ round_float_list][:5]);
#print(df_numeric[ round_float_list][:5]);

## Drop all empty columns

#### Issues with completely empty columns?

## Function for removing all leading and trailing spaces in a dataframe

In [32]:
def remove_leading_trailing_spaces(tempdf=None):
    """Strip leading and trailing whitespace from string cells, in place.

    Only columns of dtype ``object`` are touched. Within those columns only
    values that are actual ``str`` instances are stripped; anything else
    (numbers, ``None``/NaN, ...) is left exactly as it is, so mixed-type
    columns do not lose data and columns without any strings do not raise.

    Parameters
    ----------
    tempdf : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df`` is looked up at
        call time, so the frame that is current at that moment is used.

    Returns
    -------
    pandas.DataFrame
        The same ``tempdf`` object, modified in place.
    """
    if tempdf is None:
        # Late binding: a default of `tempdf=df` would be frozen at definition
        # time and go stale once df is reassigned.
        tempdf = df
    object_columns = tempdf.select_dtypes(['object']).columns
    for column in object_columns:
        tempdf[column] = tempdf[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return tempdf

## Function to replace all occurrences of a string in a dataframe

In [33]:
# NOTE(review): this redefines replace_all_with from an earlier cell; from here
# on the fallback frame is `df` instead of `df_labeled`.
def replace_all_with(old, new, tempdf=None):
    """Return a copy of ``tempdf`` with every match of ``old`` replaced by ``new``.

    Parameters
    ----------
    old : str
        Pattern to search for. It is interpreted as a regular expression
        because the replacement is done with ``regex=True``.
    new : str
        Replacement text.
    tempdf : pandas.DataFrame, optional
        Frame to process. When omitted, the notebook-level ``df`` is looked up
        at call time, so the frame that is current at that moment is used
        rather than the one that existed when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``tempdf`` itself is not modified.
    """
    if tempdf is None:
        # Late binding: a default of `tempdf=df` would be frozen at definition
        # time and go stale once df is reassigned.
        tempdf = df
    return tempdf.replace({old: new}, regex=True)

In [34]:
# Must throw exception
# df_numeric.employment_1 = replace_all_with( 'Working,', 'Working ', df_numeric.employment_1)
# df_numeric.employment_1.unique()

## Replace all special char

### 1. '>' is replaced with 'GT' (Greater Than) and '<' is replaced with 'LT' (Lower/Less Than)

In [35]:
#df_labeled.outcome_percent01.unique()

In [36]:
# Spell out comparison symbols in both frames: '>' -> 'GT ', '≥' -> 'GT_EQ ',
# '<' -> 'LT ' (applied in that order).
for _symbol, _word in (('>', 'GT '), ('≥', 'GT_EQ '), ('<', 'LT ')):
    df = replace_all_with(_symbol, _word, df)

for _symbol, _word in (('>', 'GT '), ('≥', 'GT_EQ '), ('<', 'LT ')):
    df_labeled = replace_all_with(_symbol, _word, df_labeled)

In [37]:
#df_labeled.outcome_percent01.unique()

### 2. '-'  to  ' - ' for readability
### 3. ';'  to  ':' (colon) to compensate for the next statement (4.)
### 4. ','  to  ';' (semicolon), since commas represent multiple values for a given attribute and the result is written to a 'csv' file

In [38]:
# Put a space on each side of every '-' in both frames.
# The ';' -> ' : ' and ',' -> ' ; ' substitutions listed in the heading are
# currently not applied.
df, df_labeled = (
    replace_all_with('-', ' - ', frame) for frame in (df, df_labeled)
)

### 5. '%'  to  ' percent' to resolve the special char ambiguity

In [39]:
# Spell out '%' as ' percent' in both frames.
df, df_labeled = (
    replace_all_with('%', ' percent', frame) for frame in (df, df_labeled)
)

In [40]:
df.body_main_1.unique()

array(['neck', 'multisite/complex', 'back', 'hip', 'shoulder', 'knee'],
      dtype=object)

In [41]:
# Turn every '/' into '_' in both frames (e.g. 'multisite/complex').
df, df_labeled = (
    replace_all_with('/', '_', frame) for frame in (df, df_labeled)
)

In [42]:
df.body_main_1.unique() # The data type should not be cateorical type for python

array(['neck', 'multisite_complex', 'back', 'hip', 'shoulder', 'knee'],
      dtype=object)

In [43]:
#df.body_main_1.cat.rename_categories({'multisite/complex': 'multisite_complex'}, inplace=True)
#df_labeled.body_main_1.cat.rename_categories({'multisite/complex': 'multisite_complex'}, inplace=True)

### 6. Collapse multiple spaces in the text to resolve the ambiguity in categorical values for myCBR

In [44]:
# Collapse every run of two or more spaces into a single space.
# One regex pass handles runs of any length; replacing '   ' and then '  '
# in two passes leaves a double space behind for runs of e.g. 5 or 7 spaces.
df = replace_all_with(' {2,}', ' ', df)

df_labeled = replace_all_with(' {2,}', ' ', df_labeled)

#df_numeric = replace_all_with(' {2,}', ' ', df_numeric)

In [46]:
df.employ_1.unique() # watch for leading and trailing spaces

array([' working or other', 'disability pension or work assessment',
       'sick leave'], dtype=object)

In [47]:
#df_labeled.treat_base[:50]

### 7. The attribute value was originally of the form 'No, off and on', where ',' was replaced with ';'. But ';' is the representation for multiple values, so we need to convert ';' to ':' to keep its intended interpretation.

In [48]:
df.pain_continuous_1.head()

id
0    yes
1     no
2    yes
3    yes
4     no
Name: pain_continuous_1, dtype: object

In [49]:
df_labeled.medic_number_1.unique()

array(['1-4', '0', '5 or more', nan], dtype=object)

In [51]:
# In the medication-count columns replace '-' with 'to' in both frames and
# print the resulting distinct values of the working frame.
# NOTE(review): the replacement text carries no surrounding spaces — confirm
# whether '1to4' or '1 to 4' is the intended category label.
for _medic_col in ('medic_number_1', 'medic_number_3'):
    df[_medic_col] = df[_medic_col].replace({'-': 'to'}, regex=True)
    df_labeled[_medic_col] = df_labeled[_medic_col].replace({'-': 'to'}, regex=True)
    print(df[_medic_col].unique())


#df.medic_number_1.cat.rename_categories({'1-4': '1 to 4'}, inplace=True)
#df_labeled.medic_number_1.cat.rename_categories({'1-4': '1 to 4'}, inplace=True)
#print(df.medic_number_1.unique())
#
#df.medic_number_3.cat.rename_categories({'1-4': '1 to 4'}, inplace=True)
#df_labeled.medic_number_3.cat.rename_categories({'1-4': '1 to 4'}, inplace=True)
#print(df.medic_number_3.unique())
##df.treat_int_3 = df.treat_int_3.replace({'-': 'to'}, regex=True)
##df.treat_int_3.unique()
#
##df_labeled.treat_int_3 = df_labeled.treat_int_3.replace({'-': 'to'}, regex=True)
##df_labeled.treat_int_3.unique()

['1to4' '0' '5 or more' nan]
['0' '1to4' nan]


In [52]:
# When a 'priokey' column exists, replace ';' with '-' in it and show the
# resulting distinct values.
# Membership is tested with `in`: Index.contains() is deprecated (it emits a
# warning here) and is not available in newer pandas releases.
if 'priokey' in df.columns:
    df['priokey'] = df['priokey'].replace({';': '-'}, regex=True)

    print(df['priokey'].unique())

  """Entry point for launching an IPython kernel.


In [53]:
df.employ_1.unique()

array([' working or other', 'disability pension or work assessment',
       'sick leave'], dtype=object)

In [54]:
df.shape  # (*, 355)

(63, 352)

In [55]:
# Replace ' _ ' with '-' in como_other_spec_1 (presumably undoing the effect of
# the earlier '/' -> '_' substitution on spaced slashes — TODO confirm).
# The result is assigned back to the column: an inplace replace on a column
# pulled out of the frame is not guaranteed to update the frame itself.
df['como_other_spec_1'] = df['como_other_spec_1'].replace({' _ ': '-'}, regex=True)

In [56]:
# Replace ' _ ' with '-' in treat_goal_1 (presumably undoing the effect of the
# earlier '/' -> '_' substitution on spaced slashes — TODO confirm).
# The result is assigned back to the column: an inplace replace on a column
# pulled out of the frame is not guaranteed to update the frame itself.
df['treat_goal_1'] = df['treat_goal_1'].replace({' _ ': '-'}, regex=True)

In [57]:
df.treat_goal_1.tolist()

['Smertefri og full jobb',
 'Få hjelp til å mestre smertene og riktig tankesett',
 'mindre smerter - endre livsstil (kost, søvn, trening)',
 'Bli kvitt svimmelheten, full jobb, kommer igang med trening ',
 'Mindre smerter og bedre funksjon. Sterkere rygg og mage',
 'Opprettholde arbeidsevnen og komme igang med systematisk styrketrening',
 'Fortsett være i 50 percent arbeid, gå ned noen kilo og unngå en proteseoperasjon. ',
 'Fungere best mulig i hverdag og jobb, ned noen kilo, mindre smerter',
 'Være i fullt arbeid, opprettholde aktivitetsnivået og treningen. Øke styrke i overkropp',
 'Å bli smertefri i skulderen. Samt en generell økning i styrke i kroppen og litt opp i vekt. ',
 'Mindre smerter i hverdagen, med mindre medisinbruk. Kartlegge bredt. ',
 'Lære verktøy for påfyll i hverdagen uten å ta beredskapen_kroppslige forsvaret helt.',
 'Jobbe med avspenning for dermed å forhåpetligvis påvirke smerten positivt.',
 'Smertefrihet.',
 'Smertefrihet.',
 'Smertefrihet, tillit til og god 

In [58]:
#df.outcome_percent01 = df.outcome_percent01.replace({'â¥50': 'GT_EQ 50'}, regex=True)
#df.outcome_percent01.unique()

In [59]:
#df[['outcome_allpercent','outcome_percent','outcome_percent01']][:100]

In [60]:
#df_labeled.outcome_percent.unique()

In [61]:
#df_labeled.outcome_percent = df_labeled.outcome_percent.replace({'-': 'to'}, regex=True)
#df_labeled.outcome_percent = df_labeled.outcome_percent.replace({'change/worse': 'change_worse'}, regex=True)
#df_labeled.outcome_percent.unique()

## Remove leading or trailing spaces

In [62]:
# Trim leading and trailing whitespace in the text columns of both frames.
df, df_labeled = (
    remove_leading_trailing_spaces(tempdf=frame) for frame in (df, df_labeled)
)

In [63]:
df.employ_1.unique()

array(['working or other', 'disability pension or work assessment',
       'sick leave'], dtype=object)

## Write the feature engineered dataframe to a "csv" file for myCBR analysis

In [64]:
# Write the cleaned working frame, index column included, to the dated data folder.
df.to_csv(data_dir + '2_sp_dataset_clean.csv')