In [101]:
import pandas as pd
# Disable na_filter as the unique_name 'NA' (North America) is interpreted as NaN
from pandas._libs.parsers import STR_NA_VALUES
accepted_na_values = STR_NA_VALUES - {'NA'}

df = pd.read_csv("data/regionmodel_reduced.csv", keep_default_na=False, na_values=accepted_na_values)
new_df = df.drop(['Reference','Note'], axis=1)
new_df.iloc[110]

Parent-Region                         South Asia
Region                                    Bhutan
SQL unique_name                              BTN
time to 75% daily utility in weeks           NaN
time to 25% daily utility in weeks           NaN
Aver. cost per week                        250.0
jan                                            o
feb                                            o
mar                                            +
apr                                            +
may                                            o
jun                                            -
jul                                            -
aug                                            -
sep                                            +
oct                                           ++
nov                                           ++
dec                                            +
Safety from crime                             ++
nature & wildlife                             ++
hiking              

Plenty of values are missing and in the wrong representation.
We first replace missing values in `SQL unique_name` with empty strings.
Multiple columns are using likert scales for their ratings (`'---'`,...,`'o'`, ...,`'+++'`). We convert those to a numeric scale (1-7).

In [102]:
new_df['SQL unique_name'] = df['SQL unique_name'].fillna(value='')

likert_cols = list(new_df.columns)[6:]
remap = dict(zip(['---', '--', '-', 'o', '+', '++', '+++'], list(range(1,8))))
new_df[likert_cols] = new_df[likert_cols].replace(remap)

new_df.iloc[110]
likert_cols

['jan',
 'feb',
 'mar',
 'apr',
 'may',
 'jun',
 'jul',
 'aug',
 'sep',
 'oct',
 'nov',
 'dec',
 'Safety from crime',
 'nature & wildlife',
 'hiking',
 'beach',
 'watersports',
 'entertainment',
 'wintersports',
 'culture',
 'culinary',
 'cities & architecture',
 'shopping']

We fill in all the missing values.

In [108]:
def fillInParentRow(index, startCol, endCol, parent, row):
    if (row.icol[0] == parent[1]):
        for i in range(startCol, endCol):
            if (pd.isna(row.icol[i])):
                new_df.iat[index, i] = parent.iloc[i]
            
parent_row = pd.Series([], dtype='float64')
    
for index in new_df.index:
    row = new_df.iloc[index]
    fillInParentRow(index, 3, 29, parent_row, row)
    
    if (row.iloc[2] == ''):
        fill
        parent_row = row
        continue
    elif (row.iloc[0] == parent_row.iloc[1]):
        fillInParentRow(index, 3, 29, parent_row, row)
        #for i in range(3, 28 + 1):
        #    if (pd.isna(row.iloc[i]) and (row.iloc[0] == parent_row.iloc[1])):
        #        new_df.iat[index, i] = parent_row.iloc[i]
        continue
    
    #print("edge case at index: ", index, " with parent_row: ", parent_row, " and row: ", row)
            
print(new_df.shape)
new_df

edge case at index:  15  with parent_row:  Parent-Region                         North America
Region                                       Canada
SQL unique_name                                    
time to 75% daily utility in weeks                1
time to 25% daily utility in weeks                8
Aver. cost per week                             725
jan                                            <NA>
feb                                            <NA>
mar                                            <NA>
apr                                            <NA>
may                                            <NA>
jun                                            <NA>
jul                                            <NA>
aug                                            <NA>
sep                                            <NA>
oct                                            <NA>
nov                                            <NA>
dec                                            <NA>
Safety from crime    

Unnamed: 0,Parent-Region,Region,SQL unique_name,time to 75% daily utility in weeks,time to 25% daily utility in weeks,Aver. cost per week,jan,feb,mar,apr,...,nature & wildlife,hiking,beach,watersports,entertainment,wintersports,culture,culinary,cities & architecture,shopping
0,,World,,1,4,400,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
1,World,Europe,EU,1,4,400,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
2,World,North America,,1,4,400,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
3,World,Middle America and Caribbean,MA,1,4,400,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
4,World,South America,SA,1,4,400,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,Southern Europe,"Spain, Canary islands",ESP_CA,1,2,350,5,5,5,6,...,4,4,5,4,5,2,4,4,4,3
193,Southern Europe,"Portugal, mainland",PRT,1,5,350,3,3,4,6,...,4,4,5,4,4,2,5,5,5,3
194,Southern Europe,"Portugal, islands",PRT_IS,1,8,350,3,3,4,6,...,5,5,5,5,4,2,5,4,4,3
195,Southern Europe,Turkey,TUR,1,6,350,3,3,4,6,...,4,4,5,4,4,4,6,5,5,4


In [104]:
new_df.dtypes

Parent-Region                          object
Region                                 object
SQL unique_name                        object
time to 75% daily utility in weeks    float64
time to 25% daily utility in weeks    float64
Aver. cost per week                   float64
jan                                   float64
feb                                   float64
mar                                   float64
apr                                   float64
may                                   float64
jun                                   float64
jul                                   float64
aug                                   float64
sep                                   float64
oct                                   float64
nov                                   float64
dec                                   float64
Safety from crime                     float64
nature & wildlife                     float64
hiking                                float64
beach                             

We finally convert all scales columns to `int64` instead of `float64`

In [105]:
float_cols = new_df.select_dtypes(include=['float64'])
for col in float_cols.columns.values:
    new_df[col] = new_df[col].astype('Int64')
    
    
print('Original df shape was: ', df.shape, ', dropped two columns and now have: ', new_df.shape)

Original df shape was:  (197, 31) , dropped two columns and now have:  (197, 29)


Save as csv

In [106]:
new_df.to_csv('data/regionmodel_output_cleaned.csv')