In [1]:
# Encoding Clean Data and Adding Relevant Information (Like Content Change) and Metropolitan adn Town Size Typology 

#three .csv's files will be needed to run this code:
# file "data_clean.csv" (from previous file: "0_clean.ipnyb")
# file "city_metro_size.csv" (with city size and metropolitan classification)
# file "file_content.csv" (with binary description of elements of change of each image)

import pandas as pd
df = pd.read_csv('data_clean.csv')

#  _Well Defined_: No Adjustments Needed After New Raw Dataset 

### Transforming 1 instance (for each participant) in the correspondent 8 entries (with the 8 preferences)

In [2]:
# columns to keep in each new instance
other_columns = [ 'id',
    'age', 'gender', 'residence_city', 'residence_country', 'nationality',
    'children', 'children_0-5', 'children_6-10', 'children_11',
    'pet_Y_dog', 'pet_Y_other', 'pet_N',
    'disability_Y_mobility', 'disability_Y_other', 'disability_N',
    'occupation', 'specialist',
    'role_academic', 'role_professional',
    'role_public',
    'role_private', 'role_technology',
    'role_nonprofit',
    'role_other',
    'commute_Walking', 'commute_Train', 'commute_Subway', 'commute_Bus',
    'commute_Private Car', 'commute_Motorcycle', 'commute_Bicycle', 'commute_Scooter',
    'commute_TVDE', 'commute_home', 'commute_Other', 'commute_time', 'own_car',
    'parking_home', 'parking_work'
]

# defining the groups
groups = range(101, 409, 1)  # From G101 to G408

# initializing an empty list to store reshaped data
reshaped_data = []

#recall that each 1 instance (for each participant) has the following variables: 'G101_usefulness', 'G101_safety', 'G101_security', 'G101_stress', 'G101_comfort', 'G101_order', 'G101_liveability'

for group in groups:
    # Define the preference and context columns for the current group
    pref_col = f'G{group}_preference'
    context_col = f'G{group}_context'
    usefulness_col = f'G{group}_usefulness'
    safety_col = f'G{group}_safety'
    security_col = f'G{group}_security'
    stress_col = f'G{group}_stress'
    comfort_col = f'G{group}_comfort'
    order_col = f'G{group}_order'
    liveability_col = f'G{group}_liveability'
    
    if pref_col not in df.columns or context_col not in df.columns:
        continue  # Skip if the group columns are not present
    
    # Iterate over each row in the original DataFrame
    for index, row in df.iterrows():
        # Check if preference column is NaN
        if pd.isna(row[pref_col]):
            continue  # Skip rows where preference is NaN
        
        # Extract the values for the current row
        base_data = row[other_columns].to_dict()
        
        # Create a new row for the preference and context of this group
        reshaped_data.append({
            'group': group,
            'preference': row[pref_col],
            'context': row[context_col],
            'usefulness': row[usefulness_col],
            'safety': row[safety_col],
            'security': row[security_col],
            'stress': row[stress_col],
            'comfort': row[comfort_col],
            'order': row[order_col],
            'liveability': row[liveability_col],   
            **base_data
        })

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(reshaped_data)

### Dividing variable *preference* in dependent variables *intervention_type*, *dimension* and independent variable *Y* 

In [3]:
df.head()

Unnamed: 0,group,preference,context,usefulness,safety,security,stress,comfort,order,liveability,...,commute_Motorcycle,commute_Bicycle,commute_Scooter,commute_TVDE,commute_home,commute_Other,commute_time,own_car,parking_home,parking_work
0,101,2_pedestrian_l_a,N,2.0,2.0,-2.0,2.0,2.0,2.0,2.0,...,,Y,,,,,<15,N,,
1,101,2_pedestrian_l_a,N,2.0,0.0,-1.0,1.0,1.0,1.0,2.0,...,,,,,,,15-30,Y,permit,permit
2,101,2_pedestrian_l_a,N,1.0,0.0,0.0,1.0,1.0,-1.0,2.0,...,,,,,,,15-30,Y,free,permit
3,101,2_pedestrian_l_a,N,1.0,1.0,2.0,1.0,0.0,1.0,1.0,...,Y,,,,,,<15,Y,permit,not free
4,101,2_pedestrian_l_a,N,0.0,2.0,1.0,1.0,0.0,1.0,2.0,...,,Y,,,,,30-45,N,,


In [4]:
# Split the 'preference' column into three parts
df[['ID_intervention','intervention_type', 'dimension', 'Y']] = df['preference'].str.split('_', expand=True)
# Convert Y_value to numeric (1 for 'a', 0 for 'b')
df['Y'] = df['Y'].map({'a': 1, 'b': 0})

In [5]:
# Drop the original columns,since it's no longer necessary
df.drop(columns=['preference'], inplace=True)

In [6]:
# Dummy encoding ensuring binary output
dummy_encoded = pd.get_dummies(df['intervention_type']).astype(int)
# Add dummy columns back to the original dataframe (optional)
df = pd.concat([df, dummy_encoded], axis=1)

In [7]:
import numpy as np
#quero criar nova coluna chamada "intervention", diferente de "intervention_type" para ter só as valores de active, transit e car  & df["intervention_type"] == "shared" & df["intervention_type"] == "shared"
df["intervention"] = np.where((df["intervention_type"] == "pedestrian") | (df["intervention_type"] == "shared") | (df["intervention_type"] == "free"), "active", df['intervention_type'])

In [8]:
df[["intervention_type", "intervention"]].sample(30)

Unnamed: 0,intervention_type,intervention
379,free,active
724,pedestrian,active
285,transit,transit
1396,free,active
111,pedestrian,active
1046,transit,transit
577,pedestrian,active
537,transit,transit
1416,free,active
1107,shared,active


### Binary encoding of varibles *context*, *own_car*, *specialist*, *children*

In [9]:
##Encoding das Variáveis Binárias Y and N em 1 e 0, nenhuma tem nan values 
df['context'] = df['context'].replace({'Y': 1, 'N': 0}).astype(int)
df['own_car'] = df['own_car'].replace({'Y': 1, 'N': 0}).astype(int)
df['specialist'] = df['specialist'].replace({'Y': 1, 'N': 0}).astype(int)
df['children'] = df['children'].replace({'Y': 1, 'N': 0}).astype(int)

  df['context'] = df['context'].replace({'Y': 1, 'N': 0}).astype(int)
  df['own_car'] = df['own_car'].replace({'Y': 1, 'N': 0}).astype(int)
  df['specialist'] = df['specialist'].replace({'Y': 1, 'N': 0}).astype(int)
  df['children'] = df['children'].replace({'Y': 1, 'N': 0}).astype(int)


### Binary encoding of varibles *role_type* and *commute_type*

In [10]:
# Define the columns you want to encode
columns_to_encode = [
    'role_academic', 'role_professional', 'role_public', 'role_private',
    'role_technology', 'role_nonprofit',
    'commute_Walking', 'commute_Train', 'commute_Subway', 'commute_Bus',
    'commute_Private Car', 'commute_Motorcycle', 'commute_Bicycle',
    'commute_Scooter', 'commute_TVDE', 'commute_home'
]

# Encode 'Y' as 1 and NaN as 0, and cast the result to integer
for column in columns_to_encode:
    df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)

  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y': 1, 'N': 0}).fillna(0).astype(int)
  df[column] = df[column].replace({'Y'

### Ordinal encoding of variables *commute_time*

In [11]:
# Encoding the scale in commute time 
#  Define the mapping
commute_mapping = {
    'home': 0,
    '<15': 1,
    '15-30': 2,
    '30-45': 3,
    '+45': 4
}

# Replace the commute_time values with their corresponding numeric scale
df['commute_time'] = df['commute_time'].replace(commute_mapping).fillna(0).astype(int)

  df['commute_time'] = df['commute_time'].replace(commute_mapping).fillna(0).astype(int)


### Converting categorical variable _dimension_ into scale 

In [12]:
# Mapping dictionary to convert categorical values to numerical values
dimension_mapping = {
    'm': 1,  # 'm' becomes 1
    'l': 2,  # 'l' becomes 2
    't': 3   # 't' becomes 3
}

# Apply the mapping to the 'dimension' column
df['dimension'] = df['dimension'].map(dimension_mapping)

### Binary Encoding : categorical variable _gender_ converted into two binary columns *female* and *other_gender* 

In [13]:
# Initialize the new binary columns with default values
df['female'] = 0
df['other_gender'] = 0

# Update the columns based on the 'gender' values
df.loc[df['gender'] == 'F', 'female'] = 1
df.loc[df['gender'].isin(['O', 'N']), 'other_gender'] = 1

#para verificar se está correto:
df[df['other_gender'] == 1][['female', 'gender', 'other_gender']].head()

Unnamed: 0,female,gender,other_gender
1013,0,N,1
1015,0,O,1
1075,0,N,1
1077,0,O,1
1137,0,N,1


In [14]:
import numpy as np
#quero criar nova coluna chamada "intervention", diferente de "intervention_type" para ter só as valores de active, transit e car  & df["intervention_type"] == "shared" & df["intervention_type"] == "shared"
df["gender"] = np.where((df["gender"] == "N"), "O", df['gender'])

In [15]:
df[["gender", "female", "other_gender"]].sample(30)

Unnamed: 0,gender,female,other_gender
273,F,1,0
508,F,1,0
785,M,0,0
978,M,0,0
1326,M,0,0
439,F,1,0
146,F,1,0
1323,O,0,1
1203,M,0,0
936,F,1,0


### Min-max scaling of variable *age* 

In [16]:
# Perform min-max scaling on the 'age' column
df['age_scaled'] = (df['age'] - df['age'].min()) / (df['age'].max() - df['age'].min())

### Binary encoding variables specialists' *role_....*

In [None]:
# Define the roles and their corresponding columns
roles = {
    'academic': 'role_academic',
    'professional': 'role_professional',
    'public': 'role_public',
    'private': 'role_private',
    'technology': 'role_technology',
    'nonprofit': 'role_nonprofit'
}

# Create the 'specialist_role' column by combining roles with value 1
df['specialist_role'] = df.apply(lambda row: ' '.join([role for role, col in roles.items() if row[col] == 1]), axis=1)

# Show the first few rows to verify
df[['specialist','role_academic', 'role_professional', 'role_public', 'role_private', 'role_technology', 'role_nonprofit', 'specialist_role']].head()

Unnamed: 0,specialist,role_academic,role_professional,role_public,role_private,role_technology,role_nonprofit,specialist_role
0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,
4,1,1,1,0,1,0,0,academic professional private


### Binary encoding of variables *parking_home* and *parking_work*

In [18]:
# Get unique values in 'parking_home'
unique_parking_home = df['parking_home'].unique()
print("Unique values in 'parking_home':", unique_parking_home)

# Get unique values in 'parking_work'
unique_parking_work = df['parking_work'].unique()
print("Unique values in 'parking_work':", unique_parking_work)

Unique values in 'parking_home': [nan 'permit' 'free' 'not free']
Unique values in 'parking_work': [nan 'permit' 'not free' 'free']


In [19]:
# Create binary columns for 'free' and 'permit'
df['parking_home_free'] = (df['parking_home'] == 'free').astype(int)
df['parking_home_permit'] = (df['parking_home'] == 'permit').astype(int)
df['parking_home_notfree'] = (df['parking_home'] == 'not free').astype(int)

# Drop rows where 'parking_home' is 'not free' or NaN
#df_encoded = df.dropna(subset=['parking_home'])
#df_encoded = df_encoded[df_encoded['parking_home'].isin(['free', 'permit'])]

# Display the updated DataFrame
df[df['parking_home_free'] == 0][['parking_home', 'parking_home_permit', 'parking_home_free', 'parking_home_notfree']].head()

Unnamed: 0,parking_home,parking_home_permit,parking_home_free,parking_home_notfree
0,,0,0,0
1,permit,1,0,0
3,permit,1,0,0
4,,0,0,0
7,,0,0,0


In [20]:
# Create binary columns for 'free' and 'permit'
df['parking_work_free'] = (df['parking_work'] == 'free').astype(int)
df['parking_work_permit'] = (df['parking_work'] == 'permit').astype(int)
df['parking_work_notfree'] = (df['parking_work'] == 'not free').astype(int)

# Drop rows where 'parking_home' is 'not free' or NaN
#df_encoded = df.dropna(subset=['parking_work'])
#df_encoded = df_encoded[df_encoded['parking_work'].isin(['free', 'permit'])]

# Display the updated DataFrame
df[df['parking_work_free'] == 0][['parking_work', 'parking_work_permit', 'parking_work_free', 'parking_work_notfree']].head()

Unnamed: 0,parking_work,parking_work_permit,parking_work_free,parking_work_notfree
0,,0,0,0
1,permit,1,0,0
2,permit,1,0,0
3,not free,0,0,1
4,,0,0,0


# _To Check_ : Might Need Adjustements After New Dataset

### Converting varibles *disability_Y_mobility*, *disability_Y_other* and *disability_N* into binary column *disability*

only indicating the presence or absence of a disability, since there were not enough participants with different types of disabilities (auditory, mobility, visual)

In [21]:
#já que não há outras disabilities, para além de mobility e other!! 
df['disability'] = (df[['disability_Y_mobility', 'disability_Y_other']].fillna('N').apply(lambda x: 1 if 'Y' in x.values else 0, axis=1))

In [22]:
df.drop(columns=['disability_Y_mobility', 'disability_Y_other', 'disability_N'], inplace=True)

### Converting variables *pet_Y_dog*, *pet_Y_other* into binary column *pet* and *pet_Y_dog* into binary column *dog* 

In [23]:
df['pet'] = (df[['pet_Y_dog', 'pet_Y_other']].fillna('0').apply(lambda x: 1 if 'Y' in x.values else 0, axis=1))

In [24]:
df['dog'] = (
    df[['pet_Y_dog']]
    .fillna('0')
    .apply(lambda x: 1 if 'Y' in x.values else 0, axis=1)
)

In [25]:
df.drop(columns=['pet_Y_dog', 'pet_Y_other', 'pet_N'], inplace=True)

### Working with the variables *commute_*

In [26]:
df['commute_motorscooter'] = df['commute_Motorcycle'] | df['commute_Scooter']
df.drop(columns=['commute_Motorcycle', 'commute_Scooter'], inplace=True)

In [27]:
# Find unique values in the 'commute_other' column
unique_commute_others = df['commute_Other'].unique()

# Print unique values
print(unique_commute_others)

[nan 'Tramway' 'Boat'
 'I work from home most of the time, but when I commute to the office I use the train, subway, TVDE and I walk on foot. Occasionally I use the bicycle or the bus.'
 'Carsharing' 'Bus' 'tram' 's(peed)-Pedelec']


In [28]:
# Update 'commute_Bus' where 'commute_Other' is 'Bus'
df['commute_Bus'] = df['commute_Bus'] | (df['commute_Other'] == 'Bus').astype(int)

# Update 'commute_Bus' where 'commute_Other' is 'Bus'
df['commute_Train'] = df['commute_Train'] | (df['commute_Other'] == 'tram').astype(int)
df['commute_Train'] = df['commute_Train'] | (df['commute_Other'] == 'Tramway').astype(int)
df['commute_Train'] = df['commute_Train'] | (df['commute_Other'] == 'I work from home most of the time, but when I commute to the office I use the train, subway, TVDE and I walk on foot. Occasionally I use the bicycle or the bus.').astype(int)

# Create 'commute_Boat' column based on 'commute_Other'
df['commute_Boat'] = (df['commute_Other'] == 'Boat').astype(int)
df['commute_Bicycle'] = df['commute_Bicycle'] | (df['commute_Other'] == 's(peed)-Pedelec').astype(int)
df['commute_TVDE'] = df['commute_TVDE'] | (df['commute_Other'] == 'Carsharing').astype(int)

In [29]:
df.drop(columns=['commute_Other'], inplace=True)

# New Categories: _metropolitan_, *city_size*, *european*

### Category: _metropolitan_ (binary:yes/no) and _city _ size_ (small_town;outskirt;large_town)

In [30]:
city_info = pd.read_csv('city_metro_v2.csv')

df = pd.merge(df, city_info, how='left', on='residence_city')

In [31]:
df[["metropolitan", "city_size", "urban_density"]]

Unnamed: 0,metropolitan,city_size,urban_density
0,metropolitan,large_town,6452.0
1,metropolitan,large_town,6452.0
2,metropolitan,outskirt,2525.0
3,metropolitan,large_town,6452.0
4,metropolitan,outskirt,2590.0
...,...,...,...
1499,metropolitan,large_town,854.1
1500,metropolitan,large_town,6452.0
1501,metropolitan,large_town,4110.0
1502,non_metropolitan,small_town,702.1


In [32]:
df['urban_density'] = df['urban_density'].fillna(df['urban_density'].mean())
df['urban_density_sc'] = (df['urban_density'] - df['urban_density'].min()) / (df['urban_density'].max() - df['urban_density'].min())

df[["urban_density_sc", "urban_density"]]

Unnamed: 0,urban_density_sc,urban_density
0,0.263347,6452.0
1,0.263347,6452.0
2,0.102231,2525.0
3,0.263347,6452.0
4,0.104898,2590.0
...,...,...
1499,0.033678,854.1
1500,0.263347,6452.0
1501,0.167260,4110.0
1502,0.027441,702.1


### Category: *europe_resindence* and *europe_nationality* (binary:yes/no) 

In [33]:
df["residence_country"].unique()

array(['portugal', 'netherlands', 'belgium', 'united kingdom', 'brazil',
       'germany', 'israel', 'spain', 'switzerland', 'hungary', 'chile',
       'italy', 'finland', 'sweden', 'greece', 'other', 'denmark',
       'canada', 'ireland', 'united states', 'angola', 'kuwait', 'japan',
       'estonia'], dtype=object)

In [34]:
df["residence_country"].value_counts()/8

residence_country
portugal          100.0
brazil             22.0
germany            20.0
spain               8.0
netherlands         5.0
belgium             5.0
chile               4.0
finland             3.0
israel              2.0
united kingdom      2.0
sweden              2.0
switzerland         2.0
other               2.0
hungary             1.0
italy               1.0
greece              1.0
denmark             1.0
canada              1.0
ireland             1.0
united states       1.0
angola              1.0
kuwait              1.0
japan               1.0
estonia             1.0
Name: count, dtype: float64

In [35]:
df["residence_country"].unique()

array(['portugal', 'netherlands', 'belgium', 'united kingdom', 'brazil',
       'germany', 'israel', 'spain', 'switzerland', 'hungary', 'chile',
       'italy', 'finland', 'sweden', 'greece', 'other', 'denmark',
       'canada', 'ireland', 'united states', 'angola', 'kuwait', 'japan',
       'estonia'], dtype=object)

In [36]:
european_countries = [
    'portugal', 'germany', 'spain', 'belgium', 'netherlands', 'finland', 
    'united kingdom', 'sweden', 'switzerland', 'denmark', 'ireland', 
    'hungary', 'greece', 'italy', 'estonia', 'france', 'poland', 'turkey']

american_countries = ['brazil', "chile", "canada", "united states", "colombia", "peru"]

african_countries = ['cape verde', 'mozambique', 'egypt', 'nigeria', 'cameroon', "angola"]

asian_countries = ["iran", "china", "israel", "jordan", "syria", "india", "japan", "kuwait"]

oceania_countries = ["australia"]

df['europe_residence'] = df['residence_country'].apply(lambda x: 1 if x in european_countries else 0)
df['europe_nationality'] = df['nationality'].apply(lambda x: 1 if x in european_countries else 0)

df['american_residence'] = df['residence_country'].apply(lambda x: 1 if x in american_countries else 0) #vale a pena
df['american_nationality'] = df['nationality'].apply(lambda x: 1 if x in american_countries else 0)

df['african_residence'] = df['residence_country'].apply(lambda x: 1 if x in african_countries else 0) #só há 1 pessoa
df['african_nationality'] = df['nationality'].apply(lambda x: 1 if x in african_countries else 0) #até há várias 

df['asian_residence'] = df['residence_country'].apply(lambda x: 1 if x in asian_countries else 0) #só há 4
df['asian_nationality'] = df['nationality'].apply(lambda x: 1 if x in asian_countries else 0) #9 


In [37]:
df["asian_nationality"].value_counts()/8

asian_nationality
0    179.0
1      9.0
Name: count, dtype: float64

In [38]:
df["residence_country"]

0          portugal
1          portugal
2          portugal
3          portugal
4       netherlands
           ...     
1499         brazil
1500       portugal
1501        germany
1502       portugal
1503         brazil
Name: residence_country, Length: 1504, dtype: object

In [39]:
column_names = df.columns
print(column_names)

Index(['group', 'context', 'usefulness', 'safety', 'security', 'stress',
       'comfort', 'order', 'liveability', 'id', 'age', 'gender',
       'residence_city', 'residence_country', 'nationality', 'children',
       'children_0-5', 'children_6-10', 'children_11', 'occupation',
       'specialist', 'role_academic', 'role_professional', 'role_public',
       'role_private', 'role_technology', 'role_nonprofit', 'role_other',
       'commute_Walking', 'commute_Train', 'commute_Subway', 'commute_Bus',
       'commute_Private Car', 'commute_Bicycle', 'commute_TVDE',
       'commute_home', 'commute_time', 'own_car', 'parking_home',
       'parking_work', 'ID_intervention', 'intervention_type', 'dimension',
       'Y', 'car', 'free', 'pedestrian', 'shared', 'transit', 'intervention',
       'female', 'other_gender', 'age_scaled', 'specialist_role',
       'parking_home_free', 'parking_home_permit', 'parking_home_notfree',
       'parking_work_free', 'parking_work_permit', 'parking_work_notfr

In [40]:
df[["residence_country","nationality","europe_residence", "europe_nationality"]]

Unnamed: 0,residence_country,nationality,europe_residence,europe_nationality
0,portugal,portugal,1,1
1,portugal,portugal,1,1
2,portugal,portugal,1,1
3,portugal,brazil,1,0
4,netherlands,netherlands,1,1
...,...,...,...,...
1499,brazil,brazil,0,0
1500,portugal,portugal,1,1
1501,germany,portugal,1,1
1502,portugal,portugal,1,1


### Category: _emigrant_ (binary)

In [41]:
df['europe_nationality'].value_counts()



europe_nationality
1    984
0    520
Name: count, dtype: int64

In [42]:
df["migrant"] = np.where(df["residence_country"] != df["nationality"], 1, 0)

df.groupby("migrant").count()/8

Unnamed: 0_level_0,group,context,usefulness,safety,security,stress,comfort,order,liveability,id,...,urban_density,urban_density_sc,europe_residence,europe_nationality,american_residence,american_nationality,african_residence,african_nationality,asian_residence,asian_nationality
migrant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,...,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0
1,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0


In [43]:
df[["residence_country","nationality","migrant"]]

Unnamed: 0,residence_country,nationality,migrant
0,portugal,portugal,0
1,portugal,portugal,0
2,portugal,portugal,0
3,portugal,brazil,1
4,netherlands,netherlands,0
...,...,...,...
1499,brazil,brazil,0
1500,portugal,portugal,0
1501,germany,portugal,1
1502,portugal,portugal,0


# Adding .CSV of Content/Elements Target of Change to the dataset (and some initial results)

In [44]:
content_change = pd.read_csv('content_v2.csv')
df['ID_intervention'] = df['ID_intervention'].astype(pd.Int64Dtype())

df = pd.merge(df, content_change, on=['Y', 'ID_intervention'])

In [45]:
# Replace values in the 'metropolitan' column
df['metropolitan_encoded'] = df['metropolitan'].replace({'metropolitan': 1, 'non_metropolitan': 0}).fillna(1).astype(int)

  df['metropolitan_encoded'] = df['metropolitan'].replace({'metropolitan': 1, 'non_metropolitan': 0}).fillna(1).astype(int)


In [46]:
df[["metropolitan_encoded","metropolitan"]]

Unnamed: 0,metropolitan_encoded,metropolitan
0,1,metropolitan
1,1,metropolitan
2,1,metropolitan
3,1,metropolitan
4,1,metropolitan
...,...,...
1499,1,metropolitan
1500,1,metropolitan
1501,1,metropolitan
1502,0,non_metropolitan


In [47]:
# Replace values and handle NaN
df['city_size_encoded'] = df['city_size'].replace({
    'large_town': 3,
    'small_town': 1,
    'outskirt': 2
}).fillna(3).astype(int)

# Verify the changes
print(df['city_size'].unique())

['large_town' 'outskirt' 'small_town' nan]


  df['city_size_encoded'] = df['city_size'].replace({


In [48]:
#generating ID_image for each image (Y+ID_intervention)
df['ID_image'] = df['ID_intervention'].astype(str) + '_' + df['Y'].astype(str)

#generating ID_image for each image (Y+ID_intervention)
content_change['ID_image'] = content_change['ID_intervention'].astype(str) + '_' + content_change['Y'].astype(str)

In [49]:
print(df['ID_image'].unique())
print(content_change['ID_image'].unique())

print(df['ID_image'].nunique())
print(content_change['ID_image'].nunique())

['2_1' '2_0' '7_1' '7_0' '55_1' '55_0' '18_1' '18_0' '49_1' '49_0' '37_0'
 '37_1' '33_1' '33_0' '30_0' '30_1' '63_1' '60_1' '60_0' '34_0' '34_1'
 '72_1' '72_0' '25_1' '25_0' '4_1' '4_0' '15_1' '15_0' '64_1' '64_0' '9_1'
 '9_0' '50_1' '31_0' '31_1' '69_1' '69_0' '66_1' '66_0' '39_1' '39_0'
 '67_1' '67_0' '41_1' '41_0' '20_0' '20_1' '10_1' '10_0' '62_1' '62_0'
 '68_0' '68_1']
['2_1' '2_0' '7_1' '7_0' '55_1' '55_0' '18_1' '18_0' '49_1' '49_0' '37_1'
 '37_0' '33_1' '33_0' '30_1' '30_0' '63_1' '63_0' '60_1' '60_0' '34_1'
 '34_0' '72_1' '72_0' '25_1' '25_0' '4_1' '4_0' '15_1' '15_0' '64_1'
 '64_0' '9_1' '9_0' '50_1' '50_0' '31_1' '31_0' '69_1' '69_0' '66_1'
 '66_0' '39_1' '39_0' '67_1' '67_0' '41_1' '41_0' '20_1' '20_0' '10_1'
 '10_0' '62_1' '62_0' '68_1' '68_0']
54
56


<b>Reminder</b>
- O questionário tem 28 intervenções diferentes, i.e 28 pares de imagens.
- Tendo assim 56 imagens no total;
- O ficheiro das respostas apenas mostrou ter 54 imagens preferidas (respostas/images diferentes)

<b>Conclusão</b>
- Portanto, houve 2 imagens que nunca foram selecionadas por nenhum dos participantes! (até agora 188 participantes) (✿◠‿◠) 

In [50]:
set_df1 = set(df['ID_image'].unique())
set_df2 = set(content_change['ID_image'].unique())

# Find IDs in df1 that are not in df2
only_in_df1 = set_df1 - set_df2

# Find IDs in df2 that are not in df1
only_in_df2 = set_df2 - set_df1

# Optionally, you can combine both to get all non-matching IDs
all_non_matching_ids = only_in_df1.union(only_in_df2)

# Print the results
print("IDs only in df1:", only_in_df1)
print("IDs only in df2:", only_in_df2)
print("All non-matching IDs:", all_non_matching_ids)

IDs only in df1: set()
IDs only in df2: {'63_0', '50_0'}
All non-matching IDs: {'63_0', '50_0'}


# Reorder

In [51]:
#joined shared + pedestrian + free in active modes, t be evaluated both indivually and collectively
df['active'] = np.where((df['shared'] == 1) | (df['free'] == 1) | (df['pedestrian'] == 1), 1, 0)

In [52]:
#public_transport_modes = ['commute_Bus', 'commute_Subway', 'commute_Train', 'commute_Boat']
#active_modes = ['commute_Bicycle', 'commute_Walking', "commute_motorscooter "]
#car_modes = ['commute_Private Car', 'commute_TVDE']

df['pt_commute'] = ((df['commute_Bus'] == 1) | 
                    (df['commute_Subway'] == 1) | 
                    (df['commute_Train'] == 1) | 
                    (df['commute_Boat'] == 1)).astype(int)

df['active_commute'] = ((df['commute_Bicycle'] == 1) | 
                         (df['commute_Walking'] == 1) | 
                         (df['commute_motorscooter'] == 1)).astype(int)

df['car_commute'] = ((df['commute_Private Car'] == 1) | 
                     (df['commute_TVDE'] == 1)).astype(int)

In [53]:
list(df.columns)

['group',
 'context',
 'usefulness',
 'safety',
 'security',
 'stress',
 'comfort',
 'order',
 'liveability',
 'id',
 'age',
 'gender',
 'residence_city',
 'residence_country',
 'nationality',
 'children',
 'children_0-5',
 'children_6-10',
 'children_11',
 'occupation',
 'specialist',
 'role_academic',
 'role_professional',
 'role_public',
 'role_private',
 'role_technology',
 'role_nonprofit',
 'role_other',
 'commute_Walking',
 'commute_Train',
 'commute_Subway',
 'commute_Bus',
 'commute_Private Car',
 'commute_Bicycle',
 'commute_TVDE',
 'commute_home',
 'commute_time',
 'own_car',
 'parking_home',
 'parking_work',
 'ID_intervention',
 'intervention_type',
 'dimension',
 'Y',
 'car',
 'free',
 'pedestrian',
 'shared',
 'transit',
 'intervention',
 'female',
 'other_gender',
 'age_scaled',
 'specialist_role',
 'parking_home_free',
 'parking_home_permit',
 'parking_home_notfree',
 'parking_work_free',
 'parking_work_permit',
 'parking_work_notfree',
 'disability',
 'pet',
 'dog',
 '

In [54]:
ID_image = ['ID_intervention','Y','group', 'ID_image']
ID_participant = ['id']
interventions = ['intervention_type',"intervention",'dimension']

context_likert = ['context','usefulness','safety','security','stress','comfort','order','liveability']

socio_demographics = ['age','gender','residence_city','residence_country','nationality','children','children_0-5','children_6-10','children_11', "occupation"]
socio_demo_encoded = ['female','other_gender','age_scaled', 'disability','pet','dog']
specialist = ['specialist',"specialist_role", 'role_academic','role_professional','role_public','role_private','role_technology','role_nonprofit']

travel_behavior = ['commute_Walking','commute_Train','commute_Subway','commute_Bus','commute_Private Car','commute_Bicycle','commute_TVDE','commute_home','commute_time','own_car',

 'commute_motorscooter',
 'commute_Boat', "pt_commute", "car_commute", "active_commute"]

parking = ['parking_home',
            'parking_work',
            'parking_home_free',
 'parking_home_permit',
 'parking_work_free',
 'parking_work_permit','parking_work_notfree', 'parking_home_notfree']

categories = ['metropolitan',
 'city_size',
 "europe_residence",
 "europe_nationality", "migrant", "metropolitan_encoded", "city_size_encoded", "american_nationality", "american_residence", "african_residence", "african_nationality", "asian_residence", "asian_nationality",
 "urban_density_sc", "urban_density"]

interventions_encoded = ['free','pedestrian','shared', "active",'car',"transit"]

content = ['parking_space',
 'pedestrian_space',
 'activities',
 'greenery',
 'colorful_pavement_urban_art',
 'benches',
 'car_lanes',
 'cyclelane',
 'ebusl_ps',
 'ebusl_no_ps',
 'transit_mall_no_cars',
 'vru_ps',
 'vru_no_ps',
 'vru_no_cars',
 'shared_space',
 'superblock',
 'people',
 'cars',
 'order_maintenance',
 'sunnier']


In [55]:
remaining_columns = list(set(df.columns) - set(parking)- set(ID_image)- set(ID_participant)- set(context_likert) - set(specialist)- set(socio_demo_encoded)- set(socio_demographics) - set(content) - set(categories)- set(interventions_encoded) -set(travel_behavior))

print(remaining_columns)

['dimension', 'role_other', 'intervention', 'intervention_type']


In [56]:
file_path = 'C:\\Users\\maryi\\Desktop\\Tese Data Analysis\\data_encoded.csv'

# Export the DataFrame to a .csv file
df[ID_image + ID_participant + interventions + context_likert + socio_demographics + socio_demo_encoded + specialist +  travel_behavior + parking + categories + interventions_encoded + content].to_csv(file_path, index=False)