In [18]:
import pandas as pd
import numpy as np
import re

df1 = pd.read_csv('Chinese Thermal Comfort Dataset Class I.csv.csv', skiprows=1)
df2 = pd.read_csv('Chinese Thermal Comfort Dataset Class II.csv.csv', skiprows=1)
df3 = pd.read_csv('Chinese Thermal Comfort Dataset Class III.csv.csv', skiprows=1)
df_concatenated = pd.concat([df1, df2, df3], ignore_index=True)

desired_columns = ['B1.Building Type' ,'B2.Building Function','B5.Room (Length×Width)','B5.Room Height (m)', 'C1.Sex','C2.Age','D1.TSV','D2.TCV','D5.Clothing Insulation (clo)', 'E1.Indoor Air Temperature (℃)','E2.Indoor Relative Humidity (%)', 'A4.Season', 'G1.Real-Time Outdoor Temperature (℃)', 'G5.Mean Daily Outdoor Relative Humidity (%)']
df_selected = df_concatenated[desired_columns]
df_selected.replace('NA ', np.nan, inplace=True)

new_column_names = {
    'A4.Season': 'Season',
    'A6.Climate zone': 'Climate zone',
    'B1.Building Type': 'Building Type',
    'B2.Building Function': 'Building Function',
    'B5.Room (Length×Width)': 'Room Length x Width',
    'B5.Room Height (m)': 'Room Height',
    'C1.Sex': 'Sex',
    'C2.Age': 'Age',
    'D1.TSV': 'Thermal Sensation Vote',
    'D2.TCV': 'Thermal Comfort Vote',
    'D5.Clothing Insulation (clo)': 'Clothing Insulation (clo)',
    'E1.Indoor Air Temperature (℃)': 'Indoor Temperature (℃)',
    'E2.Indoor Relative Humidity (%)': 'Indoor Humidity (%)',
    'G1.Real-Time Outdoor Temperature (℃)': 'Outdoor Temperature (℃)',
    'G5.Mean Daily Outdoor Relative Humidity (%)': 'Outdoor Humidity (%)',
}

default_values = {
    'B5.Room (Length×Width)': '0×0',
    'B5.Room Height (m)': '0',
    'C1.Sex': 'Unknown',
    'C2.Age': 0,
    'B1.Building Type': 'Other',
    'B2.Building Function': 'Other',
    'G1.Real-Time Outdoor Temperature (℃)': 20,
    'G5.Mean Daily Outdoor Relative Humidity (%)': 50,
    
}
df_filled = df_selected.fillna(default_values)
df_renamed = df_filled.rename(columns=new_column_names)
pattern = r'[×xX]' 
df_renamed[['Room Length', 'Room Width']] = df_renamed['Room Length x Width'].str.split(pattern, expand=True)

def clean_numeric(s):
    s = str(s)
    s = re.sub(r'[^\d.]', '', s) 
    parts = s.split('.')
    if len(parts) > 1:
        s = parts[0] + '.' + ''.join(parts[1:])
    else:
        s = parts[0]

    try:
        return float(s) if s != '' else 0.0
    except ValueError:
        return 0.0


df_renamed['Room Length'] = df_renamed['Room Length'].apply(clean_numeric)
df_renamed['Room Width'] = df_renamed['Room Width'].apply(clean_numeric)
df_renamed['Room Height'] = df_renamed['Room Height'].apply(clean_numeric)

df_renamed['Room Volume'] = df_renamed['Room Length'] * df_renamed['Room Width'] * df_renamed['Room Height']

bins = [0, 1, 300, 700, 5000]
labels = ['0', '1-300', '300-700', '700+']  
df_renamed['Room Volume Category'] = pd.cut(df_renamed['Room Volume'], bins=bins, labels=labels, right=False)

bins_insulation = [0, 1, 2, 3] 
labels_insulation = labels = ['light', 'medium', 'heavy']   
df_renamed['Clothing Insulation (clo)'] = pd.cut(df_renamed['Clothing Insulation (clo)'], bins=bins_insulation, labels=labels_insulation, right=True)

df_renamed.drop(['Room Length x Width', 'Room Length', 'Room Width', 'Room Height', 'Room Volume', 'Thermal Sensation Vote'], axis=1, inplace=True)

df_complete_fields = df_renamed.dropna()

df_complete_fields['Building Type'] = df_complete_fields['Building Type'].replace('Dormitory', 'Residential')  
df_complete_fields['Building Type'] = df_complete_fields['Building Type'].replace('Educationnal', 'Educational')  
df_complete_fields['Building Function'] = df_complete_fields['Building Function'].replace('Study', 'Classroom')  
df_complete_fields['Building Function'] = df_complete_fields['Building Function'].replace('Laboratory', 'Classroom')  
df_complete_fields['Building Function'] = df_complete_fields['Building Function'].replace('Dormitory', 'Bedroom')  
df_complete_fields['Building Function'] = df_complete_fields['Building Function'].replace('Dormitory', 'Bedroom')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('30-40', '31-40')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('21-23', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('24-26', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('18-31', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('18-32', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('18-33', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('19-28', '18-30')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('29-55', '31-40')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('≥61', '>60')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('> 60', '>60')  
df_complete_fields['Age'] = df_complete_fields['Age'].replace('＞60', '>60')  

def categorize_age(age):
    age = str(age).strip() 
    if age.isdigit():  
        min_age = int(age)
    else:
        return age 

    if min_age < 1:
        return '0'
    elif min_age < 18:
        return '9-17'  
    elif min_age <= 30:
        return '18-30'
    elif min_age <= 40:
        return '31-40'
    elif min_age <= 50:
        return '41-50'
    elif min_age <= 60:
        return '51-60'
    elif min_age >60:
        return '>60'
    else:
        return 0

df_complete_fields['Age'] = df_complete_fields['Age'].apply(categorize_age)

df_complete_fields = df_complete_fields.dropna()


df_complete_fields.to_csv('all_fields_populated.csv', index=True, index_label="Id")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.replace('NA ', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_complete_fields['Building Type'] = df_complete_fields['Building Type'].replace('Dormitory', 'Residential')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_complete_fields['Building Type'] = df_complete_fields['Building Type'].replace('Educationnal', 'Educ