## Data Preprocessing

### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from data_loader import DataLoader, DataLoaderSelector
from augment_dataset import add_synthetic_conclusions_straightforward

### Load and Clean Dataset

In [2]:
# Build the loader selector for the Th-scale dataset stored in data_th_scale.csv.
# NOTE(review): the section title implies that loading and cleaning both happen
# inside DataLoaderSelector — confirm in data_loader.py.
mwr = DataLoaderSelector(data_path='data_th_scale.csv')

### Access the Dataframe

In [3]:
# Take the DataFrame held by the selected loader; the cells below work on `mwr_df`.
mwr_df = mwr.data_loader.data

In [4]:
# Relative frequency of each `r:Th` class, printed to gauge class balance.
th_class_share = mwr_df['r:Th'].value_counts(normalize=True)
print(th_class_share)

r:Th
2    0.294479
1    0.289074
0    0.241170
4    0.080335
5    0.064945
3    0.029997
Name: proportion, dtype: float64


### Generate Synthetic Descriptions

#### Straightforward Conclusions

In [21]:
# Th (column `r:Th`): thermal asymmetry score
# 0: no changes
# 1: slightly elevated temperature
# 2: moderately elevated temperature (surface)
# 3: moderately elevated temperature (surface and depth)
# 4: increased temperature (surface and depth), partial asymmetry
# 5: increased temperature (surface and depth), clear and more pronounced asymmetry

In [22]:
# Add the straightforward synthetic conclusions: the returned frame carries a
# `Synthetic_Conclusion` text column (visible in the preview that follows).
# NOTE(review): whether `mwr_df` itself is modified in place is not visible
# from this notebook — check augment_dataset.py.
mwr_df_simple = add_synthetic_conclusions_straightforward(mwr_df)


In [23]:
# Preview the first rows to check the new `Synthetic_Conclusion` column.
mwr_df_simple.head()


Unnamed: 0,Examination ID,Conclusion,r:Th,Weight,Height,Ambient temperature,r:AgeInYears,Mammary diameter,Cycle,Day from the first day,...,R8 sk,L8 sk,R9 sk,L9 sk,T1 sk,T2 sk,R0 sk,L0 sk,Conclusion (Tr),Synthetic_Conclusion
0,00002A00010B1,temperatura v v predelu mastektomije je nekoli...,0,0,0,22,43,17,28,16,...,32.7,31.4,33.8,32.6,33.1,32.9,33.3,31.6,The temperature in the area of the mastectomy ...,No thermal changes detected.
1,00002A00010C1,v desni dojki je temperatura lepo razporejena ...,0,0,0,22,45,20,30,9,...,32.8,31.5,33.2,32.6,32.5,33.2,33.3,31.4,"In the right breast, the temperature is well d...",No thermal changes detected.
2,00002A00023C1,"1/ razporeditev temperature je asimetrična, ve...",1,0,0,21,53,25,0,-1,...,30.9,30.0,30.5,30.6,31.6,31.8,30.9,31.6,"1/ The temperature distribution is asymmetric,...",Slightly elevated temperature.
3,00002A00037C1,temperatura v obeh dojkah je pravilno porazdel...,0,0,0,25,54,24,0,-1,...,32.0,31.2,32.3,33.0,32.7,32.6,33.4,33.5,The temperature in both breasts is evenly dist...,No thermal changes detected.
4,00002A00037D1,"dojki sta srednje veliki, mehki, rahlo vozliča...",0,0,0,23,55,25,0,-1,...,31.4,31.7,33.3,32.5,33.6,33.4,33.7,33.1,"The breasts are medium-sized, soft, slightly n...",No thermal changes detected.


### Binary Encoding

In [24]:
# Collapse the 0-5 `r:Th` score into a binary label:
#   0 -> no thermal changes (r:Th == 0)
#   1 -> any thermal change  (r:Th != 0)
# A vectorized comparison replaces the per-row lambda; `.ne(0)` yields booleans
# that are cast to int64, giving the same 0/1 values the lambda produced.
mwr_df_simple['y_binary'] = mwr_df_simple['r:Th'].ne(0).astype('int64')
# Keep a handle on the label column on its own.
y_class = mwr_df_simple['y_binary']

In [25]:
# Preview the first 20 rows to spot-check `y_binary` against `r:Th`.
mwr_df_simple.head(20)

Unnamed: 0,Examination ID,Conclusion,r:Th,Weight,Height,Ambient temperature,r:AgeInYears,Mammary diameter,Cycle,Day from the first day,...,L8 sk,R9 sk,L9 sk,T1 sk,T2 sk,R0 sk,L0 sk,Conclusion (Tr),Synthetic_Conclusion,y_binary
0,00002A00010B1,temperatura v v predelu mastektomije je nekoli...,0,0,0,22,43,17,28,16,...,31.4,33.8,32.6,33.1,32.9,33.3,31.6,The temperature in the area of the mastectomy ...,No thermal changes detected.,0
1,00002A00010C1,v desni dojki je temperatura lepo razporejena ...,0,0,0,22,45,20,30,9,...,31.5,33.2,32.6,32.5,33.2,33.3,31.4,"In the right breast, the temperature is well d...",No thermal changes detected.,0
2,00002A00023C1,"1/ razporeditev temperature je asimetrična, ve...",1,0,0,21,53,25,0,-1,...,30.0,30.5,30.6,31.6,31.8,30.9,31.6,"1/ The temperature distribution is asymmetric,...",Slightly elevated temperature.,1
3,00002A00037C1,temperatura v obeh dojkah je pravilno porazdel...,0,0,0,25,54,24,0,-1,...,31.2,32.3,33.0,32.7,32.6,33.4,33.5,The temperature in both breasts is evenly dist...,No thermal changes detected.,0
4,00002A00037D1,"dojki sta srednje veliki, mehki, rahlo vozliča...",0,0,0,23,55,25,0,-1,...,31.7,33.3,32.5,33.6,33.4,33.7,33.1,"The breasts are medium-sized, soft, slightly n...",No thermal changes detected.,0
5,00002A00037E1,1/ izmerjena temperatura v obeh dojkah je sime...,1,0,0,21,56,25,0,-1,...,29.8,30.4,30.8,31.3,31.5,31.9,31.8,1/ The measured temperature in both breasts is...,Slightly elevated temperature.,1
6,00002A00068C1,tenperatura v predelu dojk je v mejah fiziološ...,0,0,0,23,43,29,25,5,...,31.7,32.4,31.8,32.8,32.4,32.7,32.7,The temperature in the breast area is within t...,No thermal changes detected.,0
7,00002A00078B1,temperatura v obeh dojkah je pravilno in simet...,0,0,0,24,33,25,32,32,...,32.4,32.9,33.1,32.4,32.5,33.2,33.8,The temperature in the breast area is correctl...,No thermal changes detected.,0
8,00002A00110B1,"1/ razporeditev je asimetrična, vendar znakov ...",1,0,0,21,37,22,30,6,...,33.2,32.4,32.9,33.9,33.8,33.8,34.1,"1/ The distribution is asymmetrical, but there...",Slightly elevated temperature.,1
9,00002A00112B1,dojki sta simetrični.klinično sta srednje veli...,0,0,0,23,53,22,28,23,...,30.5,30.6,31.3,31.1,30.9,30.9,30.8,"The breasts are symmetrical. Clinically, they ...",No thermal changes detected.,0


### Save New Dataframe

In [26]:
# Write the augmented frame (including `Synthetic_Conclusion` and `y_binary`)
# to mwr_simple.csv, a path relative to the working directory; index=False
# leaves the row index out of the file.
mwr_df_simple.to_csv('mwr_simple.csv', index=False)