<hr>
# Setting Up

In [1]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Datasets attached to this kernel are mounted under the "../input/" directory.
# The call below prints the names of everything currently available there.

list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

✔ ggplot2 3.2.1.9000     ✔ purrr   0.3.3     
✔ tibble  2.1.3          ✔ dplyr   0.8.3     
✔ tidyr   1.0.0          ✔ stringr 1.4.0     
✔ readr   1.3.1          ✔ forcats 0.4.0     

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



In [2]:
library(pacman)
pacman::p_load(pacman, rio)

<hr>
# Importing CSV Files

In [3]:
# Read the raw patient records and the expert discretization criteria.
df <- import("../input/hepatitis-c-virus-hcv-for-egyptian-patients/HCV-Egy-Data.csv")
df_disc <- import("../input/hepatitis-c-virus-hcv-for-egyptian-patients/Discretization-Criteria.csv")

# Keep an untouched copy of the raw records under a separate name.
df_or <- df

# Preview the first raw records, then display the criteria table.
head(df_or)
df_disc

Unnamed: 0_level_0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,⋯,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baselinehistological staging
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,56,1,35,2,1,1,1,2,2,2,⋯,5,5,5,655330,634536,288194,5,5,13,2
2,46,1,29,1,2,2,1,2,2,1,⋯,57,123,44,40620,538635,637056,336804,31085,4,2
3,57,1,33,2,2,2,2,1,1,1,⋯,5,5,5,571148,661346,5,735945,558829,4,4
4,49,2,33,1,2,1,2,1,2,1,⋯,48,77,33,1041941,449939,585688,744463,582301,10,3
5,59,1,32,1,1,2,1,2,2,2,⋯,94,90,30,660410,738756,3731527,338946,242861,11,1
6,58,2,22,2,2,2,1,2,2,1,⋯,73,114,29,1157452,1086852,5,5,5,4,4


Unnamed: 0_level_0,Feature Names,Feature Values,Discretization (Items)
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,Age,32:61,"[0; 32], ]32; 37], ]37; 42],]42; 47], ]47; 52], ]52; 57],]57; 62]"
2,Gender,"Male,Female","[Male], [Female]"
3,BMI(Body Mass Index),22:35,"[0; 18:5[ [18:5; 25[, [25; 30[, [30; 35[, [35; 40["
4,Fever,"Absent, Present","[Absent], [Present] -"
5,Nausea/Vomiting,"Absent, Present","[Absent], [Present] -"
6,Headache,"Absent, Present","[Absent], [Present] -"
7,Diarrhea,"Absent, Present","[Absent], [Present] -"
8,Fatigue,"Absent, Present","[Absent], [Present] -"
9,Bone ache,"Absent, Present","[Absent], [Present] -"
10,Jaundice,"Absent, Present","[Absent], [Present] -"


<hr>
# Data Cleaning

Discretise the data according to the expert-guided **discretization criteria** dataset.

In [4]:
# Discretise the columns of `df` in place: every numeric measurement is
# replaced by a factor of intervals and every coded variable by a labelled
# factor. Several steps address columns by position, so this relies on `df`
# still having the column order it had right after import.

# Age uses right-closed bins; BMI uses left-closed bins.
df$Age <- cut(df$Age, breaks = c(0, 32, 37, 42, 47, 52, 57, 62),
              include.lowest = TRUE)
# Explicit levels pin the 1 -> "Male", 2 -> "Female" mapping even if one of
# the two codes happens to be absent from the data.
df$Gender <- factor(df$Gender, levels = c(1, 2), labels = c("Male", "Female"))
df$BMI <- cut(df$BMI, breaks = c(0, 18.5, 25, 30, 35, 40),
              include.lowest = TRUE, right = FALSE)

# Columns 4 to 10 are coded 1/2 and become Absent/Present factors.
symptom_cols <- 4:10
df[symptom_cols] <- lapply(
    df[symptom_cols], factor,
    levels = c(1, 2), labels = c("Absent", "Present")
)

df$WBC <- cut(df$WBC, breaks = c(0, 4000, 11000, 12101),
              include.lowest = TRUE, right = FALSE)
df$RBC <- cut(df$RBC, breaks = c(0, 3000000, 5000000, 5018451),
              include.lowest = TRUE, right = FALSE)

# Haemoglobin bins depend on gender, so a single cut() call cannot be used.
# The labels are assigned on whole vectors, which works for any number of
# rows. Values outside [2, 20] (or rows with a missing gender) stay NA, the
# same way cut() treats out-of-range input.
hgb_raw <- df$HGB
is_male <- df$Gender == "Male"
hgb_v <- rep(NA_character_, nrow(df))
# which() drops NA comparisons so they never reach the assignment.
hgb_v[which(is_male & hgb_raw >= 2 & hgb_raw < 14)] <- "[2,14)"
hgb_v[which(is_male & hgb_raw >= 14 & hgb_raw <= 17.5)] <- "[14,17.5]"
hgb_v[which(is_male & hgb_raw > 17.5 & hgb_raw <= 20)] <- "(17.5,20]"
hgb_v[which(!is_male & hgb_raw >= 2 & hgb_raw < 12.3)] <- "[2,12.3)"
hgb_v[which(!is_male & hgb_raw >= 12.3 & hgb_raw <= 15.3)] <- "[12.3, 15.3]"
hgb_v[which(!is_male & hgb_raw > 15.3 & hgb_raw <= 20)] <- "(15.3, 20]"
df$HGB <- factor(hgb_v)

# Breaks must be increasing: cut() sorts them silently, so the former
# c(..., 255000, 226465) ended the middle bin at 226465 instead of 255000.
# NOTE(review): 226465 looks like the largest observed platelet count rather
# than a clinical cut-off; the top bin is left open-ended - confirm against
# the discretization criteria.
df$Plat <- cut(df$Plat, breaks = c(93013, 100000, 255000, Inf),
               include.lowest = TRUE, right = FALSE)

# Columns 15 to 22 all share the same three right-closed bins.
# NOTE(review): presumably the liver-enzyme (AST/ALT) measurements - confirm.
enzyme_cols <- 15:22
df[enzyme_cols] <- lapply(
    df[enzyme_cols], cut,
    breaks = c(0, 20, 40, 128), include.lowest = TRUE, right = TRUE
)

# Each RNA column is split into [0, 5] and (5, upper]; only the upper bound
# differs between columns.
rna_upper <- c(
    "RNA Base" = 1201086,
    "RNA 4" = 1201715,
    "RNA 12" = 3731527,
    "RNA EOT" = 808450,
    "RNA EF" = 808450
)
for (rna_col in names(rna_upper)) {
    df[[rna_col]] <- cut(df[[rna_col]], breaks = c(0, 5, rna_upper[[rna_col]]),
                         include.lowest = TRUE, right = TRUE)
}

df[["Baseline histological Grading"]] <-
    as.factor(df[["Baseline histological Grading"]])
df[["Baselinehistological staging"]] <- factor(
    df[["Baselinehistological staging"]],
    levels = c(0, 1, 2, 3, 4),
    labels = c("[No Fibrosis]", "[Portal Fibrosis]", "[Few Septa]",
               "[Many Septa]", "[Cirrhosis]")
)
head(df)

Unnamed: 0_level_0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,⋯,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baselinehistological staging
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,"(52,57]",Male,"[35,40]",Present,Absent,Absent,Absent,Present,Present,Present,⋯,"[0,20]","[0,20]","[0,20]","(5,1.2e+06]","(5,1.2e+06]","(5,3.73e+06]","[0,5]","[0,5]",13,[Few Septa]
2,"(42,47]",Male,"[25,30)",Absent,Present,Present,Absent,Present,Present,Absent,⋯,"(40,128]","(40,128]","(40,128]","(5,1.2e+06]","(5,1.2e+06]","(5,3.73e+06]","(5,8.08e+05]","(5,8.08e+05]",4,[Few Septa]
3,"(52,57]",Male,"[30,35)",Present,Present,Present,Present,Absent,Absent,Absent,⋯,"[0,20]","[0,20]","[0,20]","(5,1.2e+06]","(5,1.2e+06]","[0,5]","(5,8.08e+05]","(5,8.08e+05]",4,[Cirrhosis]
4,"(47,52]",Female,"[30,35)",Absent,Present,Absent,Present,Absent,Present,Absent,⋯,"(40,128]","(40,128]","(20,40]","(5,1.2e+06]","(5,1.2e+06]","(5,3.73e+06]","(5,8.08e+05]","(5,8.08e+05]",10,[Many Septa]
5,"(57,62]",Male,"[30,35)",Absent,Absent,Present,Absent,Present,Present,Present,⋯,"(40,128]","(40,128]","(20,40]","(5,1.2e+06]","(5,1.2e+06]","(5,3.73e+06]","(5,8.08e+05]","(5,8.08e+05]",11,[Portal Fibrosis]
6,"(57,62]",Female,"[18.5,25)",Present,Present,Present,Absent,Present,Present,Absent,⋯,"(40,128]","(40,128]","(20,40]","(5,1.2e+06]","(5,1.2e+06]","[0,5]","[0,5]","[0,5]",4,[Cirrhosis]


In [5]:
# Write the discretised table to a CSV file in the working directory.
rio::export(df, file = "HCV-Discretised.csv")