In [None]:
# From the Coursera Julia course by the University of Cape Town
using Pkg
# Optional PyCall rebuild, left disabled:
# ENV["PYTHON"]="/home/maruthinh/softwares/julia-1.10.0-alpha1/bin/julia"
# Pkg.build("PyCall")

# Install every package this notebook loads, one at a time and in this order.
for pkg in (
    "PyPlot",
    "Distributions",    # create random variables
    "StatsBase",        # basic statistical support
    "CSV",              # reading and writing CSV files
    "DataFrames",       # tabular data structure
    "HypothesisTests",  # statistical tests
    "StatsPlots",       # plot statistical data
    "GLM",              # general linear models
)
    Pkg.add(pkg)
end

In [None]:
using CSV
using DataFrames
using Distributions
using GLM
using HypothesisTests
using PyPlot
using StatsBase
using StatsPlots
# pyplot()

#### Creating random variables

In [None]:
# Simulated patient data: every variable holds 100 random draws.
age = rand(18:80, 100); # ages sampled uniformly from the integers 18:80
wcc = round.(rand(Distributions.Normal(12, 2), 100), digits=1); # white cell count: Normal(12, 2) draws rounded to one decimal place
crp = round.(Int, rand(Distributions.Chisq(4), 100)) .* 10; # C-reactive protein: Chisq(4) draws rounded to Int (broadcast), then multiplied by 10
treatment = rand(["A", "B"], 100); # treatment label, each option equally likely
result = rand(["Improved", "Static", "Worse"], 100); # outcome label, each option equally likely; comma-separated Vector literal, consistent with `treatment`

#### Descriptive statistics

In [None]:
# Mean of the simulated `age` vector
mean(age)

In [None]:
# Median of the simulated `age` vector
median(age)

In [None]:
# Standard deviation of `age`
std(age)

In [None]:
# Variance of `age`
var(age)

In [None]:
# Descriptive statistics of the `age` variable, using the StatsBase-qualified describe
StatsBase.describe(age)

In [None]:
# Summary statistics of the `wcc` variable (similar to describe)
StatsBase.summarystats(wcc)

#### Creating a dataframe
Create a DataFrame named `data` to make the variables easier to manipulate.

In [None]:
# Bundle the five simulated vectors into one table, one named column per variable.
data = DataFrame(
    :Age => age,
    :WCC => wcc,
    :CRP => crp,
    :Treatment => treatment,
    :Result => result,
)

In [None]:
# Dimensions of the table as a (rows, columns) tuple
(nrow(data), ncol(data))

In [None]:
# First row of the table, selected by row index with all columns
data[1, :]

In [None]:
# Split the table by treatment group: each result is a new DataFrame that keeps
# only the rows whose Treatment value matches.
dataA = filter(:Treatment => ==("A"), data)
dataB = filter(:Treatment => ==("B"), data)

##### Descriptive statistics using the DataFrame

In [None]:
# Column-by-column summary of the whole table
DataFrames.describe(data)

In [None]:
# groupby() from the DataFrames package splits a table into groups of rows that
# share the same value(s) in one or more columns. Here grouped_df is a
# GroupedDataFrame with one group per distinct value of the Age column.
grouped_df = groupby(data, :Age)
# Print each group in turn.
foreach(println, grouped_df)

In [None]:
# Groups support summary operations (e.g. mean, sum) or custom functions through
# combine() or map(). Here: the mean WCC per Age group; `renamecols=false` keeps
# the result column named WCC.
mean_scores = combine(grouped_df, :WCC => mean; renamecols=false)

### Visualizing the data
Plotting with the StatsPlots package and its `@df` DataFrame macro.

In [None]:
# Density of age, one curve per treatment group. The keyword must be spelled
# `legend` for the legend position to take effect.
@df data density(:Age, group=:Treatment, title="Distribution of ages by treatment group", xlab="Age", ylab="Distribution", legend=:topright)

In [None]:
# The same density plot, split by result group instead. The keyword must be
# spelled `legend` for the legend position to take effect.
@df data density(:Age, group=:Result, title="Distribution of ages by result group", xlab="Age", ylab="Distribution", legend=:topright)

In [None]:
# Discriminate between all groups: one curve per (Treatment, Result) combination.
# The keyword must be spelled `legend` for the legend position to take effect.
@df data density(:Age, group=(:Treatment, :Result), title="Distribution of ages by treatment and result group", xlab="Age", ylab="Distribution", legend=:topright)


In [None]:
# Box-and-whisker plot of the white cell count, one box per treatment group
# (the per-result-group version is in the next cell).
@df data StatsPlots.boxplot(
    :Treatment,
    :WCC,
    label="WCC",
    title="white cell count by treatment group",
    xlabel="Groups",
    ylabel="WCC",
)

In [None]:
# Box-and-whisker plot of the white cell count, one box per result group.
@df data StatsPlots.boxplot(
    :Result,
    :WCC,
    label="Result",
    title="white cell count by result group",
    xlabel="Groups",
    ylabel="WCC",
)

In [None]:
# Correlation between the numerical variables, shown as a correlation plot
# (a corner plot follows in the next cell).
# The column names are written space-separated, without commas: [:Age :WCC :CRP].
@df data corrplot([:Age :WCC :CRP], grid=false) #no comma between arguments 

In [None]:
# Corner plot of the same three numerical columns, with compact=true.
# The column names are written space-separated, without commas: [:Age :WCC :CRP].
@df data cornerplot([:Age :WCC :CRP], grid=false, compact=true) #no comma between arguments 

### Inferential statistics

In [None]:
# Start with Student's t test (equal variances assumed):
# is there a difference in age between the patients in groups A and B?
EqualVarianceTTest(dataA[!, :Age], dataB[!, :Age])

In [None]:
# p-value for the difference in white cell count between patients in groups A and B.
# The two samples are group A's WCC and group B's WCC.
pvalue(EqualVarianceTTest(dataA.WCC, dataB.WCC))

In [None]:
# Difference in C-reactive protein level between groups A and B, using the
# t test variant for unequal variances.
UnequalVarianceTTest(dataA[!, :CRP], dataB[!, :CRP])

### Creating linear models using GLM

In [None]:
# Simplest model for CRP: intercept only (no predictors).
lm(@formula(CRP ~ 1), data)

In [None]:
# Add Age as a predictor of CRP.
lm(@formula(CRP ~ Age), data)

In [None]:
# Use both Age and WCC as predictors of CRP.
lm(@formula(CRP ~ Age + WCC), data)

We can conduct a $\chi^2$ test for independence using the HypothesisTests.ChisqTest() function. First we need the counts: below we look at how the values of the Result variable are distributed among the patients in groups A and B.

In [None]:
# Build the 2×3 table of observed counts directly from `data`, so the test
# reflects the randomly generated sample rather than fixed numbers.
# Rows are the treatment groups (A, B); columns are the result categories
# (Improved, Static, Worse).
observed = [
    count((data.Treatment .== t) .& (data.Result .== r)) for
    t in ["A", "B"], r in ["Improved", "Static", "Worse"]
]
# Chi-squared test for independence between treatment and result.
# NOTE(review): a category with no observations in either group would leave a
# zero column in the table — check the counts before trusting the result.
ChisqTest(observed)

### Export to CSV

In [None]:
# Write the full table to data.csv, relative to the current working directory.
data |> CSV.write("data.csv")