In [1]:
using Distributions, DataFrames, Random, CSV

┌ Info: Precompiling CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b]
└ @ Base loading.jl:1260


## Normal Distribution

In [2]:
Random.seed!(1234)

MersenneTwister(UInt32[0x000004d2], Random.DSFMT.DSFMT_state(Int32[-1393240018, 1073611148, 45497681, 1072875908, 436273599, 1073674613, -2043716458, 1073445557, -254908435, 1072827086  …  -599655111, 1073144102, 367655457, 1072985259, -1278750689, 1018350124, -597141475, 249849711, 382, 0]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], UInt128[0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000  …  0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x000000000000

In [3]:
n = Distributions.Normal()
params(n)

(0.0, 1.0)

Using the params() function, we note a mean on $0$ and a standard deviation of $1$, also called the standard normal distribution.

The fieldnames() function provides the actual parameters of the given distribution. In the case of the normal distribution, it will be the average and the standard deviation, namely $\mu$ and $\sigma$.

In [4]:
# Returning the parameters of the normal distribution
fieldnames(Normal)

(:μ, :σ)

In [5]:
var1 = rand(n, 10)

10-element Array{Float64,1}:
  0.8673472019512456
 -0.9017438158568171
 -0.4944787535042339
 -0.9029142938652416
  0.8644013132535154
  2.2118774995743475
  0.5328132821695382
 -0.27173539603462066
  0.5023344963886675
 -0.5169836206932686

In [6]:
mean(var1), std(var1)

(0.18909179133831322, 0.9879593623730926)

In [7]:
#Probability density function value at x = 0.3
pdf(Normal(), 0.3)

0.38138781546052414

In [8]:
#Cumulative distribution function as x = 0.25
cdf(Normal(), 0.25)

0.5987063256829237

In [9]:
var2 = rand(Normal(100, 10), 100);

In [10]:
# Using fit() to calculate the parameters of a distribution
fit(Normal, var2)

Normal{Float64}(μ=98.50583989904842, σ=9.591211638396837)

In [11]:
# Quantiles
quantile(Normal(), 0.025)

-1.9599639845400592

In [12]:
quantile(Normal(), 0.975)

1.9599639845400576

### Other Distribution types

In [13]:
# Beta distribution
b = Beta(1, 1)
params(b)
var3 = rand(b, 100);
fit(Beta, var3)

Beta{Float64}(α=1.0960317409697764, β=1.0578819792921308)

In [14]:
# χ2 distribution
c = Chisq(1)
var4 = rand(c, 100)
fieldnames(Chisq) # Degrees of freedom

(:ν,)

# DataFrames

In [15]:
#Create and empty DataFrame
df = DataFrame();

In [16]:
# Add a column with data point values (rows)
df[:Var2] = var2;

│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[16]:1
└ @ Core In[16]:1


In [17]:
#View first five rows
first(df, 5)

Unnamed: 0_level_0,Var2
Unnamed: 0_level_1,Float64
1,94.395
2,99.8071
3,101.281
4,118.528
5,91.7224


In [18]:
# Add another column
df[:Var3] = var3;

│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[18]:1
└ @ Core In[18]:1


In [19]:
# View last three rows
last(df, 3)

Unnamed: 0_level_0,Var2,Var3
Unnamed: 0_level_1,Float64,Float64
1,113.067,0.34393
2,114.489,0.988254
3,107.782,0.158908


In [20]:
# Dimensions of a DataFrame
size(df)

(100, 2)

In [21]:
# Summarize the content
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Nothing,Nothing,DataType
1,Var2,98.5058,67.8864,99.0269,124.175,,,Float64
2,Var3,0.508856,0.0199572,0.502559,0.988254,,,Float64


In [22]:
# Data type only
eltypes(df)

│   caller = top-level scope at In[22]:1
└ @ Core In[22]:1


2-element Array{DataType,1}:
 Float64
 Float64

In [23]:
df2 = DataFrame()
df2[:A] = 1:10
df2[:B] = ["I", "II", "II", "I", "II","I", "II", "II", "I", "II"]
Random.seed!(1234)
df2[:C] = rand(Normal(), 10)
df2[:D] = rand(Chisq(1), 10);

│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[23]:2
└ @ Core In[23]:2
│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[23]:3
└ @ Core In[23]:3
│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[23]:5
└ @ Core In[23]:5
│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = top-level scope at In[23]:6
└ @ Core In[23]:6


In [24]:
# First three rows with all the colums
df2[1:3, :]

Unnamed: 0_level_0,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Float64
1,1,I,0.867347,0.859325
2,2,II,-0.901744,0.304983
3,3,II,-0.494479,1.17771


In [25]:
# All rows columns 1 and 3
df2[:, [1, 3]]

Unnamed: 0_level_0,A,C
Unnamed: 0_level_1,Int64,Float64
1,1,0.867347
2,2,-0.901744
3,3,-0.494479
4,4,-0.902914
5,5,0.864401
6,6,2.21188
7,7,0.532813
8,8,-0.271735
9,9,0.502334
10,10,-0.516984


In [26]:
# Different notation
df2[:, [:A, :C]]

Unnamed: 0_level_0,A,C
Unnamed: 0_level_1,Int64,Float64
1,1,0.867347
2,2,-0.901744
3,3,-0.494479
4,4,-0.902914
5,5,0.864401
6,6,2.21188
7,7,0.532813
8,8,-0.271735
9,9,0.502334
10,10,-0.516984


In [27]:
data1 = CSV.read("CCS.csv");
typeof(data1)

DataFrame

In [28]:
first(data1, 5)

Unnamed: 0_level_0,PatientID,Cat1,Cat2,Var1,Var2,Var3
Unnamed: 0_level_1,Int64,String,String,Float64,Float64,Float64
1,1,A,C,38.2568,5.93913,35.0579
2,2,A,C,17.8317,5.34754,21.131
3,8,A,B,16.0218,6.60709,60.9436
4,9,A,C,45.1158,6.00733,21.8797
5,16,A,C,20.448,8.54819,20.6623


In [29]:
describe(data1)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Nothing,DataType
1,PatientID,60.5,1,60.5,120,,,Int64
2,Cat1,,A,,B,2.0,,String
3,Cat2,,B,,X,6.0,,String
4,Var1,27.9679,15.2356,22.6801,84.2378,,,Float64
5,Var2,5.92121,3.01173,5.64241,15.5826,,,Float64
6,Var3,51.95,20.3153,44.3042,147.397,,,Float64


### Joining

In [30]:
# Creating DataFrames
subjects = DataFrame(Number = [100, 101, 102, 103], Stage = ["I", "III", "II", "I"])
treatment  = DataFrame(Number = [103, 102, 101, 100], Treatment = ["A", "B", "A", "B"]);

In [31]:
# Joining
df3 = join(subjects, treatment, on = :Number);
df3

Unnamed: 0_level_0,Number,Stage,Treatment
Unnamed: 0_level_1,Int64,String,String
1,100,I,B
2,101,III,A
3,102,II,B
4,103,I,A


In [32]:
# Adding a longer list of subjects
subjects = DataFrame(Number = [100, 101, 102, 103, 104, 105], Stage = ["I", "III", "II", "I", "II", "II"]);

In [33]:
# Inner join
df4 = join(subjects, treatment, on = :Number, kind = :inner);
df4

Unnamed: 0_level_0,Number,Stage,Treatment
Unnamed: 0_level_1,Int64,String,String
1,100,I,B
2,101,III,A
3,102,II,B
4,103,I,A


In [34]:
# Outer joing: empty fields filled with missing
df5  = join(subjects, treatment, on = :Number, kind = :outer);
df5

Unnamed: 0_level_0,Number,Stage,Treatment
Unnamed: 0_level_1,Int64⍰,String⍰,String⍰
1,100,I,B
2,101,III,A
3,102,II,B
4,103,I,A
5,104,II,missing
6,105,II,missing


### Grouping

In [35]:
# Creating a new DataFrame
df6 = DataFrame(Group = rand(["A", "B", "C"], 15), Variable1 = randn(15), Variable2 = rand(15));
first(df6, 3)

Unnamed: 0_level_0,Group,Variable1,Variable2
Unnamed: 0_level_1,String,Float64,Float64
1,B,1.48217,0.869237
2,B,-0.522772,0.0396356
3,B,-1.5807,0.79041


In [36]:
# Grouping using by()
by(df6, :Group, size)

Unnamed: 0_level_0,Group,x1
Unnamed: 0_level_1,String,Tuple…
1,B,"(11, 3)"
2,A,"(2, 3)"
3,C,"(2, 3)"


In [37]:
# Count unique data point values in :Group column
by(df6, :Group, dfc -> DataFrame(Count = size(dfc, 1)))

Unnamed: 0_level_0,Group,Count
Unnamed: 0_level_1,String,Int64
1,B,11
2,A,2
3,C,2


In [38]:
# Aggregate for descriptive statistics
print(aggregate(df6, :Group, [mean, std]))

3×5 DataFrame
│ Row │ Group  │ Variable1_mean │ Variable2_mean │ Variable1_std │ Variable2_std │
│     │ [90mString[39m │ [90mFloat64[39m        │ [90mFloat64[39m        │ [90mFloat64[39m       │ [90mFloat64[39m       │
├─────┼────────┼────────────────┼────────────────┼───────────────┼───────────────┤
│ 1   │ B      │ -0.0518152     │ 0.41795        │ 1.20435       │ 0.274403      │
│ 2   │ A      │ -0.191824      │ 0.544708       │ 0.789975      │ 0.409632      │
│ 3   │ C      │ 0.152627       │ 0.371762       │ 0.0256698     │ 0.370051      │

In [39]:
# Group
groupby(df6, :Group)

Unnamed: 0_level_0,Group,Variable1,Variable2
Unnamed: 0_level_1,String,Float64,Float64
1,B,1.48217,0.869237
2,B,-0.522772,0.0396356
3,B,-1.5807,0.79041
4,B,0.131842,0.431188
5,B,0.447358,0.137658
6,B,-0.396211,0.60808
7,B,0.621673,0.498734
8,B,0.182588,0.0940369
9,B,2.06353,0.52509
10,B,-1.41453,0.265511

Unnamed: 0_level_0,Group,Variable1,Variable2
Unnamed: 0_level_1,String,Float64,Float64
1,C,0.134475,0.110096
2,C,0.170778,0.633427


In [40]:
length(groupby(df6, :Group))

3

### Sorting

In [41]:
df6S = sort!(df6, [:Group, :Variable1]);
first(df6S, 7)

Unnamed: 0_level_0,Group,Variable1,Variable2
Unnamed: 0_level_1,String,Float64,Float64
1,A,-0.750421,0.834362
2,A,0.366773,0.255054
3,B,-1.58492,0.337865
4,B,-1.5807,0.79041
5,B,-1.41453,0.265511
6,B,-0.522772,0.0396356
7,B,-0.396211,0.60808


In [42]:
# Creating a DataFrame with an obvious duplicate row
df7 = DataFrame(A = [1, 2, 2, 3, 4, 5],  B = [11, 12, 12, 13, 14, 15], C = ["A", "B", "B", "C", "D", "E"]);
df7

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Int64,String
1,1,11,A
2,2,12,B
3,2,12,B
4,3,13,C
5,4,14,D
6,5,15,E


In [43]:
# Only unique rows
unique(df7)

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Int64,String
1,1,11,A
2,2,12,B
3,3,13,C
4,4,14,D
5,5,15,E


In [44]:
df7

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Int64,String
1,1,11,A
2,2,12,B
3,2,12,B
4,3,13,C
5,4,14,D
6,5,15,E


In [45]:
# Permanant change
unique!(df7)
df7

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Int64,String
1,1,11,A
2,2,12,B
3,3,13,C
4,4,14,D
5,5,15,E


### Deleting Rows

In [46]:
# Permanently
deleterows!(df7, [1, 5])
df7

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Int64,String
1,2,12,B
2,3,13,C
3,4,14,D
