# Working with Data

In [1]:
# load required modules
using DataFrames
using Distributions
using Random
using StatsBase

In [2]:
# seed the random number generator with a fixed value so that the random
# draws in the following cells are reproducible from run to run
Random.seed!(42);

## Dataframes

In [3]:
# build a 100-row sample dataframe, one keyword argument per column:
#   x1 - numeric draws from Normal(10, 1)
#   x2 - a label drawn from Good/Better/Best with weights 0.2/0.5/0.3
#   x3 - numeric draws from Pareto(10, 1)
df = DataFrame(;
    x1=rand(Normal(10, 1), 100),
    x2=[sample(["Good", "Better", "Best"], pweights([0.2, 0.5, 0.3])) for _ in 1:100],
    x3=rand(Pareto(10, 1), 100),
)

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,String,Float64
1,9.44397,Best,1.13941
2,9.55562,Good,1.15612
3,10.0272,Better,1.14102
4,9.70052,Better,1.13922
5,11.7779,Good,1.2221
6,8.8551,Better,1.04502
7,9.53139,Best,1.15222
8,10.1561,Good,1.03787
9,7.35801,Better,1.098
10,11.0033,Better,1.05128


In [4]:
# inspect the type of the object bound to df
typeof(df)

DataFrame

### Access Dataframe Rows

In [5]:
# access the first row: row index 1, all columns (:)
df[1, :]

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,String,Float64
1,9.44397,Best,1.13941


In [6]:
# access rows 1 and 2: a vector of row indices, all columns (:)
df[[1, 2],:]

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,String,Float64
1,9.44397,Best,1.13941
2,9.55562,Good,1.15612


### Access Dataframe Columns

In [7]:
# access column x1 as a vector: selecting a single column by its Symbol
# name (with all rows) yields the column's values rather than a dataframe
df[:, :x1]

100-element Vector{Float64}:
  9.443973123853613
  9.555616642890303
 10.027155338009194
  9.700515909641089
 11.777861098057326
  8.85509846827118
  9.531394117832326
 10.15614346264074
  7.358008991923204
 11.003309901459485
 11.08238120560843
 10.18702790710363
 10.518148787877138
  ⋮
  9.06950711978119
  8.00944602766854
 10.137539801928368
 13.149575924127522
  9.278587801512574
  9.422907303013153
 10.459383775304756
 10.247207706055054
  9.974337506195937
  9.367919031973127
  8.979580645130948
  8.688492199795714

In [8]:
# access column x1 as a dataframe: wrapping the column name in a vector
# keeps the result tabular (a one-column dataframe)
df[:,[:x1]]

Unnamed: 0_level_0,x1
Unnamed: 0_level_1,Float64
1,9.44397
2,9.55562
3,10.0272
4,9.70052
5,11.7779
6,8.8551
7,9.53139
8,10.1561
9,7.35801
10,11.0033


In [9]:
# access columns x1 and x2 (all rows) by passing a vector of column names
df[:, [:x1, :x2]]

Unnamed: 0_level_0,x1,x2
Unnamed: 0_level_1,Float64,String
1,9.44397,Best
2,9.55562,Good
3,10.0272,Better
4,9.70052,Better
5,11.7779,Good
6,8.8551,Better
7,9.53139,Best
8,10.1561,Good
9,7.35801,Better
10,11.0033,Better


### Access Dataframe Rows and Columns

In [10]:
# slice the dataframe by indices: rows 1-2 and columns 1-2
df[1:2, 1:2]

Unnamed: 0_level_0,x1,x2
Unnamed: 0_level_1,Float64,String
1,9.44397,Best
2,9.55562,Good


In [12]:
# slice the dataframe by column names: rows 1-2 of columns x1 and x2
df[1:2, [:x1, :x2]]

Unnamed: 0_level_0,x1,x2
Unnamed: 0_level_1,Float64,String
1,9.44397,Best
2,9.55562,Good


In [31]:
# select only the rows where x2 == "Better"
# broadcasting the comparison over the whole column builds the boolean row
# mask in one step, instead of indexing the dataframe once per row
mask = df.x2 .== "Better"
df[mask, :]

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,String,Float64
1,10.0272,Better,1.14102
2,9.70052,Better,1.13922
3,8.8551,Better,1.04502
4,7.35801,Better,1.098
5,11.0033,Better,1.05128
6,10.3676,Better,1.02916
7,8.40942,Better,1.03214
8,10.4107,Better,1.02553
9,8.94901,Better,1.02335
10,10.5698,Better,1.04113
