# Example 02-02: Salaries for Professors

[R documentation](http://finzi.psych.upenn.edu/R/library/car/html/Salaries.html)

In [None]:
using DataFramesMeta
using Gadfly
using KernelDensity
using RDatasets

## Exploring the contents of the RDatasets package

In [None]:
# Presumably lists the R packages whose datasets RDatasets ships.
RDatasets.packages()

In [None]:
# Catalogue of available datasets; the filters below rely on its
# :Package and :Dataset columns.
RDatasets.datasets()

In [None]:
# Same call with a package name; presumably restricts the catalogue to "car".
RDatasets.datasets("car")

In [None]:
# Keep only the catalogue rows whose Package column equals "car".
@where(RDatasets.datasets(), :Package .== "car")

In [None]:
# Two conditions: a row is kept only when both hold, which narrows the
# catalogue to the entry for the Salaries dataset of the "car" package.
@where(RDatasets.datasets(), :Package .== "car", :Dataset .== "Salaries")

## Loading a dataset from RDatasets

In [None]:
# Load the "Salaries" dataset of the "car" package; the cells below all work on
# this SalariesFrame variable.
SalariesFrame = dataset("car", "Salaries")

## Getting basic information about a DataFrame

In [None]:
# Dimensions of the frame.
size(SalariesFrame)

In [None]:
# Column names.
names(SalariesFrame)

In [121]:
# First 10 rows.
head(SalariesFrame, 10)

Unnamed: 0,Rank,Discipline,YrsSincePhD,YrsService,Sex,Salary
1,AsstProf,B,1,0,Male,88000
2,AsstProf,B,1,0,Male,88795
3,AsstProf,B,2,0,Male,78000
4,AsstProf,A,2,0,Female,72500
5,AsstProf,A,2,0,Male,85000
6,AsstProf,B,4,0,Male,84000
7,AsstProf,B,4,0,Male,92000
8,AsstProf,B,5,0,Female,77000
9,AsstProf,A,5,0,Male,74000
10,AsstProf,B,11,0,Male,77000


In [None]:
# Last rows of the frame (default row count).
tail(SalariesFrame)

In [None]:
# Per-column summary of the frame.
describe(SalariesFrame)

## Basic manipulation of a DataFrame

In [None]:
# Sort the frame in place by YrsService, breaking ties by YrsSincePhD.
sort!(SalariesFrame, cols=[:YrsService, :YrsSincePhD])

In [None]:
# NOTE(review): pooling of Discipline and Sex is disabled here; presumably the
# columns already arrive pooled from RDatasets — confirm before re-enabling.
# pool!(SalariesFrame, [:Discipline, :Sex])

In [None]:
# Distinct values of the Sex column.
levels(SalariesFrame[:Sex])

## Exploratory statistical analysis of professorial salaries

In [None]:
# Summary of the Salary column for one group of rows: the number of rows, plus
# the mean and standard deviation of Salary, both rounded to 3 decimal places.
# Every grouped summary below goes through this one helper so the seven tables
# cannot drift apart.
function salarysummary(df)
  salary = df[:Salary]
  return DataFrame(
    n = length(salary),
    mean = round(mean(salary), 3),
    std = round(std(salary), 3)
  )
end

# Function-first `by(f, frame, cols)` is the call that the
# `by(frame, cols) do df ... end` form expands to.
# Salary summary per academic rank.
by(salarysummary, SalariesFrame, :Rank)

In [None]:
# Salary summary per discipline.
by(salarysummary, SalariesFrame, :Discipline)

In [None]:
# Salary summary per sex.
by(salarysummary, SalariesFrame, :Sex)

In [None]:
# Salary summary per (rank, discipline) combination.
by(salarysummary, SalariesFrame, [:Rank, :Discipline])

In [120]:
# Salary summary per (rank, sex) combination.
by(salarysummary, SalariesFrame, [:Rank, :Sex])

Unnamed: 0,Rank,Sex,n,mean,std
1,AsstProf,Female,11,78049.909,9371.996
2,AsstProf,Male,56,81311.464,7901.343
3,AssocProf,Female,10,88512.8,17965.286
4,AssocProf,Male,54,94869.704,12890.817
5,Prof,Female,18,121967.611,19619.583
6,Prof,Male,248,127120.823,28213.808


In [119]:
# Salary summary per (discipline, sex) combination.
by(salarysummary, SalariesFrame, [:Discipline, :Sex])

Unnamed: 0,Discipline,Sex,n,mean,std
1,A,Female,18,89064.944,21638.581
2,A,Male,163,110699.982,30663.119
3,B,Female,21,111234.524,25367.24
4,B,Male,195,118760.374,29831.315


In [118]:
# Salary summary per (rank, discipline, sex) combination.
by(salarysummary, SalariesFrame, [:Rank, :Discipline, :Sex])

Unnamed: 0,Rank,Discipline,Sex,n,mean,std
1,AsstProf,A,Female,6,72933.333,5463.21
2,AsstProf,A,Male,18,74269.611,4580.125
3,AsstProf,B,Female,5,84189.8,9792.119
4,AsstProf,B,Male,38,84647.079,6900.293
5,AssocProf,A,Female,4,72128.5,6402.716
6,AssocProf,A,Male,22,85048.864,10611.885
7,AssocProf,B,Female,6,99435.667,14086.476
8,AssocProf,B,Male,32,101621.531,9607.891
9,Prof,A,Female,8,109631.875,15094.589
10,Prof,A,Male,123,120619.26,28504.875


In [None]:
# Boxplot of Salary for each Rank.
plot(SalariesFrame, x="Rank", y="Salary", Geom.boxplot)

In [None]:
# Boxplot of Salary for each Discipline.
plot(SalariesFrame, x="Discipline", y="Salary", Geom.boxplot)

In [None]:
# Boxplot of Salary for each Sex.
plot(SalariesFrame, x="Sex", y="Salary", Geom.boxplot)

In [None]:
# One Salary boxplot per grouping column (Rank, Discipline, Sex), stacked
# horizontally into a single figure in that order.
hstack(map(
  column -> plot(SalariesFrame, x=column, y="Salary", Geom.boxplot),
  ["Rank", "Discipline", "Sex"]
)...)

In [None]:
# Salary against years since PhD; no geometry is given, so Gadfly's default is used.
plot(SalariesFrame, x="YrsSincePhD", y="Salary")

In [None]:
# Salary against years of service.
plot(SalariesFrame, x="YrsService", y="Salary")

In [None]:
# The same two plots, coloured by Rank.
plot(SalariesFrame, x="YrsSincePhD", y="Salary", color="Rank")

In [None]:
plot(SalariesFrame, x="YrsService", y="Salary", color="Rank")

In [None]:
# The same two plots, coloured by Discipline.
plot(SalariesFrame, x="YrsSincePhD", y="Salary", color="Discipline")

In [None]:
plot(SalariesFrame, x="YrsService", y="Salary", color="Discipline")

In [None]:
# The same two plots, coloured by Sex.
plot(SalariesFrame, x="YrsSincePhD", y="Salary", color="Sex")

In [None]:
plot(SalariesFrame, x="YrsService", y="Salary", color="Sex")

In [None]:
# Histogram of Salary with 50 bins, drawn on a density scale (density=true).
plot(SalariesFrame, x="Salary", Geom.histogram(bincount=50, density=true))

In [None]:
# Kernel density estimate of the whole Salary column.
salaries = kde(SalariesFrame[:Salary])

# Draw the estimate as a line: the `x` field of the result on the horizontal
# axis and its `density` field on the vertical axis (y label left empty).
plot(
  x=salaries.x,
  y=salaries.density,
  Geom.line,
  Guide.xlabel("Salary"),
  Guide.ylabel(""),
  Guide.title("Kernel density estimator")
)

In [None]:
# Salary histogram (50 bins, density scale) coloured by Discipline.
plot(SalariesFrame, x="Salary", color="Discipline", Geom.histogram(bincount=50, density=true))

In [None]:
# Salary histogram (50 bins, density scale) coloured by Rank.
plot(SalariesFrame, x="Salary", color="Rank", Geom.histogram(bincount=50, density=true))

In [None]:
# One kernel density estimate of Salary per rank, stored in the order returned
# by levels(SalariesFrame[:Rank]).
kdes = [kde(@where(SalariesFrame, :Rank .== level)[:Salary]) for level in levels(SalariesFrame[:Rank])]

In [None]:
# Overlay the three per-rank density estimates from `kdes`, one colour per layer.
# NOTE(review): the legend labels below are hard-coded and assume that
# levels(SalariesFrame[:Rank]) yields AsstProf, AssocProf, Prof in that order,
# because kdes[1..3] follow the levels order — confirm, otherwise the legend
# mislabels the curves.
plot(
layer(x=kdes[1].x, y=kdes[1].density, Geom.line, Theme(default_color=colorant"blue")),
layer(x=kdes[2].x, y=kdes[2].density, Geom.line, Theme(default_color=colorant"red")),
layer(x=kdes[3].x, y=kdes[3].density, Geom.line, Theme(default_color=colorant"green")),
Guide.manual_color_key("Rank", ["AsstProf", "AssocProf", "Prof"], ["blue", "red", "green"]),
Guide.xlabel("Salary"), Guide.ylabel(""), Guide.title("Kernel density estimators")
)

In [None]:
# NOTE(review): z is a single data column here, while Geom.contour is
# documented for a matrix or a function as z — confirm this cell renders
# what was intended.
plot(z=SalariesFrame[:Salary], Geom.contour, Guide.colorkey("Elevation"))

In [None]:
# 2-D histogram of Salary against years of service on a 30 x 30 bin grid.
plot(SalariesFrame, x="YrsService", y="Salary", Geom.histogram2d(xbincount=30, ybincount=30))