# RevoScaleR

## 1. Introduction

### Import data

In [None]:
# Build the path to the sample airline CSV in the RevoScaleR sample data
# directory, and name the xdf file that will receive the imported copy
myAirlineCsv <- file.path(
  rxGetOption("sampleDataDir"),
  "2007_subset.csv"
)
myAirlineXdf <- "2007_subset.xdf"

# Convert the CSV to xdf format (replacing any existing output file), then
# list the files in the working directory
rxImport(
  inData = myAirlineCsv,
  outFile = myAirlineXdf,
  overwrite = TRUE
)
list.files()

### Summarize data

In [None]:
# Show basic information about the xdf file, including per-variable details
# and the first 10 rows of data
rxGetInfo(data = myAirlineXdf,
          getVarInfo = TRUE,
          numRows = 10)

# Summarize actual elapsed time, time in the air, departure delay and flight
# distance
rxSummary(formula = ~ ActualElapsedTime + AirTime + DepDelay + Distance,
          data = myAirlineXdf)

# Histogram of departure delays
rxHistogram(formula = ~ DepDelay,
            data = myAirlineXdf)

# Same histogram, zoomed in on the interesting range of delays.
# The formula has to be one-sided (~ DepDelay): a bare DepDelay would be
# evaluated as an (undefined) R object rather than treated as a column name.
rxHistogram(formula = ~ DepDelay,
            data = myAirlineXdf,
            xAxisMinMax = c(-100, 400),
            numBreaks = 500,
            xNumTicks = 500)

### Create new variables (rxDataStep)

In [None]:
# Add a new column airSpeed (distance traveled / time in the air).
# varsToKeep is deliberately not passed: the rxDataStep documentation lists it
# as unusable when outFile is the same file as inData, and the variables
# referenced in `transforms` are read regardless.
rxDataStep(inData = myAirlineXdf,
           outFile = myAirlineXdf,
           transforms = list(airSpeed = Distance / AirTime),
           append = "cols",
           overwrite = TRUE)

# Get variable information for airSpeed only
rxGetInfo(data = myAirlineXdf,
          getVarInfo = TRUE,
          varsToKeep = "airSpeed")

# Summary statistics for the airSpeed variable
rxSummary(~ airSpeed,
          data = myAirlineXdf)

# Histogram of airSpeed over its full range
rxHistogram(~ airSpeed,
            data = myAirlineXdf)

# Same histogram with the x-axis limited through xAxisMinMax, using finer
# bins and fewer tick marks
rxHistogram(~ airSpeed,
            data = myAirlineXdf,
            xNumTicks = 10,
            numBreaks = 1500,
            xAxisMinMax = c(0, 12))

### Transform variables (rxDataStep)

In [None]:
# Convert airSpeed to miles per hour by multiplying by 60
# (presumably AirTime is in minutes and Distance in miles — confirm).
# varsToKeep is deliberately not passed: the rxDataStep documentation lists it
# as unusable when outFile is the same file as inData, and if it were honoured
# while overwriting the file it would drop DepDelay and ArrDelay, which the
# correlation and regression cells further down still need.
rxDataStep(inData = myAirlineXdf,
           outFile = myAirlineXdf,
           transforms = list(airSpeed = airSpeed * 60),
           overwrite = TRUE)

# Histogram for airSpeed after the conversion
rxHistogram(~ airSpeed, data = myAirlineXdf)

### Correlations

In [None]:
# Correlations between departure delay, arrival delay and air speed,
# computed only on rows whose air speed lies strictly between 50 and 800
rxCor(
  formula = ~ DepDelay + ArrDelay + airSpeed,
  rowSelection = (airSpeed > 50) & (airSpeed < 800),
  data = myAirlineXdf
)

### Linear regression

In [None]:
# Fit a linear model of air speed on departure delay, restricted to rows
# whose air speed lies strictly between 50 and 800, then print the summary
myLMobj <- rxLinMod(
  formula = airSpeed ~ DepDelay,
  rowSelection = (airSpeed > 50) & (airSpeed < 800),
  data = myAirlineXdf
)
summary(myLMobj)

## 2. Data exploration

### RevoScaleR options

In [None]:
## List the names of all options returned by rxOptions():
names(rxOptions())

## Look up the value of the "sampleDataDir" option (the same option the other
## cells in this notebook use to locate sample data files):
rxGetOption("sampleDataDir")

## Look up the current value of the "reportProgress" option:
rxGetOption("reportProgress")

## Set the "reportProgress" option to 0:
rxOptions(reportProgress = 0)

### Import and explore data

In [None]:
## Path to the DJIAdaily.xdf file in the RevoScaleR sample data directory:
djiXdf <- file.path(
  rxGetOption("sampleDataDir"),
  "DJIAdaily.xdf"
)

## Print information about the file, including details for every variable:
rxGetInfo(data = djiXdf, getVarInfo = TRUE)

### Extract metadata

In [None]:
## Fetch the per-variable metadata for the dataset and list the variable names:
djiVarInfo <- rxGetVarInfo(data = djiXdf)
names(djiVarInfo)

## Pull out the metadata entry for the Close variable and print it:
closeVarInfo <- djiVarInfo[["Close"]]
closeVarInfo
## Inspect the class and the internal structure of that entry:
class(closeVarInfo)
str(closeVarInfo)

## Store the "high" element of the Close metadata (the recorded maximum):
closeMax <- closeVarInfo[["high"]]

### Summarize variables (rxSummary)

In [None]:
## Summary statistics for day of week, closing value and volume:
rxSummary(formula = ~ DayOfWeek + Close + Volume,
          data = djiXdf)

## Summary of day of week and closing value, using Volume as frequency weights:
rxSummary(formula = ~ DayOfWeek + Close,
          fweights = "Volume",
          data = djiXdf)

## Frequency count per day of the week:
rxCrossTabs(formula = ~ DayOfWeek,
            data = djiXdf)

### Explore distribution (rxHistogram)

In [None]:
## Histogram of a numeric variable:
rxHistogram(formula = ~ Close,
            data = djiXdf)
## Histogram of a categorical variable:
rxHistogram(formula = ~ DayOfWeek,
            data = djiXdf)

## One panel per day of the week:
rxHistogram(formula = ~ Close | DayOfWeek,
            data = djiXdf)

## Numeric variable again, with Volume as frequency weights:
rxHistogram(formula = ~ Close,
            fweights = "Volume",
            data = djiXdf)

### Plot bivariate relationship (rxLinePlot)

In [None]:
## Closing value plotted against days since 1928:
rxLinePlot(formula = Close ~ DaysSince1928,
           data = djiXdf)

## The same plot with a separate panel for each day of the week:
rxLinePlot(formula = Close ~ DaysSince1928 | DayOfWeek,
           data = djiXdf)

## The same plot with one group per day of the week:
rxLinePlot(formula = Close ~ DaysSince1928,
           groups = DayOfWeek,
           data = djiXdf)

## The same plot with log() applied to the y variable:
rxLinePlot(formula = log(Close) ~ DaysSince1928,
           data = djiXdf)

### Summarizing Variables with rxCrossTabs()

In [None]:
## Summed volume for each day of the week:
rxCrossTabs(
  formula = Volume ~ DayOfWeek,
  data = djiXdf
)

## Summed volume for each day of the week within each month:
rxCrossTabs(
  formula = Volume ~ F(Month):DayOfWeek,
  data = djiXdf
)

## Average volume for each day of the week within each month:
rxCrossTabs(
  formula = Volume ~ F(Month):DayOfWeek,
  means = TRUE,
  data = djiXdf
)

## Average closing price for each day of the week within each month,
## using volume as frequency weights:
rxCrossTabs(
  formula = Close ~ F(Month):DayOfWeek,
  means = TRUE,
  fweights = "Volume",
  data = djiXdf
)

### Summarizing Variables with rxCube()

In [None]:
## Summed volume for each day of the week, first with rxCrossTabs() and then
## with rxCube() (means switched off):
rxCrossTabs(formula = Volume ~ DayOfWeek,
            data = djiXdf)
rxCube(formula = Volume ~ DayOfWeek,
       means = FALSE,
       data = djiXdf)

## Summed volume for each day of the week within each month, both ways:
rxCrossTabs(formula = Volume ~ F(Month):DayOfWeek,
            data = djiXdf)
rxCube(formula = Volume ~ F(Month):DayOfWeek,
       means = FALSE,
       data = djiXdf)

## Average volume for each day of the week within each month:
rxCube(formula = Volume ~ F(Month):DayOfWeek,
       data = djiXdf)

## Average closing price for each day of the week within each month,
## using volume as frequency weights:
rxCube(formula = Close ~ F(Month):DayOfWeek,
       fweights = "Volume",
       data = djiXdf)

## 3. Data manipulation

### Using rxDataStep() to transform data

In [None]:
## mortData is not defined anywhere earlier in this notebook, so define it here
## unless the session already provides it.
## NOTE(review): the file name is assumed from the variables used below
## (ccDebt, houseAge, creditScore) — confirm it is the intended dataset.
if (!exists("mortData")) {
  mortData <- file.path(rxGetOption("sampleDataDir"), "mortDefaultSmall.xdf")
}

## Get information on mortData
rxGetInfo(mortData)

## Set up my personal copy of the data:
myMortData <- "myMD.xdf"

## Create the transform: flag rows whose credit card debt exceeds 8000.
## overwrite = TRUE lets the cell be re-run after myMD.xdf already exists.
rxDataStep(inData = mortData, outFile = myMortData,
           transforms = list(highDebtRow = ccDebt > 8000),
           overwrite = TRUE
  )
## Get the variable information
rxGetVarInfo(myMortData)
## Summarize the new flag (its mean is the proportion of rows that are TRUE)
rxSummary( ~ highDebtRow, data = myMortData)

## Compute multiple transforms in one pass and append them as new columns:
## newHouse flags houses younger than 10, ccsXhd multiplies the credit score
## by the high-debt flag.
rxDataStep(inData = myMortData, outFile = myMortData,
           transforms = list(
             newHouse = houseAge < 10,
             ccsXhd = creditScore * highDebtRow
             ),
           append = "cols",
           overwrite = TRUE
  )

### Complex transforms using transformFuncs

In [None]:
## Summarize creditScore, keep the result and print it
csSummary <- rxSummary(formula = ~ creditScore, data = mortData)
csSummary

## Take the first Mean and StdDev entries from the summary's data frame
sdCS <- csSummary$sDataFrame$StdDev[1]
meanCS <- csSummary$sDataFrame$Mean[1]

## Transform function that adds a standardized credit score to a list of
## variables.
##   mylist: list holding a "creditScore" element.
##   Returns mylist with an extra "scaledCreditScore" element equal to
##   (creditScore - myCenter) / myScale.
## myCenter and myScale are not arguments; they must be visible when the
## function runs (e.g. supplied through rxDataStep's transformObjects).
scaleCS <- function(mylist) {
  centered <- mylist[["creditScore"]] - myCenter
  mylist[["scaledCreditScore"]] <- centered / myScale
  mylist
}

## Run the transform function with rxDataStep, passing the center and scale
## values in through transformObjects under the names scaleCS expects.
## overwrite = TRUE is required because myMD.xdf was already created by the
## earlier rxDataStep cell; without it the write to an existing file fails.
myMortData <- "myMD.xdf"
rxDataStep(inData = mortData, outFile = myMortData,
           transformFunc = scaleCS,
           transformObjects = list(myCenter = meanCS, myScale = sdCS),
           overwrite = TRUE
           )

## Check the new variable. rxSummary needs a formula naming the column that
## scaleCS created (scaledCreditScore), not the transform function itself.
rxGetVarInfo(myMortData)
rxSummary(~ scaledCreditScore, data = myMortData)