Skip to content

Commit

Permalink
Used base R functions
Browse files Browse the repository at this point in the history
Replaced the data.table function (fread) with base R function for reading csv files (read.csv)
  • Loading branch information
Emaasit committed Jun 8, 2015
1 parent 33f9882 commit a550f70
Showing 1 changed file with 8 additions and 13 deletions.
21 changes: 8 additions & 13 deletions examples/src/main/r/data-manipulation.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# Load SparkR library into your R session
library(SparkR)

## Initialize SparkContext on your local PC
## Initialize SparkContext
sc <- sparkR.init(master = "local", appName = "MyApp")

## Initialize SQLContext
Expand All @@ -31,19 +31,16 @@ sqlContext <- SparkRSQL.init(sc)
# The data set is made up of 227,496 rows x 14 columns.


# Option 1: Create an R data frame and then convert it to a SparkR DataFrame -------
# Option 1: Create a local R data frame and then convert it to a SparkR DataFrame -------

## Create R dataframe
install.packages("data.table") #We want to use the fread() function to read the dataset
library(data.table)

flights_df <- fread("flights.csv")
## Create a local R dataframe
flights_df <- read.csv("flights.csv")
flights_df$date <- as.Date(flights_df$date)

## Convert the local data frame into a SparkR DataFrame
flightsDF <- createDataFrame(sqlContext, flights_df)

# Option 2: Alternatively, directly create a SparkR DataFrame from the source data
# Option 2: Alternatively, directly create a SparkR DataFrame from the source data -------
flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")

# Print the schema of this Spark DataFrame
Expand All @@ -52,11 +49,6 @@ printSchema(flightsDF)
# Cache the DataFrame
cache(flightsDF)


# Install the magrittr pipeline operator
install.packages("magrittr")
library(magrittr)

# Print the first 6 rows of the DataFrame
showDF(flightsDF, numRows = 6) ## Or
head(flightsDF)
Expand Down Expand Up @@ -88,6 +80,9 @@ print(dest_df)
jfkDF <- filter(flightsDF, "dest == JFK") ##OR
jfkDF <- filter(flightsDF, flightsDF$dest == JFK)

# Install the magrittr library
library(magrittr)

# Group the flights by date and then find the average daily delay
# Write the result into a DataFrame
groupBy(flightsDF, "date") %>%
Expand Down

0 comments on commit a550f70

Please sign in to comment.