Disparity

mguevara · Sep 15, 2015 · 1b35828 · 1b35828
1 parent 177cab5
commit 1b35828
Show file tree

Hide file tree

Showing 12 changed files with 260 additions and 77 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,17 @@
-Package: diversity
-Title: Diversity Measures Package
-Version: 0.1
-Authors: Miguel Guevara <miguel.guevara@upla.cl>, Dominik Hartmann
+Package: diveR
+Title: An R Package to Compute Diversity Measures
+Version: 0.4
+Author: Miguel R. Guevara <miguel.guevara@upla.cl>, Dominik Hartmann
         <d.hartmann@uni-hohenheim.de>, Marcelo Mendoza
         <marcelo.mendoza@usm.cl>
-Description: It implements a number of diversity measures for data analysis
-Depends: proxy
-License: GPL
+Maintainer: Miguel R. Guevara <miguel.guevara@upla.cl>
+Description: It implements a number of diversity measures for data analysis considering three dimensions: Variety, Balance and Disparity. 
+Depends: proxy,pheatmap,reshape2,foreign
+URL: https://github.com/mguevara/diveR
+Repository: CRAN
+Date: 2015-09-18 16:34:12
+BugReports: https://github.com/mguevara/diveR/issues
+License: MIT
+LazyData: true
 NeedsCompilation: no
-Packaged: 2015-08-17 21:26:31 UTC; mmendoza
+Packaged: 2015-09-13 21:26:31 UTC; mmendoza
diff --git a/R/diversity.R b/R/diversity.R
@@ -105,7 +105,7 @@ diversity <- function(data, type="all", method='euclidean', agg_type=NULL, q=0,
     diversity <- merge(diversity,m_d, by=0, all=TRUE)
     rownames(diversity) <- diversity$Row.names; diversity$Row.names <- NULL
   }
-  if(type == 'inverse-simpson' || type=='inv' || type == 'all') {
+  if(type == 'inverse-simpson' || type=='is' || type == 'all') {
     q <- 2
     p <- 1/(1-q)
     m_d <- as.data.frame((rowSums(propX ^ q, na.rm=TRUE)) ^ p)
@@ -126,24 +126,47 @@ diversity <- function(data, type="all", method='euclidean', agg_type=NULL, q=0,
     diversity <- merge(diversity,m_d, by=0, all=TRUE)
     rownames(diversity) <- diversity$Row.names; diversity$Row.names <- NULL
   }
-  if(!is.null(method)) {
-    if (method == 'jaccard') {
-      disX <- as.matrix(dist(t(X), method="Jaccard"), diag=1)
-
-    }
-    else if (method == 'euclidean') {
-      disX <- as.matrix(dist(t(X), method="euclidean"), diag=1)
-    }
-    else if (method == "cosine") {
-      disX <- as.matrix(dist(t(X), method="cosine"), diag=1)
-    }
-  }
-  else {
-    disX <- as.matrix(dist(t(X), method="euclidean"), diag=1)
-  }
   if(type == 'rao-stirling' || type=='rs' || type == 'all') {
-      m_d <- as.data.frame(rowSums(propX^beta %*% disX^alpha * propX^beta))
-      colnames(m_d) <- c('rao-stirling')
+  	disX <- distances(X, agg_type = agg_type, method=method) #compute distances first	
+  	disX_mask <- disX
+  	disX_mask[ (!is.na(disX_mask))] <- 1
+  	disX_mask[lower.tri(disX_mask)] <- 0
+  	diag(disX_mask) <- 0
+  	disX_mask[is.na(disX_mask)] <- 0
+
+  	#print(propX)
+  	N <- ncol(propX)
+  	m_d <- data.frame(row.names =  rownames(propX))
+  	str(m_d)
+  	m_d[,'rao-stirling'] <- NA
+
+  	for(entity in row.names(propX)) #go into each entity
+  	  {
+  	  	entity_data <- propX[entity,]
+  	  	prop_i <- matrix(entity_data, ,nrow=N,ncol=N, byrow=TRUE)
+  	  	prop_j <- matrix(entity_data, ,nrow=N,ncol=N, byrow=FALSE)
+  	  	p_ij <- prop_i * prop_j
+				p_ij_mask <- p_ij
+  			#p_ij_mask[p_ij_mask != 0] <- 1
+  			p_ij_mask[upper.tri(p_ij_mask)] <- 1
+  		  diag(p_ij_mask) <- 0
+  		  p_ij_mask[lower.tri(p_ij_mask)] <- 0
+  	  	rs_entity <- ((disX^alpha)*disX_mask) * ((p_ij^beta)*p_ij_mask)  #deleting proportions or distances inexistent that might become 1, because of potentia of zero.	
+  		  m_d[entity, 'rao-stirling'] <- sum(rs_entity)
+  		  print(entity)
+  		  print(p_ij)
+  		  print(p_ij_mask)
+  	    print((p_ij^beta)*p_ij_mask)
+  	  }
+  	#print(rs_entity)
+  	#print((p_ij^beta)*p_ij_mask)
+  	print(propX)
+  	print(disX)
+  	print(disX_mask)
+
+  	#print((disX^alpha)*disX_mask)
+  	  #m_d <- as.data.frame(rowSums(propX^beta %*% disX^alpha * propX^beta))
+
       diversity <- merge(diversity,m_d, by=0, all=TRUE)
       rownames(diversity) <- diversity$Row.names; diversity$Row.names <- NULL
   }
@@ -156,6 +179,8 @@ diversity <- function(data, type="all", method='euclidean', agg_type=NULL, q=0,
   return(diversity)
 }
 
+
+
 #' @title Get Data
 #' @description It takes data as dataframe (edges) or as matrix (table) to be exported in proper form to be used by the diversity function.
 #' @param data Data to be processed as dataframe or as matrix. 
@@ -168,6 +193,7 @@ get_data <- function(data, agg_type)
 	if (is.data.frame(data)) {
 		if(!is.null(agg_type)) {
 			#diversity <- data.frame(row.names=levels(data[,2]))
+			data <- droplevels(data) #delete un used levels
 			X <- matrix(0, nrow=nlevels(data[,2]), ncol=nlevels(data[,1]), dimnames=list(levels(data[,2]),levels(data[,1])))
 			X[cbind(data[,2], data[,1])] <- data[,3]
 		}
@@ -191,14 +217,34 @@ get_data <- function(data, agg_type)
 
 }
 
+
+#' @title Variety
+#' @description It computes the variety or simple diversity of a system. Number of types
+#' @param data Data to be processed as dataframe or as matrix. 
+#' @examples 
+#' vari <- varity(data=d)
+#' @return a dataframe with values of variety
+#' @export
+variety <- function(data, sort=TRUE)
+{
+	vari <- diversity(data, type='v')
+	if(sort != FALSE)
+	{
+		vari['category'] <- row.names(vari)
+		vari <- vari[order(vari$variety, decreasing = TRUE), ]
+		vari['category'] <- NULL	
+	}
+
+	return(vari)
+}
+
 #' @title Ubiquity
 #' @description It computes the ubiquity or the rearnes of the categories
 #' @param data Data to be processed as dataframe or as matrix. 
 #' @param data_agg Diferent of NULL if column analysis is needed.
 #' @examples 
 #' ub <- ubiquity(data=d)
 #' @return a dataframe with values of frequency per category. Decreasing order
-#' @export
 ubiquity <- function(data)
 {
 	ubiq <- diversity(data, type='v', method='euclidean' , agg_type='col')
@@ -250,22 +296,48 @@ readEdges <- function(path,sepr,we=TRUE){
   return(X)
 }
 
-#' @title A procedure to read a data frame from a csv file
-#' @description It takes a file and creates a data frame for diversity analysis
-#' @param path A string representing the path to a csv file. Rows and columns are described by strings (Ex.: "asind" "0010" 216000).
-#' @param sepr Separator field used in the file to separate columns
-#' @param we It indicates if the list of edges includes weights or not. Default is TRUE
-#' @return A data frame with objects as rows and categories as cols
+#' @title A procedure to read data of a data file in formats csv, dta or spss
+#' @description It reads a file with data shaped as a matrix or as edges list. Several types of formats are allowed.
+#' @param path A string representing the path to data file. If it is shpaed as a matrix, first column must include proper names of the categories. If it is shaped as edges list, it must contain three columns, these are entity, category, value. 
+#' @param sep Separator field used in the file to separate columns, if it is a CSV file. Default value is comma.
+#' @param type It indicates the type of data to be read. This parameter facilitate the input of diverse type of data files, as spss or stata Posible options are the names of the mentioned softwares. Default value is csv.
+#' @return A data frame with three columns, even when the input file is shaped as a matrix.
 #' @examples 
-#' path <- "~/MyDiversity/data/sitc_cnt_62.csv"
-#' sepr <- ' '
-#' we <- TRUE
-#' data <- readCSV(path,sepr,we)
-readCSV <- function(path,sepr,we=TRUE){
-  data <- data.frame()
-  #if (!we) col = 2 else col = 3
-  col_classes <- c('factor', 'factor', NA)
-  data <- rbind(data, read.csv(path, sep=sepr, colClasses=col_classes))
+#' path <-  path_to_matrix_file <- system.file("extdata", "PantheonMatrix.csv", package = "diveR")
+#' sep <- ','
+#' data <- read.data(path)
+#' path <-  path_to_matrix_file <- system.file("extdata", "PantheonEdges.csv", package = "diveR")
+#' data <- read.data(path)
+read.data <- function(path, type='csv',sep=','){
+
+	if(type=='csv')
+	{
+		data_temp <- read.csv(path, sep=sep, nrow=1)
+		if(ncol(data_temp)>3)
+		{#matrix shape
+			data <- read.csv(path,sep=sep)
+			data <- melt(data, na.rm = TRUE)
+		}
+		else
+		{
+			col_classes <- c('factor', 'factor', NA)
+			data <- read.csv(path, sep=sep, colClasses=col_classes)		
+		}
+
+	}
+		if(type=='spss')
+	{
+			data <- read.spss(path, use.value.labels = FALSE)
+	}
+	if(type=='stata')
+	{
+		data <- read.dta(path)
+	}
+	if(ncol(data)> 3)
+	{
+		data <- melt(data, na.rm=TRUE)
+	}
+	row.names(data) <- NULL
   return(data)
 }
 
@@ -290,40 +362,19 @@ writeCSV <- function(frame, path, sepr) {
 #' @examples 
 #' Xdis <- dist_mat(data)
 #' Xdis <- dist_mat(data, method="jaccard", agg_type='col')
-dist_mat <- function(data, method=NULL, agg_type=NULL){
-  if (is.data.frame(data)) {
-    if(is.null(agg_type)) {
-      X <- matrix(0, nrow=nlevels(data[,2]), ncol=nlevels(data[,1]), dimnames=list(levels(data[,2]),levels(data[,1])))
-      X[cbind(data[,2], data[,1])] <- data[,3]
-    }
-    else {
-      X <- matrix(0, nrow=nlevels(data[,1]), ncol=nlevels(data[,2]), dimnames=list(levels(data[,1]),levels(data[,2])))
-      X[cbind(data[,1], data[,2])] <- data[,3]
-    }
-  }
-  else {
-    if (!is.null(agg_type)) {
-      X <- t(data)
-    }
-    else {
-      X <- data
-    }  
-  }
-  if(!is.null(method)) {
+distances <- function(data, method='cosine', agg_type=NULL){
+    X <- get_data(data=data, agg_type=agg_type)
+
     if (method == 'jaccard') {
-      disX <- as.matrix(dist(t(X), method="Jaccard"), diag=1)
-
+      disX <- as.matrix(dist(t(X), method="Jaccard"), diag=1)  
     }
-    else if (method == 'euclidean') {
+		if (method == 'euclidean') {
       disX <- as.matrix(dist(t(X), method="euclidean"), diag=1)
     }
-    else if (method == "cosine") {
+    if (method == "cosine") {
       disX <- as.matrix(dist(t(X), method="cosine"), diag=1)
     }
-  }
-  else {
-    disX <- as.matrix(dist(t(X), method="cosine"), diag=1)
-  }
+
   return(disX)
 }
 
@@ -337,7 +388,7 @@ dist_mat <- function(data, method=NULL, agg_type=NULL){
 #' Xdis <- dist_mat(data, method="euclidean")
 #' disp <- disparity(Xdis)
 #' disp <- disparity(Xdis, method="avg", agg_type='col')
-disparity <- function(X, method="all", agg_type="row") {
+disparity <- function(X, method="all", agg_type="col") {
   if (agg_type == "col") {
     X <- t(X)
     disparity <- data.frame(row.names=rownames(X))
@@ -358,4 +409,38 @@ disparity <- function(X, method="all", agg_type="row") {
     rownames(disparity) <- disparity$Row.names; disparity$Row.names <- NULL
   }
   return(disparity)
-}
+}
+
+#' @title Balance
+#' @description A procedure to compute several measures of the dimension balance of the diversity.
+#' @param data A matrix of data with row and column names. Or a dataframe with three columns entity, category and value
+#' @examples 
+#' balance(data)
+balance <- function(data )
+{
+	balance <- diversity(data, type='entropy') #first balance measure
+	measures <- c( 'gini','simpson', 'berger-parker', 'inverse-simpson', 'evenness' )
+	for(measure in measures)
+	{
+		m_b <- diversity(data, type=measure)
+		balance <- merge(balance,m_b, by=0, all=TRUE)
+		rownames(balance) <- balance$Row.names; balance$Row.names <- NULL
+	}
+
+	return(balance)
+}
+
+
+
+#' @title A procedure to plot the matrix of data as a pheatmap
+#' @description It takes a matrix of data and plots a pheatmap of that matrix
+#' @param data A matrix of data to be ploted
+#' @param fontsize The size of the font used in the plot.
+#' @param fontsize_row The font size of the labels in the rows 
+#' @examples 
+#' pheatmap(data)
+plot.pheatmap <- function(data,title=NULL, fontsize=14, fontsize_row=7, color = c('blue','yellow'))
+{
+	pheatmap(data, cluster_rows=FALSE, cluster_cols=FALSE,main=paste(title),show_colnames=FALSE,legend=FALSE,fontsize = fontsize,fontsize_row=7, color=color)
+
+}
diff --git a/diveR.Rproj b/diveR.Rproj
@@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: No
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/inst/extdata/PantheonEdges.csv b/inst/extdata/PantheonEdges.csv
@@ -0,0 +1 @@
+Country,Occupation,ExportVietnam,Politician,6New Zealand,Politician,2Latvia,Politician,8Uruguay,Politician,5Chile,Politician,10South Africa,Politician,9Saudi Arabia,Politician,17Portugal,Politician,36China,Politician,38Canada,Politician,10South Africa,Religious Figure,1Saudi Arabia,Religious Figure,11Portugal,Religious Figure,2China,Religious Figure,2Portugal,Nobleman,2China,Military Person,4Latvia,Philosopher,1Saudi Arabia,Philosopher,1China,Philosopher,10Canada,Philosopher,1New Zealand,Writer,1Latvia,Writer,2Uruguay,Writer,3Chile,Writer,5South Africa,Writer,4Portugal,Writer,4China,Writer,12Canada,Writer,7China,Historian,1Vietnam,Journalist,1South Africa,Journalist,1Latvia,Soccer Player,1Uruguay,Soccer Player,10Chile,Soccer Player,5South Africa,Soccer Player,4Portugal,Soccer Player,26Canada,Soccer Player,1Uruguay,Referee,1Chile,Coach,1Portugal,Coach,2Chile,Tennis Player,3China,Tennis Player,1Canada,Tennis Player,1South Africa,Basketball Player,1China,Basketball Player,1Canada,Hocker Player,1New Zealand,Racecar Driver,2South Africa,Racecar Driver,1Canada,Racecar Driver,2China,Martial Arts,1New Zealand,Mountaineer,1Canada,Wrestler,4South Africa,Athlete,1Canada,Athlete,1Latvia,Chess Master,1Chile,Biologist,1South Africa,Biologist,2Canada,Biologist,1New Zealand,Chemist,2Latvia,Chemist,1China,Chemist,1Canada,Chemist,4New Zealand,Physicist,1South Africa,Physicist,1China,Physicist,4Canada,Physicist,3Saudi Arabia,Astronomer,1South Africa,Physician,1Saudi Arabia,Physician,1Portugal,Physician,2Canada,Physician,3Canada,Phychologist,2Canada,Economist,4China,Political Scientist,1China,Engineer,1China,Mathematician,1Canada,Computer Scientist,1China,Inventor,2Canada,Inventor,1New Zealand,Actor,3South Africa,Actor,2China,Actor,5Canada,Actor,37China,Musician,1Canada,Musician,8New Zealand,Singer,1Chile,Singer,1South Africa,Singer,1Portugal,Singer,2Canada,Singer,15Canada,Composer,2Latvia,Sculptor,1Latvia,Artist,1China,Artist,1Portugal,Architect,1China,Architect,1Canada,Architect,1Canada,Comic Artist,1New Zealand,Film Director,1Latvia,Film Director,1China,Film Director,3Canada,Film Director,2Latvia,Dancer,1South Africa,Social Activist,3China,Social Activist,4Vietnam,Model,2South Africa,Model,1Canada,Model,1Canada,Magician,1Vietnam,Presenter,2Canada,Pornographic Actor,2South Africa,Businessperson,1China,Businessperson,1Portugal,Explorer,6China,Explorer,2China,Astronaut,1Saudi Arabia,Companion,3Portugal,Companion,1Saudi Arabia,Extremist,1

diff --git a/inst/extdata/PantheonMatrix.csv b/inst/extdata/PantheonMatrix.csv
@@ -0,0 +1 @@
+Country,Politician,Religious Figure,Nobleman,Military Person,Philosopher,Writer,Historian,Journalist,Soccer Player,Referee,Coach,Tennis Player,Basketball Player,Hocker Player,Racecar Driver,Martial Arts,Mountaineer,Wrestler,Athlete,Chess Master,Biologist,Chemist,Physicist,Astronomer,Physician,Phychologist,Economist,Political Scientist,Engineer,Mathematician,Computer Scientist,Inventor,Actor,Musician,Singer,Composer,Sculptor,Artist,Architect,Comic Artist,Film Director,Dancer,Social Activist,Model,Magician,Presenter,Pornographic Actor,Businessperson,Explorer,Astronaut,Companion,ExtremistVietnam,6,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,2,,,,,,New Zealand,2,,,,,1,,,,,,,,,2,,1,,,,,2,1,,,,,,,,,,3,,1,,,,,,1,,,,,,,,,,,Latvia,8,,,,1,2,,,1,,,,,,,,,,,1,,1,,,,,,,,,,,,,,,1,1,,,1,1,,,,,,,,,,Uruguay,5,,,,,3,,,10,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Chile,10,,,,,5,,,5,,1,3,,,,,,,,,1,,,,,,,,,,,,,,1,,,,,,,,,,,,,,,,,South Africa,9,1,,,,4,,1,4,,,,1,,1,,,,1,,2,,1,,1,,,,,,,,2,,1,,,,,,,,3,1,,,,1,,,,Saudi Arabia,17,11,,,1,,,,,,,,,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,3,1Portugal,36,2,2,,,4,,,26,,2,,,,,,,,,,,,,,2,,,,,,,,,,2,,,,1,,,,,,,,,,6,,1,China,38,2,,4,10,12,1,,,,,1,1,,,1,,,,,,1,4,,,,,1,1,1,,2,5,1,,,,1,1,,3,,4,,,,,1,2,1,,Canada,10,,,,1,7,,,1,,,1,,1,2,,,4,1,,1,4,3,,3,2,4,,,,1,1,37,8,15,2,,,1,1,2,,,1,1,,2,,,,,

diff --git a/man/diversity.Rd b/man/diversity.Rd
@@ -4,8 +4,8 @@
 \alias{diversity}
 \title{Diversity measures}
 \usage{
-diversity(data, type = "all", method = NULL, agg_type = NULL, q = 0,
-  alpha = 1, beta = 1)
+diversity(data, type = "all", method = "euclidean", agg_type = NULL,
+  q = 0, alpha = 1, beta = 1)
 }
 \arguments{
 \item{data}{A numeric matrix or data frame with objects as rows and categories as columns}

diff --git a/man/get_data.Rd b/man/get_data.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/diversity.R
+\name{get_data}
+\alias{get_data}
+\title{Get Data}
+\usage{
+get_data(data, agg_type)
+}
+\arguments{
+\item{data}{Data to be processed as dataframe or as matrix.}
+
+\item{data_agg}{Diferent of NULL if column analysis is needed.}
+}
+\description{
+It takes data as dataframe (edges) or as matrix (table) to be exported in proper form to be used by the diversity function.
+}
+\examples{
+X <- get_data(data=d, agg_type=NULL)
+}
+