diff --git a/README.md b/README.md
index b007ce38..f2303695 100644
--- a/README.md
+++ b/README.md
@@ -157,4 +157,4 @@ Mobius is licensed under the MIT license. See [LICENSE](LICENSE) file for full l
 * tweet [@MobiusForSpark](http://twitter.com/MobiusForSpark)
 
 ## Code of Conduct
-This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
\ No newline at end of file
diff --git a/appveyor.yml b/appveyor.yml
index b7a50cef..4eb27748 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 2.0.2-SNAPSHOT.{build}
+version: 2.3.1-SNAPSHOT.{build}
 
 environment:
   securefile:
diff --git a/build/Build.cmd b/build/Build.cmd
index 05239aca..485aa3b0 100644
--- a/build/Build.cmd
+++ b/build/Build.cmd
@@ -6,6 +6,8 @@
 rem Copyright (c) Microsoft. All rights reserved.
 rem Licensed under the MIT license. See LICENSE file in the project root for full license information.
 rem
+SET MAVEN_OPTS=-Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2
+
 if "%1" == "csharp" set buildCSharp=true
 
 SET CMDHOME=%~dp0
diff --git a/build/localmode/RunSamples.cmd b/build/localmode/RunSamples.cmd
index b9690e3f..57872f73 100644
--- a/build/localmode/RunSamples.cmd
+++ b/build/localmode/RunSamples.cmd
@@ -47,7 +47,7 @@ if "%precheck%" == "bad" (goto :EOF)
 @rem
 @rem setup Hadoop and Spark versions
 @rem
-set SPARK_VERSION=2.0.2
+set SPARK_VERSION=2.3.1
 set HADOOP_VERSION=2.6
 set APACHE_DIST_SERVER=archive.apache.org
 @echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%, APACHE_DIST_SERVER=%APACHE_DIST_SERVER%
@@ -100,7 +100,7 @@ if "!USER_EXE!"=="" (
     call sparkclr-submit.cmd --conf spark.sql.warehouse.dir=%TEMP_DIR% %*
 )
 
-@if ERRORLEVEL 1 GOTO :ErrorStop
+@if ERRORLEVEL 2 GOTO :ErrorStop
 
 @GOTO :EOF
diff --git a/build/localmode/downloadtools.ps1 b/build/localmode/downloadtools.ps1
index c42ab8ae..512a23f3 100644
--- a/build/localmode/downloadtools.ps1
+++ b/build/localmode/downloadtools.ps1
@@ -20,7 +20,7 @@ if ($stage.ToLower() -eq "run")
     $hadoopVersion = if ($envValue -eq $null) { "2.6" } else { $envValue }
 
     $envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION")
-    $sparkVersion = if ($envValue -eq $null) { "2.0.2" } else { $envValue }
+    $sparkVersion = if ($envValue -eq $null) { "2.3.1" } else { $envValue }
 
     Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion, apacheDistServer=$apacheDistServer"
 }
diff --git a/build/localmode/run-samples.sh b/build/localmode/run-samples.sh
index 685507d3..24d4f3db 100755
--- a/build/localmode/run-samples.sh
+++ b/build/localmode/run-samples.sh
@@ -16,7 +16,7 @@ do
 done
 
 # setup Hadoop and Spark versions
-export SPARK_VERSION=2.0.2
+export SPARK_VERSION=2.3.1
 export HADOOP_VERSION=2.6
 export APACHE_DIST_SERVER=archive.apache.org
 echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION, APACHE_DIST_SERVER=$APACHE_DIST_SERVER"
diff --git a/cpp/Riosock/Riosock.vcxproj b/cpp/Riosock/Riosock.vcxproj
index d61d067c..95b642db 100644 --- a/cpp/Riosock/Riosock.vcxproj +++ b/cpp/Riosock/Riosock.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -20,13 +20,13 @@ DynamicLibrary true - v120 + v140 Unicode DynamicLibrary false - v120 + v140 true Unicode diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj index d887daf8..72341a3f 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj @@ -35,16 +35,17 @@ prompt 4 ..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML + true - - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + + ..\..\packages\log4net.2.0.8\lib\net45-full\log4net.dll - - ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll @@ -98,6 +99,7 @@ + @@ -184,6 +186,7 @@ + diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs index b8b078c2..51250de6 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Core { @@ -11,6 +12,6 @@ namespace Microsoft.Spark.CSharp.Core /// interface IRDDCollector { - IEnumerable Collect(int port, SerializedMode serializedMode, Type type); + IEnumerable Collect(SocketInfo info, SerializedMode serializedMode, Type type); } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs index bdfbd981..9dfd1198 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Services; @@ -60,6 +61,7 @@ public SparkContext SparkContext { return sparkContext; } + set { sparkContext = value; } } /// @@ -592,13 +594,13 @@ public void ForeachPartition(Action> f) /// public T[] Collect() { - int port = RddProxy.CollectAndServe(); - return Collect(port).Cast().ToArray(); + var info = RddProxy.CollectAndServe(); + return Collect(info).Cast().ToArray(); } - internal IEnumerable Collect(int port) + internal IEnumerable Collect(SocketInfo info) { - return RddProxy.RDDCollector.Collect(port, serializedMode, typeof(T)); + return RddProxy.RDDCollector.Collect(info, serializedMode, typeof(T)); } /// @@ -830,9 +832,9 @@ public T[] Take(int num) var mappedRDD = MapPartitionsWithIndex(new TakeHelper(left).Execute); - int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, partitions); + var info = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, partitions); - IEnumerable res = Collect(port).Cast(); + IEnumerable res = Collect(info).Cast(); items.AddRange(res); partsScanned += numPartsToTry; @@ -925,7 +927,7 @@ public RDD Subtract(RDD other, int numPartitions = 0) /// public RDD Repartition(int numPartitions) { - return new RDD(RddProxy.Repartition(numPartitions), sparkContext); + return new RDD(RddProxy.Repartition(numPartitions), sparkContext, serializedMode); } /// @@ -942,8 +944,8 @@ public RDD Repartition(int numPartitions) /// public RDD Coalesce(int numPartitions, bool 
shuffle = false) { - return new RDD(RddProxy.Coalesce(numPartitions, shuffle), sparkContext); - } + return new RDD(RddProxy.Coalesce(numPartitions, shuffle), sparkContext, serializedMode); + } /// /// Zips this RDD with another one, returning key-value pairs with the @@ -1065,8 +1067,8 @@ public IEnumerable ToLocalIterator() foreach (int partition in Enumerable.Range(0, GetNumPartitions())) { var mappedRDD = MapPartitionsWithIndex((pid, iter) => iter); - int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, Enumerable.Range(partition, 1)); - foreach (T row in Collect(port)) + var info = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, Enumerable.Range(partition, 1)); + foreach (T row in Collect(info)) yield return row; } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs index 6d92ad29..05963958 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs @@ -11,6 +11,7 @@ using System.Text; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Network; +using Microsoft.Spark.CSharp.Services; using Microsoft.Spark.CSharp.Sql; namespace Microsoft.Spark.CSharp.Core @@ -20,14 +21,31 @@ namespace Microsoft.Spark.CSharp.Core /// class RDDCollector : IRDDCollector { - public IEnumerable Collect(int port, SerializedMode serializedMode, Type type) + private static ILoggerService logger; + private static ILoggerService Logger + { + get + { + if (logger != null) return logger; + logger = LoggerServiceFactory.GetLogger(typeof(RDDCollector)); + return logger; + } + } + + public IEnumerable Collect(SocketInfo info, SerializedMode serializedMode, Type type) { IFormatter formatter = new BinaryFormatter(); var sock = SocketFactory.CreateSocket(); - sock.Connect(IPAddress.Loopback, port); + sock.Connect(IPAddress.Loopback, info.Port, null); using (var s = sock.GetStream()) { + if (info.Secret != null) + { + SerDe.Write(s, info.Secret); + var reply = SerDe.ReadString(s); + Logger.LogDebug("Connect back to JVM: " + reply); + } byte[] buffer; while ((buffer = SerDe.ReadBytes(s)) != null && buffer.Length > 0) { diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs index a3e6cd96..366ed966 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs @@ -36,7 +36,7 @@ private ISocketWrapper GetConnection() if (!sockets.TryDequeue(out socket)) { socket = SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, portNumber); + socket.Connect(IPAddress.Loopback, portNumber, null); } return socket; } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs index 75c27e22..12cdd934 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs @@ -12,12 +12,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc /// Reference to object created in JVM /// [Serializable] - internal class JvmObjectReference + public class JvmObjectReference { public string Id { get; private set; } private DateTime creationTime; - public JvmObjectReference(string jvmReferenceId) + internal JvmObjectReference(string jvmReferenceId) { Id = jvmReferenceId; creationTime = 
DateTime.UtcNow; @@ -48,6 +48,11 @@ public override int GetHashCode() return base.GetHashCode(); } + public string ObjectToString() + { + return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "toString").ToString(); + } + public string GetDebugInfo() { var javaObjectReferenceForClassObject = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "getClass").ToString()); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs index bee4625f..befa7ee5 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs @@ -31,7 +31,9 @@ internal static ISparkCLRProxy SparkCLRProxy } } - internal static IConfigurationService configurationService; + internal static IJvmBridge JvmBridge => SparkCLRIpcProxy.JvmBridge; + + internal static IConfigurationService configurationService; internal static IConfigurationService ConfigurationService { diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs index 90a11796..57886d51 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.CSharp.Network /// ByteBuf delimits a section of a ByteBufChunk. /// It is the smallest unit to be allocated. /// - internal class ByteBuf + public class ByteBuf { private int readerIndex; private int writerIndex; diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs index 3db32f5c..8c96fcca 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs @@ -2,182 +2,203 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.Collections.Generic; using System.IO; using System.Net; using System.Net.Sockets; +using System.Text; +using System.Threading; using Microsoft.Spark.CSharp.Configuration; using Microsoft.Spark.CSharp.Services; namespace Microsoft.Spark.CSharp.Network { - /// - /// A simple wrapper of System.Net.Sockets.Socket class. - /// - internal class DefaultSocketWrapper : ISocketWrapper - { - private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DefaultSocketWrapper)); - private readonly Socket innerSocket; - - /// - /// Default constructor that creates a new instance of DefaultSocket class which represents - /// a traditional socket (System.Net.Socket.Socket). - /// - /// This socket is bound to Loopback with port 0. - /// - public DefaultSocketWrapper() - { - innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); - var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0); - innerSocket.Bind(localEndPoint); - } - - /// - /// Initializes a instance of DefaultSocket class using the specified System.Net.Socket.Socket object. - /// - /// The existing socket - private DefaultSocketWrapper(Socket socket) - { - innerSocket = socket; - } - - /// - /// Accepts a incoming connection request. 
- /// - /// A DefaultSocket instance used to send and receive data - public ISocketWrapper Accept() - { - var socket = innerSocket.Accept(); - return new DefaultSocketWrapper(socket); - } - - /// - /// Close the socket connections and releases all associated resources. - /// - public void Close() - { - innerSocket.Close(); - } - - /// - /// Establishes a connection to a remote host that is specified by an IP address and a port number - /// - /// The IP address of the remote host - /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) - { - var remoteEndPoint = new IPEndPoint(remoteaddr, port); - innerSocket.Connect(remoteEndPoint); - } - - /// - /// Returns the NetworkStream used to send and receive data. - /// - /// The underlying Stream instance that be used to send and receive data - /// - /// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose - /// the NetworkStream by yourself. Closing DefaultSocketWrapper does not release the NetworkStream - /// - public Stream GetStream() - { - return new NetworkStream(innerSocket); - } - - /// - /// Returns a stream used to receive data only. - /// - /// The underlying Stream instance that be used to receive data - public Stream GetInputStream() - { - // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. - var readBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerReadBufferSizeEnvName) ?? "65536"); - logger.LogDebug("Input stream buffer size: [{0}]", readBufferSize); - return readBufferSize > 0 ? new BufferedStream(GetStream(), readBufferSize) : GetStream(); - } - - /// - /// Returns a stream used to send data only. - /// - /// The underlying Stream instance that be used to send data - public Stream GetOutputStream() - { - // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. - var writeBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerWriteBufferSizeEnvName) ?? "65536"); - logger.LogDebug("Output stream buffer size: [{0}]", writeBufferSize); - return writeBufferSize > 0 ? new BufferedStream(GetStream(), writeBufferSize) : GetStream(); - } - - /// - /// Starts listening for incoming connections requests - /// - /// The maximum length of the pending connections queue. - public void Listen(int backlog = 16) - { - innerSocket.Listen(backlog); - } - - /// - /// Receives network data from this socket, and returns a ByteBuf that contains the received data. - /// - /// The DefaultSocketWrapper does not support this function. - /// - /// A ByteBuf object that contains received data. - public ByteBuf Receive() - { - throw new NotImplementedException(); - } - - /// - /// Sends data to this socket with a ByteBuf object that contains data to be sent. - /// - /// The DefaultSocketWrapper does not support this function. - /// - /// A ByteBuf object that contains data to be sent - public void Send(ByteBuf data) - { - throw new NotImplementedException(); - } - - /// - /// Disposes the resources used by this instance of the DefaultSocket class. - /// - /// - protected virtual void Dispose(bool disposing) - { - if (disposing) - { - innerSocket.Dispose(); - } - } - - /// - /// Releases all resources used by the current instance of the DefaultSocket class. 
- /// - public void Dispose() - { - Dispose(true); - } - - /// - /// Frees resources used by DefaultSocket class - /// - ~DefaultSocketWrapper() - { - Dispose(false); - } - - /// - /// Indicates whether there are data that has been received from the network and is available to be read. - /// - public bool HasData { get { return innerSocket.Available > 0; } } - - /// - /// Returns the local endpoint. - /// - public EndPoint LocalEndPoint { get { return innerSocket.LocalEndPoint; } } - - /// - /// Returns the remote endpoint if it has one. - /// - public EndPoint RemoteEndPoint { get { return innerSocket.RemoteEndPoint; } } - } + /// + /// A simple wrapper of System.Net.Sockets.Socket class. + /// + internal class DefaultSocketWrapper : ISocketWrapper + { + private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DefaultSocketWrapper)); + private readonly Socket innerSocket; + + /// + /// Default constructor that creates a new instance of DefaultSocket class which represents + /// a traditional socket (System.Net.Socket.Socket). + /// + /// This socket is bound to Loopback with port 0. + /// + public DefaultSocketWrapper() + { + innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0); + innerSocket.Bind(localEndPoint); + } + + /// + /// Initializes a instance of DefaultSocket class using the specified System.Net.Socket.Socket object. + /// + /// The existing socket + private DefaultSocketWrapper(Socket socket) + { + innerSocket = socket; + } + + /// + /// Accepts a incoming connection request. + /// + /// A DefaultSocket instance used to send and receive data + public ISocketWrapper Accept() + { + var socket = innerSocket.Accept(); + return new DefaultSocketWrapper(socket); + } + + /// + /// Close the socket connections and releases all associated resources. + /// + public void Close() + { + innerSocket.Close(); + } + + /// + /// Establishes a connection to a remote host that is specified by an IP address and a port number + /// + /// The IP address of the remote host + /// The port number of the remote host + public void Connect(IPAddress remoteaddr, int port, string secret) + { + var remoteEndPoint = new IPEndPoint(remoteaddr, port); + innerSocket.Connect(remoteEndPoint); + } + + private static byte[] ReceiveAll(Socket socket, int len) + { + var buffer = new List(); + + while (socket.Available > 0 && buffer.Count < len) + { + var currByte = new Byte[1]; + var byteCounter = socket.Receive(currByte, currByte.Length, SocketFlags.None); + + if (byteCounter.Equals(1)) + { + buffer.Add(currByte[0]); + } + } + + return buffer.ToArray(); + } + + /// + /// Returns the NetworkStream used to send and receive data. + /// + /// The underlying Stream instance that be used to send and receive data + /// + /// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose + /// the NetworkStream by yourself. Closing DefaultSocketWrapper does not release the NetworkStream + /// + public Stream GetStream() + { + return new NetworkStream(innerSocket); + } + + /// + /// Returns a stream used to receive data only. + /// + /// The underlying Stream instance that be used to receive data + public Stream GetInputStream() + { + // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. + var readBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerReadBufferSizeEnvName) ?? 
"65536"); + logger.LogDebug("Input stream buffer size: [{0}]", readBufferSize); + return readBufferSize > 0 ? new BufferedStream(GetStream(), readBufferSize) : GetStream(); + } + + /// + /// Returns a stream used to send data only. + /// + /// The underlying Stream instance that be used to send data + public Stream GetOutputStream() + { + // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. + var writeBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerWriteBufferSizeEnvName) ?? "65536"); + logger.LogDebug("Output stream buffer size: [{0}]", writeBufferSize); + return writeBufferSize > 0 ? new BufferedStream(GetStream(), writeBufferSize) : GetStream(); + } + + /// + /// Starts listening for incoming connections requests + /// + /// The maximum length of the pending connections queue. + public void Listen(int backlog = 16) + { + innerSocket.Listen(backlog); + } + + /// + /// Receives network data from this socket, and returns a ByteBuf that contains the received data. + /// + /// The DefaultSocketWrapper does not support this function. + /// + /// A ByteBuf object that contains received data. + public ByteBuf Receive() + { + throw new NotImplementedException(); + } + + /// + /// Sends data to this socket with a ByteBuf object that contains data to be sent. + /// + /// The DefaultSocketWrapper does not support this function. + /// + /// A ByteBuf object that contains data to be sent + public void Send(ByteBuf data) + { + throw new NotImplementedException(); + } + + /// + /// Disposes the resources used by this instance of the DefaultSocket class. + /// + /// + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + innerSocket.Dispose(); + } + } + + /// + /// Releases all resources used by the current instance of the DefaultSocket class. + /// + public void Dispose() + { + Dispose(true); + } + + /// + /// Frees resources used by DefaultSocket class + /// + ~DefaultSocketWrapper() + { + Dispose(false); + } + + /// + /// Indicates whether there are data that has been received from the network and is available to be read. + /// + public bool HasData { get { return innerSocket.Available > 0; } } + + /// + /// Returns the local endpoint. + /// + public EndPoint LocalEndPoint { get { return innerSocket.LocalEndPoint; } } + + /// + /// Returns the remote endpoint if it has one. + /// + public EndPoint RemoteEndPoint { get { return innerSocket.RemoteEndPoint; } } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs index b08dcd6f..45b61d26 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.CSharp.Network /// ISocketWrapper interface defines the common methods to operate a socket (traditional socket or /// Windows Registered IO socket) /// - internal interface ISocketWrapper : IDisposable + public interface ISocketWrapper : IDisposable { /// /// Accepts a incoming connection request. 
@@ -24,12 +24,13 @@ internal interface ISocketWrapper : IDisposable /// void Close(); - /// - /// Establishes a connection to a remote host that is specified by an IP address and a port number - /// - /// The IP address of the remote host - /// The port number of the remote host - void Connect(IPAddress remoteaddr, int port); + /// + /// Establishes a connection to a remote host that is specified by an IP address and a port number + /// + /// The IP address of the remote host + /// The port number of the remote host + /// The secret to connect, can be null + void Connect(IPAddress remoteaddr, int port, string secret); /// /// Returns a stream used to send and receive data. diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs index 740787f2..54e73ed0 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs @@ -151,7 +151,7 @@ public void Close() /// /// The IP address of the remote host /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) + public void Connect(IPAddress remoteaddr, int port, string secret) { EnsureAccessible(); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs index cb8ed0fe..505bf96d 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs @@ -111,7 +111,7 @@ public void Close() /// /// The IP address of the remote host /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) + public void Connect(IPAddress remoteaddr, int port, string secret) { var remoteEndPoint = new IPEndPoint(remoteaddr, port); innerSocket.Connect(remoteEndPoint); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs new file mode 100644 index 00000000..d14e5cc7 --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; + +namespace Microsoft.Spark.CSharp.Network +{ + public class SocketInfo + { + public readonly int Port; + public readonly string Secret; + + public SocketInfo(int port, string secret) + { + Port = port; + Secret = secret; + } + + public static SocketInfo Parse(object o) + { + var oo = o as List; + if (oo == null) throw new Exception(o.ToString() + " is not socket info "+typeof(List)+" "+o.GetType()); + return new SocketInfo(int.Parse(oo[0].ObjectToString()), oo[1].ObjectToString()); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs index 99285237..87071d9c 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs @@ -13,7 +13,7 @@ internal interface IDataFrameProxy IRDDProxy JavaToCSharp(); string GetQueryExecution(); string GetExecutedPlan(); - string GetShowString(int numberOfRows, bool truncate); + string GetShowString(int numberOfRows, int truncate, bool vertical); bool IsLocal(); IStructTypeProxy GetSchema(); IRDDProxy ToJSON(); @@ -59,7 +59,9 @@ internal interface 
IDataFrameProxy IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns); IDataFrameProxy Repartition(IColumnProxy[] columns); IDataFrameProxy Sample(bool withReplacement, double fraction, long seed); - IDataFrameWriterProxy Write(); + IDataFrameProxy Broadcast(); + + IDataFrameWriterProxy Write(); } internal interface IUDFProxy diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs index e323cf47..24788c07 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs @@ -7,6 +7,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy { @@ -41,6 +42,6 @@ internal interface IRDDProxy void SaveAsSequenceFile(string path, string compressionCodecClass); void SaveAsTextFile(string path, string compressionCodecClass); long Count(); - int CollectAndServe(); + SocketInfo CollectAndServe(); } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs index f1a00acb..a53fdab7 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs @@ -8,6 +8,7 @@ using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy @@ -50,7 +51,7 @@ internal interface ISparkContextProxy void CancelJobGroup(string groupId); void CancelAllJobs(); IStatusTrackerProxy StatusTracker { get; } - int RunJob(IRDDProxy rdd, IEnumerable partitions); + SocketInfo RunJob(IRDDProxy rdd, IEnumerable partitions); IBroadcastProxy ReadBroadcastFromFile(string path, out long broadcastId); IRDDProxy CreateCSharpRdd(IRDDProxy prefvJavaRddReference, byte[] command, Dictionary environmentVariables, List pythonIncludes, bool preservePartitioning, List broadcastVariables, List accumulator); IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs index 177d33c5..85c1210c 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs @@ -79,12 +79,12 @@ public string GetExecutedPlan() return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(executedPlanReference, "toString", new object[] { }).ToString(); } - public string GetShowString(int numberOfRows, bool truncate) + public string GetShowString(int numberOfRows, int truncate, bool vertical) { return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod( jvmDataFrameReference, "showString", - new object[] { numberOfRows, truncate }).ToString(); + new object[] { numberOfRows, truncate, vertical}).ToString(); } public bool IsLocal() @@ -575,7 +575,16 @@ public IDataFrameProxy Sample(bool withReplacement, double fraction, long seed) new object[] { withReplacement, fraction, seed }).ToString()), sqlContextProxy); } - public IDataFrameWriterProxy Write() + public IDataFrameProxy Broadcast() + { + return + new DataFrameIpcProxy( + new JvmObjectReference( + SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", "broadcast", 
+ new object[] { jvmDataFrameReference }).ToString()), sqlContextProxy); + } + + public IDataFrameWriterProxy Write() { return new DataFrameWriterIpcProxy(new JvmObjectReference( SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDataFrameReference, "write").ToString())); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs index 9377c079..3ef65772 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs @@ -12,6 +12,7 @@ using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy.Ipc { @@ -66,10 +67,10 @@ public long Count() return long.Parse(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(rdd, "count").ToString()); } - public int CollectAndServe() + public SocketInfo CollectAndServe() { var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd")); - return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "collectAndServe", new object[] { rdd }).ToString()); + return SocketInfo.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "collectAndServe", new object[] { rdd })); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs index 01290fd2..f48aa52e 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs @@ -11,6 +11,7 @@ using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Proxy.Ipc; namespace Microsoft.Spark.CSharp.Proxy.Ipc @@ -134,10 +135,8 @@ public IHadoopConfigurationProxy HadoopConfiguration public void Accumulator(int port) { - jvmAccumulatorReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "accumulator", - SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList"), - SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonAccumulatorParam", IPAddress.Loopback.ToString(), port) - )); + jvmAccumulatorReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonAccumulatorV2", IPAddress.Loopback.ToString(), port); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkContextReference, "register", new object[] { jvmAccumulatorReference }); } public void Stop() @@ -241,7 +240,7 @@ public void SetCheckpointDir(string directory) public void SetJobGroup(string groupId, string description, bool interruptOnCancel) { - SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "setCheckpointDir", new object[] { groupId, description, interruptOnCancel }); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "setJobGroup", new object[] { groupId, description, interruptOnCancel }); } public void SetLocalProperty(string key, string value) @@ -344,10 +343,10 @@ public IUDFProxy CreateUserDefinedCSharpFunction(string name, byte[] command, st } - public int RunJob(IRDDProxy rdd, IEnumerable partitions) + public SocketInfo 
RunJob(IRDDProxy rdd, IEnumerable partitions) { var jpartitions = JvmBridgeUtils.GetJavaList(partitions); - return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions }).ToString()); + return SocketInfo.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions })); } public IBroadcastProxy ReadBroadcastFromFile(string path, out long broadcastId) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs index febfd3b5..bc6e5a19 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs @@ -27,7 +27,9 @@ public IUdfRegistrationProxy Udf } } - public ISqlContextProxy SqlContextProxy + internal JvmObjectReference JvmReference => jvmSparkSessionReference; + + public ISqlContextProxy SqlContextProxy { get { return sqlContextProxy; } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs index 4bb930fe..d6f00984 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs @@ -106,7 +106,7 @@ public void RegisterFunction(string name, byte[] command, string returnType) var udf = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.sql.execution.python.UserDefinedPythonFunction", new object[] { - name, function, dt + name, function, dt, 100 /*BatchUDF*/, true /*deterministic*/ }); SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(judf, "registerPython", new object[] { name, udf }); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs index 66601ca2..b288baa9 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs @@ -6,7 +6,9 @@ using System.Globalization; using System.Linq; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Proxy.Ipc; using Microsoft.Spark.CSharp.Services; namespace Microsoft.Spark.CSharp.Sql @@ -66,10 +68,12 @@ private IRDDProxy RddProxy } } - /// - /// Returns true if the collect and take methods can be run locally (without any Spark executors). - /// - public bool IsLocal + internal JvmObjectReference JvmReference => (dataFrameProxy as DataFrameIpcProxy)?.JvmDataFrameReference; + + /// + /// Returns true if the collect and take methods can be run locally (without any Spark executors). + /// + public bool IsLocal { get { @@ -145,10 +149,11 @@ public long Count() /// /// Number of rows to display - default 20 /// Indicates if strings more than 20 characters long will be truncated - public void Show(int numberOfRows = 20, bool truncate = true) + /// If set to True, print output rows vertically (one line per column value). 
+ public void Show(int numberOfRows = 20, int truncate = 20, bool vertical = false) { logger.LogInfo("Writing {0} rows in the DataFrame to Console output", numberOfRows); - Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate)); + Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate, vertical)); } /// @@ -166,8 +171,8 @@ public void ShowSchema() /// public IEnumerable Collect() { - int port = RddProxy.CollectAndServe(); - return Rdd.Collect(port).Cast(); + var info = RddProxy.CollectAndServe(); + return Rdd.Collect(info).Cast(); } //TODO - add this method if needed to convert Row to collection of T @@ -917,10 +922,11 @@ public DataFrame Coalesce(int numPartitions) /// /// Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`) /// + /// Persist storage type // Python API: https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py persist(self, storageLevel) - public DataFrame Persist() + public DataFrame Persist(StorageLevelType type= StorageLevelType.MEMORY_AND_DISK) { - dataFrameProxy.Persist(StorageLevelType.MEMORY_AND_DISK); + dataFrameProxy.Persist(type); return this; } @@ -944,6 +950,11 @@ public DataFrame Cache() return Persist(); } + public DataFrame Broadcast() + { + return new DataFrame(dataFrameProxy.Broadcast(), sparkContext); + } + /// /// Returns a new DataFrame that has exactly `numPartitions` partitions. /// diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs index 04fcc90c..c27700e2 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs @@ -159,5 +159,18 @@ public DataFrame Parquet(params string[] path) logger.LogInfo("Constructing DataFrame using Parquet source {0}", string.Join(";", path)); return new DataFrame(dataFrameReaderProxy.Parquet(path), sparkContext); } - } + + /// + /// Loads a AVRO file (one object per line) and returns the result as a DataFrame. + /// + /// This function goes through the input once to determine the input schema. If you know the + /// schema in advance, use the version that specifies the schema to avoid the extra scan. + /// + /// input path + public DataFrame Avro(string path) + { + logger.LogInfo("Constructing DataFrame using AVRO source {0}", path); + return Format("com.databricks.spark.avro").Load(path); + } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs index a16478dd..9fa9fdb0 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs @@ -170,5 +170,16 @@ public void Parquet(string path) { Format("parquet").Save(path); } - } + + /// + /// Saves the content of the DataFrame in AVRO format at the specified path. 
+ /// This is equivalent to: + /// Format("com.databricks.spark.avro").Save(path) + /// + public void Avro(string path) + { + Format("com.databricks.spark.avro").Save(path); + } + + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs index b3a81cf0..bc89168c 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs @@ -92,7 +92,8 @@ public IEnumerable Columns() /// /// Number of rows - default is 20 /// Indicates if rows with more than 20 characters to be truncated - public void Show(int numberOfRows = 20, bool truncate = true) + /// If set to true, prints output rows vertically (one line per column value). + public void Show(int numberOfRows = 20, int truncate = 20, bool vertical = false) { ToDF().Show(numberOfRows, truncate); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs index c9166fe0..a23d91a0 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; +using System.Runtime.Serialization; using System.Text; using System.Threading.Tasks; @@ -1119,5 +1121,42 @@ internal IEnumerable Execute(int pid, IEnumerable input) return input.Select(a => func((A1)(a[0]), (A2)(a[1]), (A3)(a[2]), (A4)(a[3]), (A5)(a[4]), (A6)(a[5]), (A7)(a[6]), (A8)(a[7]), (A9)(a[8]), (A10)(a[9]))).Cast(); } } - #endregion + + [Serializable] + internal class UdfReflectionHelper + { + private readonly MethodInfo func; + + [NonSerialized] + private object[] _cache; + + internal UdfReflectionHelper(MethodInfo f) + { + func = f; + _cache = new object[func.GetParameters().Length]; + } + + public Type ReturnType => func.ReturnType; + + [OnDeserialized()] + public void Init(StreamingContext context) + { + _cache = new object[func.GetParameters().Length]; + } + + internal IEnumerable Execute(int pid, IEnumerable input) + { + return input.Select(Run).Cast(); + } + + private dynamic Run(dynamic input) + { + for (int i = 0; i < _cache.Length; ++i) + { + _cache[i] = input[i]; + } + return func.Invoke(null, _cache); + } + } + #endregion } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs index 77614a71..a299d1a8 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs @@ -18,17 +18,24 @@ public abstract class Row [NonSerialized] private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(Row)); - /// - /// Number of elements in the Row. - /// - /// elements count in this row - public abstract int Size(); + public abstract dynamic[] Values { get; } + + /// + /// Number of elements in the Row. + /// + /// elements count in this row + public abstract int Size(); /// /// Schema for the row. /// public abstract StructType GetSchema(); + public virtual void ResetValues(dynamic[] values) + { + throw new NotImplementedException(); + } + /// /// Returns the value at position i. 
/// @@ -80,8 +87,22 @@ public T GetAs(string columnName) internal class RowImpl : Row { private readonly StructType schema; - public dynamic[] Values { get { return values; } } - private readonly dynamic[] values; + + public override dynamic[] Values + { + get + { + if (!valuesConverted) + { + schema.ConvertPickleObjects(rawValues,rawValues); + valuesConverted = true; + } + return rawValues; + } + } + + private dynamic[] rawValues; + private bool valuesConverted = false; private readonly int columnCount; @@ -96,11 +117,11 @@ internal RowImpl(dynamic data, StructType schema) { if (data is dynamic[]) { - values = data as dynamic[]; + rawValues = data as dynamic[]; } else if (data is List) { - values = (data as List).ToArray(); + rawValues = (data as List).ToArray(); } else { @@ -109,17 +130,25 @@ internal RowImpl(dynamic data, StructType schema) this.schema = schema; - columnCount = values.Count(); - int schemaColumnCount = this.schema.Fields.Count(); + columnCount = rawValues.Length; + int schemaColumnCount = this.schema.Fields.Count; if (columnCount != schemaColumnCount) { throw new Exception(string.Format("column count inferred from data ({0}) and schema ({1}) mismatch", columnCount, schemaColumnCount)); } - - Initialize(); } - public override int Size() + public override void ResetValues(dynamic[] values) + { + if (columnCount != values.Length) + { + throw new ArgumentException("column count inferred from data and schema mismatch"); + } + rawValues = values; + valuesConverted = false; + } + + public override int Size() { return columnCount; } @@ -131,16 +160,15 @@ public override StructType GetSchema() public override dynamic Get(int i) { + if (i >= 0 && i < columnCount) return Values[i]; if (i >= columnCount) { throw new Exception(string.Format("i ({0}) >= columnCount ({1})", i, columnCount)); } - else if(i < 0) + else { throw new Exception(string.Format("i ({0}) < 0", i)); } - - return values[i]; } public override dynamic Get(string columnName) @@ -152,7 +180,7 @@ public override dynamic Get(string columnName) public override string ToString() { List cols = new List(); - foreach (var item in values) + foreach (var item in Values) { if (item != null) { @@ -166,73 +194,7 @@ public override string ToString() return string.Format("[{0}]", string.Join(",", cols.ToArray())); } - - - private void Initialize() - { - - int index = 0; - foreach (var field in schema.Fields) - { - if (field.DataType is ArrayType) - { - Func convertArrayTypeToStructTypeFunc = (dataType, length) => - { - StructField[] fields = new StructField[length]; - for(int i = 0; i < length ; i++) - { - fields[i] = new StructField(string.Format("_array_{0}", i), dataType); - } - return new StructType(fields); - }; - var elementType = (field.DataType as ArrayType).ElementType; - - // Note: When creating object from json, PySpark converts Json array to Python List (https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/types.py, _create_cls(dataType)), - // then Pyrolite unpickler converts Python List to C# ArrayList (https://github.com/irmen/Pyrolite/blob/v4.10/README.txt). So values[index] should be of type ArrayList; - // In case Python changes its implementation, which means value is not of type ArrayList, try cast to object[] because Pyrolite unpickler convert Python Tuple to C# object[]. - object[] valueOfArray = values[index] is ArrayList ? 
(values[index] as ArrayList).ToArray() : values[index] as object[]; - if (valueOfArray == null) - { - throw new ArgumentException("Cannot parse data of ArrayType: " + field.Name); - } - - values[index] = new RowImpl(valueOfArray, elementType as StructType ?? convertArrayTypeToStructTypeFunc(elementType, valueOfArray.Length)).values; - } - else if (field.DataType is MapType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is StructType) - { - dynamic value = values[index]; - if (value != null) - { - var subRow = new RowImpl(values[index], field.DataType as StructType); - values[index] = subRow; - } - } - else if (field.DataType is DecimalType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is DateType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is StringType) - { - if (values[index] != null) values[index] = values[index].ToString(); - } - else - { - values[index] = values[index]; - } - index++; - } - } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs index 96b50c29..25726ad3 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs @@ -78,7 +78,7 @@ public Row GetRow() currentSchema = null; return row; } - + //removes objects of type RowConstructor and replacing them with actual values private object[] GetValues(object[] arguments) { @@ -86,7 +86,7 @@ private object[] GetValues(object[] arguments) int i = 0; foreach (var argument in arguments) { - if (argument != null && argument.GetType() == typeof(RowConstructor)) + if (argument is RowConstructor) { values[i++] = (argument as RowConstructor).Values; } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs index c4f72885..aa702164 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs @@ -9,7 +9,9 @@ using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Proxy.Ipc; using Microsoft.Spark.CSharp.Services; using Microsoft.Spark.CSharp.Sql.Catalog; @@ -42,10 +44,12 @@ public Catalog.Catalog Catalog get { return catalog ?? (catalog = new Catalog.Catalog(SparkSessionProxy.GetCatalog())); } } - /// - /// Interface through which the user may access the underlying SparkContext. - /// - public SparkContext SparkContext { get; private set; } + internal JvmObjectReference JvmReference => (sparkSessionProxy as SparkSessionIpcProxy)?.JvmReference; + + /// + /// Interface through which the user may access the underlying SparkContext. + /// + public SparkContext SparkContext { get; private set; } public UdfRegistration Udf { @@ -114,18 +118,30 @@ public DataFrame CreateDataFrame(RDD rdd, StructType schema) // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. 
- var rddRow = rdd.Map(r => r); + var rddRow = rdd.MapPartitions(r => r.Select(rr => rr)); rddRow.serializedMode = SerializedMode.Row; return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), SparkContext); } - /// - /// Returns the specified table as a - /// - /// - /// - public DataFrame Table(string tableName) + public DataFrame CreateDataFrame(RDD rdd, StructType schema) + { + // Note: This is for pickling RDD, convert to RDD which happens in CSharpWorker. + // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. + // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. + // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. + var rddRow = rdd.MapPartitions(rows => rows.Select(r => r.Values)); + rddRow.serializedMode = SerializedMode.Row; + + return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), SparkContext); + } + + /// + /// Returns the specified table as a + /// + /// + /// + public DataFrame Table(string tableName) { return new DataFrame(sparkSessionProxy.Table(tableName), SparkContext); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs index 03e9fb26..c99e9010 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Reflection; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Services; @@ -150,13 +151,25 @@ public DataFrame CreateDataFrame(RDD rdd, StructType schema) return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext); } - /// - /// Registers the given as a temporary table in the catalog. - /// Temporary tables exist only during the lifetime of this instance of SqlContext. - /// - /// - /// - public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName) + public DataFrame CreateDataFrame(RDD rdd, StructType schema) + { + // Note: This is for pickling RDD, convert to RDD which happens in CSharpWorker. + // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. + // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. + // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. + var rddRow = rdd.Map(r => r); + rddRow.serializedMode = SerializedMode.Row; + + return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext); + } + + /// + /// Registers the given as a temporary table in the catalog. + /// Temporary tables exist only during the lifetime of this instance of SqlContext. 
+ /// + /// + /// + public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName) { sqlContextProxy.RegisterDataFrameAsTable(dataFrame.DataFrameProxy, tableName); } @@ -527,6 +540,14 @@ public void RegisterFunction(string name, Func f) Func, IEnumerable> udfHelper = new UdfHelper(f).Execute; sqlContextProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT))); } - #endregion - } + + public void RegisterFunction(string name, MethodInfo f) + { + logger.LogInfo("Name of the function to register {0}, method info", name, f.DeclaringType?.FullName + "." + f.Name); + var helper = new UdfReflectionHelper(f); + Func, IEnumerable> udfHelper = helper.Execute; + sqlContextProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(helper.ReturnType)); + } + #endregion + } } \ No newline at end of file diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs index 2efcf209..ef945c37 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Reflection; @@ -14,512 +15,600 @@ namespace Microsoft.Spark.CSharp.Sql { - /// - /// The base type of all Spark SQL data types. - /// - [Serializable] - public abstract class DataType - { - /// - /// Trim "Type" in the end from class name, ToLower() to align with Scala. - /// - public string TypeName - { - get { return NormalizeTypeName(GetType().Name); } - } - - /// - /// return TypeName by default, subclass can override it - /// - public virtual string SimpleString - { - get { return TypeName; } - } - - /// - /// return only type: TypeName by default, subclass can override it - /// - internal virtual object JsonValue { get { return TypeName; } } - - /// - /// The compact JSON representation of this data type. - /// - public string Json - { - get - { - var jObject = JsonValue is JObject ? ((JObject)JsonValue).SortProperties() : JsonValue; - return JsonConvert.SerializeObject(jObject, Formatting.None); - } - } - - /// - /// Parses a Json string to construct a DataType. - /// - /// The Json string to be parsed - /// The new DataType instance from the Json string - public static DataType ParseDataTypeFromJson(string json) - { - return ParseDataTypeFromJson(JToken.Parse(json)); - } - - /// - /// Parse a JToken object to construct a DataType. 
- /// - /// The JToken object to be parsed - /// The new DataType instance from the Json string - /// Not implemented for "udt" type - /// - protected static DataType ParseDataTypeFromJson(JToken json) - { - if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...} - { - JToken type; - var typeJObject = (JObject)json; - if (typeJObject.TryGetValue("type", out type)) - { - Type complexType; - if ((complexType = ComplexTypes.FirstOrDefault(ct => NormalizeTypeName(ct.Name) == type.ToString())) != default(Type)) - { - return ((ComplexType)Activator.CreateInstance(complexType, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance - , null, new object[] { typeJObject }, null)); // create new instance of ComplexType - } - if (type.ToString() == "udt") - { - // TODO - throw new NotImplementedException(); - } - } - throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); - } - else // {name: age, type: bigint,...} // TODO: validate more JTokenType other than Object - { - return ParseAtomicType(json); - } - - } - - private static AtomicType ParseAtomicType(JToken type) - { - Type atomicType; - if ((atomicType = AtomicTypes.FirstOrDefault(at => NormalizeTypeName(at.Name) == type.ToString())) != default(Type)) - { - return (AtomicType)Activator.CreateInstance(atomicType); // create new instance of AtomicType - } - - Match fixedDecimal = DecimalType.FixedDecimal.Match(type.ToString()); - if (fixedDecimal.Success) - { - return new DecimalType(int.Parse(fixedDecimal.Groups[1].Value), int.Parse(fixedDecimal.Groups[2].Value)); - } - - throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); - } - - [NonSerialized] - private static readonly Type[] AtomicTypes = typeof(AtomicType).Assembly.GetTypes().Where(type => - type.IsSubclassOf(typeof(AtomicType))).ToArray(); - - [NonSerialized] - private static readonly Type[] ComplexTypes = typeof(ComplexType).Assembly.GetTypes().Where(type => - type.IsSubclassOf(typeof(ComplexType))).ToArray(); - - [NonSerialized] - private static readonly Func NormalizeTypeName = s => s.Substring(0, s.Length - 4).ToLower(); // trim "Type" at the end of type name - - - } - - /// - /// An internal type used to represent a simple type. - /// - [Serializable] - public class AtomicType : DataType - { - } - - /// - /// An internal type used to represent a complex type (such as arrays, structs, and maps). - /// - [Serializable] - public abstract class ComplexType : DataType - { - /// - /// Abstract method that constructs a complex type from a Json object - /// - /// The Json object to construct a complex type - /// A new constructed complex type - public abstract DataType FromJson(JObject json); - /// - /// Constructs a complex type from a Json string - /// - /// The string that represents a Json. - /// A new constructed complex type - public DataType FromJson(string json) - { - return FromJson(JObject.Parse(json)); - } - } - - /// - /// The data type representing NULL values. - /// - [Serializable] - public class NullType : AtomicType { } - - /// - /// The data type representing String values. - /// - [Serializable] - public class StringType : AtomicType { } - - /// - /// The data type representing binary values. - /// - [Serializable] - public class BinaryType : AtomicType { } - - /// - /// The data type representing Boolean values. - /// - [Serializable] - public class BooleanType : AtomicType { } - - /// - /// The data type representing Date values. 
- /// - [Serializable] - public class DateType : AtomicType { } - - /// - /// The data type representing Timestamp values. - /// - [Serializable] - public class TimestampType : AtomicType { } - - /// - /// The data type representing Double values. - /// - [Serializable] - public class DoubleType : AtomicType { } - - /// - /// - /// - [Serializable] - public class FloatType : AtomicType { } - - /// - /// The data type representing Float values. - /// - [Serializable] - public class ByteType : AtomicType { } - - /// - /// - /// - [Serializable] - public class IntegerType : AtomicType { } - - /// - /// The data type representing Int values. - /// - [Serializable] - public class LongType : AtomicType { } - - /// - /// The data type representing Short values. - /// - [Serializable] - public class ShortType : AtomicType { } - - /// - /// The data type representing Decimal values. - /// - [Serializable] - public class DecimalType : AtomicType - { - /// - /// Gets the regular expression that represents a fixed decimal. - /// - public static Regex FixedDecimal = new Regex(@"decimal\((\d+),\s(\d+)\)"); - private int? precision, scale; - /// - /// Initializes a new instance of DecimalType from parameters specifying its precision and scale. - /// - /// The precision of the type - /// The scale of the type - public DecimalType(int? precision = null, int? scale = null) - { - this.precision = precision; - this.scale = scale; - } - - internal override object JsonValue - { - get { throw new NotImplementedException(); } - } - - /// - /// Constructs a DecimalType from a Json object - /// - /// The Json object used to construct a DecimalType - /// A new DecimalType instance - /// Not implemented yet. - public DataType FromJson(JObject json) - { - throw new NotImplementedException(); - } - } - - /// - /// The data type for collections of multiple values. - /// - [Serializable] - public class ArrayType : ComplexType - { - /// - /// Gets the DataType of each element in the array - /// - public DataType ElementType { get { return elementType; } } - /// - /// Returns whether the array can contain null (None) values - /// - public bool ContainsNull { get { return containsNull; } } - - /// - /// Initializes a ArrayType instance with a specific DataType and specifying if the array has null values. - /// - /// The data type of values - /// Indicates if values have null values - public ArrayType(DataType elementType, bool containsNull = true) - { - this.elementType = elementType; - this.containsNull = containsNull; - } - - internal ArrayType(JObject json) - { - FromJson(json); - } - - /// - /// Readable string representation for the type. - /// - public override string SimpleString - { - get { return string.Format("array<{0}>", elementType.SimpleString); } - } - - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("type", TypeName), - new JProperty("elementType", elementType.JsonValue), - new JProperty("containsNull", containsNull)); - } - } - - /// - /// Constructs a ArrayType from a Json object - /// - /// The Json object used to construct a ArrayType - /// A new ArrayType instance - public override sealed DataType FromJson(JObject json) - { - elementType = ParseDataTypeFromJson(json["elementType"]); - containsNull = (bool)json["containsNull"]; - return this; - } - - private DataType elementType; - private bool containsNull; - } - - /// - /// The data type for Maps. Not implemented yet. 
- /// - [Serializable] - public class MapType : ComplexType - { - internal override object JsonValue - { - get { throw new NotImplementedException(); } - } - - /// - /// Constructs a StructField from a Json object. Not implemented yet. - /// - /// The Json object used to construct a MapType - /// A new MapType instance - /// - public override DataType FromJson(JObject json) - { - throw new NotImplementedException(); - } - } - - /// - /// A field inside a StructType. - /// - [Serializable] - public class StructField : ComplexType - { - /// - /// The name of this field. - /// - public string Name { get { return name; } } - /// - /// The data type of this field. - /// - public DataType DataType { get { return dataType; } } - /// - /// Indicates if values of this field can be null values. - /// - public bool IsNullable { get { return isNullable; } } - /// - /// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g, in selection. - /// - public JObject Metadata { get { return metadata; } } - - /// - /// Initializes a StructField instance with a specific name, data type, nullable, and metadata - /// - /// The name of this field - /// The data type of this field - /// Indicates if values of this field can be null values - /// The metadata of this field - public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null) - { - this.name = name; - this.dataType = dataType; - this.isNullable = isNullable; - this.metadata = metadata ?? new JObject(); - } - - internal StructField(JObject json) - { - FromJson(json); - } - - /// - /// Returns a readable string that represents the type. - /// - public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } } - - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("name", name), - new JProperty("type", dataType.JsonValue), - new JProperty("nullable", isNullable), - new JProperty("metadata", metadata)); - } - } - - /// - /// Constructs a StructField from a Json object - /// - /// The Json object used to construct a StructField - /// A new StructField instance - public override sealed DataType FromJson(JObject json) - { - name = json["name"].ToString(); - dataType = ParseDataTypeFromJson(json["type"]); - isNullable = (bool)json["nullable"]; - metadata = (JObject)json["metadata"]; - return this; - } - - private string name; - private DataType dataType; - private bool isNullable; - [NonSerialized] - private JObject metadata; - } - - /// - /// Struct type, consisting of a list of StructField - /// This is the data type representing a Row - /// - [Serializable] - public class StructType : ComplexType - { - /// - /// Gets a list of StructField. - /// - public List Fields { get { return fields; } } - - internal IStructTypeProxy StructTypeProxy - { - get - { - return structTypeProxy ?? - new StructTypeIpcProxy( - new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSchema", - new object[] { Json }).ToString())); - } - } - - /// - /// Initializes a StructType instance with a specific collection of SructField object. 
- /// - /// The collection that holds StructField objects - public StructType(IEnumerable fields) - { - this.fields = fields.ToList(); - } - - internal StructType(JObject json) - { - FromJson(json); - } - - internal StructType(IStructTypeProxy structTypeProxy) - { - this.structTypeProxy = structTypeProxy; - var jsonSchema = structTypeProxy.ToJson(); - FromJson(jsonSchema); - } - - /// - /// Returns a readable string that joins all s together. - /// - public override string SimpleString - { - get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); } - } - - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("type", TypeName), - new JProperty("fields", fields.Select(f => f.JsonValue).ToArray())); - } - } - - /// - /// Constructs a StructType from a Json object - /// - /// The Json object used to construct a StructType - /// A new StructType instance - public override sealed DataType FromJson(JObject json) - { - var fieldsJObjects = json["fields"].Select(f => (JObject)f); - fields = fieldsJObjects.Select(fieldJObject => (new StructField(fieldJObject))).ToList(); - return this; - } - - [NonSerialized] - private readonly IStructTypeProxy structTypeProxy; - - private List fields; - } + /// + /// The base type of all Spark SQL data types. + /// + [Serializable] + public abstract class DataType + { + /// + /// Trim "Type" in the end from class name, ToLower() to align with Scala. + /// + public string TypeName + { + get { return NormalizeTypeName(GetType().Name); } + } + + /// + /// return TypeName by default, subclass can override it + /// + public virtual string SimpleString + { + get { return TypeName; } + } + + /// + /// return only type: TypeName by default, subclass can override it + /// + internal virtual object JsonValue { get { return TypeName; } } + + /// + /// The compact JSON representation of this data type. + /// + public string Json + { + get + { + var jObject = JsonValue is JObject ? ((JObject)JsonValue).SortProperties() : JsonValue; + return JsonConvert.SerializeObject(jObject, Formatting.None); + } + } + + /// + /// Parses a Json string to construct a DataType. + /// + /// The Json string to be parsed + /// The new DataType instance from the Json string + public static DataType ParseDataTypeFromJson(string json) + { + return ParseDataTypeFromJson(JToken.Parse(json)); + } + + /// + /// Parse a JToken object to construct a DataType. 
+ /// + /// The JToken object to be parsed + /// The new DataType instance from the Json string + /// Not implemented for "udt" type + /// + protected static DataType ParseDataTypeFromJson(JToken json) + { + if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...} + { + JToken type; + var typeJObject = (JObject)json; + if (typeJObject.TryGetValue("type", out type)) + { + Type complexType; + if ((complexType = ComplexTypes.FirstOrDefault(ct => NormalizeTypeName(ct.Name) == type.ToString())) != default(Type)) + { + return ((ComplexType)Activator.CreateInstance(complexType, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance + , null, new object[] { typeJObject }, null)); // create new instance of ComplexType + } + if (type.ToString() == "udt") + { + // TODO + throw new NotImplementedException(); + } + } + throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); + } + else // {name: age, type: bigint,...} // TODO: validate more JTokenType other than Object + { + return ParseAtomicType(json); + } + + } + + private static AtomicType ParseAtomicType(JToken type) + { + Type atomicType; + if ((atomicType = AtomicTypes.FirstOrDefault(at => NormalizeTypeName(at.Name) == type.ToString())) != default(Type)) + { + return (AtomicType)Activator.CreateInstance(atomicType); // create new instance of AtomicType + } + + Match fixedDecimal = DecimalType.FixedDecimal.Match(type.ToString()); + if (fixedDecimal.Success) + { + return new DecimalType(int.Parse(fixedDecimal.Groups[1].Value), int.Parse(fixedDecimal.Groups[2].Value)); + } + + throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); + } + + [NonSerialized] + private static readonly Type[] AtomicTypes = typeof(AtomicType).Assembly.GetTypes().Where(type => + type.IsSubclassOf(typeof(AtomicType))).ToArray(); + + [NonSerialized] + private static readonly Type[] ComplexTypes = typeof(ComplexType).Assembly.GetTypes().Where(type => + type.IsSubclassOf(typeof(ComplexType))).ToArray(); + + [NonSerialized] + private static readonly Func NormalizeTypeName = s => s.Substring(0, s.Length - 4).ToLower(); // trim "Type" at the end of type name + + + } + + /// + /// An internal type used to represent a simple type. + /// + [Serializable] + public class AtomicType : DataType + { + } + + /// + /// An internal type used to represent a complex type (such as arrays, structs, and maps). + /// + [Serializable] + public abstract class ComplexType : DataType + { + /// + /// Abstract method that constructs a complex type from a Json object + /// + /// The Json object to construct a complex type + /// A new constructed complex type + public abstract DataType FromJson(JObject json); + /// + /// Constructs a complex type from a Json string + /// + /// The string that represents a Json. + /// A new constructed complex type + public DataType FromJson(string json) + { + return FromJson(JObject.Parse(json)); + } + } + + /// + /// The data type representing NULL values. + /// + [Serializable] + public class NullType : AtomicType { } + + /// + /// The data type representing String values. + /// + [Serializable] + public class StringType : AtomicType { } + + /// + /// The data type representing binary values. + /// + [Serializable] + public class BinaryType : AtomicType { } + + /// + /// The data type representing Boolean values. + /// + [Serializable] + public class BooleanType : AtomicType { } + + /// + /// The data type representing Date values. 
+ /// + [Serializable] + public class DateType : AtomicType { } + + /// + /// The data type representing Timestamp values. + /// + [Serializable] + public class TimestampType : AtomicType { } + + /// + /// The data type representing Double values. + /// + [Serializable] + public class DoubleType : AtomicType { } + + /// + /// The data type representing Float values. + /// + [Serializable] + public class FloatType : AtomicType { } + + /// + /// The data type representing Byte values. + /// + [Serializable] + public class ByteType : AtomicType { } + + /// + /// The data type representing Int values. + /// + [Serializable] + public class IntegerType : AtomicType { } + + /// + /// The data type representing Long values. + /// + [Serializable] + public class LongType : AtomicType { } + + /// + /// The data type representing Short values. + /// + [Serializable] + public class ShortType : AtomicType { } + + /// + /// The data type representing Decimal values. + /// + [Serializable] + public class DecimalType : AtomicType + { + /// + /// Gets the regular expression that represents a fixed decimal. + /// + public static Regex FixedDecimal = new Regex(@"decimal\s*\((\d+),\s*(\d+)\)"); + private int? precision, scale; + /// + /// Initializes a new instance of DecimalType from parameters specifying its precision and scale. + /// + /// The precision of the type + /// The scale of the type + public DecimalType(int? precision = null, int? scale = null) + { + this.precision = precision; + this.scale = scale; + } + + internal override object JsonValue + { + get + { + if (precision == null && scale == null) return "decimal"; + return "decimal(" + precision + "," + scale + ")"; + } + } + + /// + /// Constructs a DecimalType from a Json object + /// + /// The Json object used to construct a DecimalType + /// A new DecimalType instance + public DataType FromJson(JObject json) + { + return ParseDataTypeFromJson(json); + } + } + + /// + /// The data type for collections of multiple values. + /// + [Serializable] + public class ArrayType : ComplexType + { + /// + /// Gets the DataType of each element in the array + /// + public DataType ElementType { get { return elementType; } } + /// + /// Returns whether the array can contain null (None) values + /// + public bool ContainsNull { get { return containsNull; } } + + /// + /// Initializes an ArrayType instance with a specific DataType and specifying if the array has null values. + /// + /// The data type of values + /// Indicates if values have null values + public ArrayType(DataType elementType, bool containsNull = true) + { + this.elementType = elementType; + this.containsNull = containsNull; + } + + internal ArrayType(JObject json) + { + FromJson(json); + } + + /// + /// Readable string representation for the type. + /// + public override string SimpleString + { + get { return string.Format("array<{0}>", elementType.SimpleString); } + } + + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("type", TypeName), + new JProperty("elementType", elementType.JsonValue), + new JProperty("containsNull", containsNull)); + } + } + + /// + /// Constructs an ArrayType from a Json object + /// + /// The Json object used to construct an ArrayType + /// A new ArrayType instance + public override sealed DataType FromJson(JObject json) + { + elementType = ParseDataTypeFromJson(json["elementType"]); + containsNull = (bool)json["containsNull"]; + return this; + } + + private DataType elementType; + private bool containsNull; + } + + /// + /// The data type for Maps. Not implemented yet.
+ /// + [Serializable] + public class MapType : ComplexType + { + internal override object JsonValue + { + get { throw new NotImplementedException(); } + } + + /// + /// Constructs a MapType from a Json object. Not implemented yet. + /// + /// The Json object used to construct a MapType + /// A new MapType instance + /// + public override DataType FromJson(JObject json) + { + throw new NotImplementedException(); + } + } + + /// + /// A field inside a StructType. + /// + [Serializable] + public class StructField : ComplexType + { + /// + /// The name of this field. + /// + public string Name { get { return name; } } + /// + /// The data type of this field. + /// + public DataType DataType { get { return dataType; } } + /// + /// Indicates if values of this field can be null values. + /// + public bool IsNullable { get { return isNullable; } } + /// + /// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g., in selection. + /// + public JObject Metadata { get { return metadata; } } + + /// + /// Initializes a StructField instance with a specific name, data type, nullable, and metadata + /// + /// The name of this field + /// The data type of this field + /// Indicates if values of this field can be null values + /// The metadata of this field + public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null) + { + this.name = name; + this.dataType = dataType; + this.isNullable = isNullable; + this.metadata = metadata ?? new JObject(); + } + + internal StructField(JObject json) + { + FromJson(json); + } + + /// + /// Returns a readable string that represents the type. + /// + public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } } + + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("name", name), + new JProperty("type", dataType.JsonValue), + new JProperty("nullable", isNullable), + new JProperty("metadata", metadata)); + } + } + + /// + /// Constructs a StructField from a Json object + /// + /// The Json object used to construct a StructField + /// A new StructField instance + public override sealed DataType FromJson(JObject json) + { + name = json["name"].ToString(); + dataType = ParseDataTypeFromJson(json["type"]); + isNullable = (bool)json["nullable"]; + metadata = (JObject)json["metadata"]; + return this; + } + + private string name; + private DataType dataType; + private bool isNullable; + [NonSerialized] + private JObject metadata; + } + + /// + /// Struct type, consisting of a list of StructField + /// This is the data type representing a Row + /// + [Serializable] + public class StructType : ComplexType + { + /// + /// Gets a list of StructField.
+ /// + public List Fields { get { return fields; } } + + + private Lazy[]> pickleConverters; + + private Func[] ConstructPickleConverters() + { + var funcs = new Func[fields.Count]; + int index = 0; + foreach (var field in fields) + { + if (field.DataType is StringType) + { + funcs[index] = x => x?.ToString(); + } + /*else if (field.DataType is LongType) + { + funcs[index] = x => x==null?null:(dynamic)(long)x ; + }*/ + /*else if (field.DataType is DateType) + { + funcs[index] = x => x; + }*/ + else if (field.DataType is ArrayType) + { + Func convertArrayTypeToStructTypeFunc = (dataType, length) => + { + StructField[] f = new StructField[length]; + for (int i = 0; i < length; i++) + { + f[i] = new StructField(string.Format("_array_{0}", i), dataType); + } + return new StructType(f); + }; + var elementType = (field.DataType as ArrayType).ElementType; + funcs[index] = x => + { + + // Note: When creating object from json, PySpark converts Json array to Python List (https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/types.py, _create_cls(dataType)), + // then Pyrolite unpickler converts Python List to C# ArrayList (https://github.com/irmen/Pyrolite/blob/v4.10/README.txt). So values[index] should be of type ArrayList; + // In case Python changes its implementation, which means value is not of type ArrayList, try cast to object[] because Pyrolite unpickler convert Python Tuple to C# object[]. + object[] valueOfArray = (x as ArrayList)?.ToArray() ?? x as object[]; + if (valueOfArray == null) + { + throw new ArgumentException("Cannot parse data of ArrayType: " + field.Name); + } + + return new RowImpl(valueOfArray, + elementType as StructType ?? convertArrayTypeToStructTypeFunc(elementType, valueOfArray.Length)).Values; // TODO: this part may have some problems, not verified + }; + } + else if (field.DataType is MapType) + { + //TODO + throw new NotImplementedException(); + } + else if (field.DataType is StructType) + { + funcs[index] = x => x != null ? new RowImpl(x, field.DataType as StructType) : null; + } + else + { + funcs[index] = x => x; + } + index++; + } + return funcs; + } + + internal IStructTypeProxy StructTypeProxy + { + get + { + return structTypeProxy ?? + new StructTypeIpcProxy( + new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSchema", + new object[] { Json }).ToString())); + } + } + + /// + /// Initializes a StructType instance with a specific collection of SructField object. + /// + /// The collection that holds StructField objects + public StructType(IEnumerable fields) + { + this.fields = fields.ToList(); + Initialize(); + } + + internal StructType(JObject json) + { + FromJson(json); + Initialize(); + } + + internal StructType(IStructTypeProxy structTypeProxy) + { + this.structTypeProxy = structTypeProxy; + var jsonSchema = structTypeProxy.ToJson(); + FromJson(jsonSchema); + Initialize(); + } + + public void ConvertPickleObjects(dynamic[] input, dynamic[] output) + { + var c = pickleConverters.Value; + for (int i = 0; i < input.Length; ++i) + { + output[i] = c[i](input[i]); + } + } + + private void Initialize() + { + pickleConverters = new Lazy[]>(ConstructPickleConverters); + } + + /// + /// Returns a readable string that joins all s together. 
+ /// + public override string SimpleString + { + get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); } + } + + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("type", TypeName), + new JProperty("fields", fields.Select(f => f.JsonValue).ToArray())); + } + } + + /// + /// Constructs a StructType from a Json object + /// + /// The Json object used to construct a StructType + /// A new StructType instance + public override sealed DataType FromJson(JObject json) + { + var fieldsJObjects = json["fields"].Select(f => (JObject)f); + fields = fieldsJObjects.Select(fieldJObject => (new StructField(fieldJObject))).ToList(); + return this; + } + + [NonSerialized] + private readonly IStructTypeProxy structTypeProxy; + + private List fields; + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs index b9c50083..eaa602ba 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; +using System.Reflection; using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; @@ -249,6 +250,17 @@ public void RegisterFunction(string name, Func f) Func, IEnumerable> udfHelper = new UdfHelper(f).Execute; udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT))); } - #endregion - } + + public void RegisterFunction(string name, MethodInfo f) + { + if (!f.IsStatic) + throw new InvalidOperationException(f.DeclaringType?.FullName + "." + f.Name + + " is not a static method, can't be registered"); + logger.LogInfo("Name of the function to register {0}, method info", name, f.DeclaringType?.FullName + "." + f.Name); + var helper = new UdfReflectionHelper(f); + Func, IEnumerable> udfHelper = helper.Execute; + udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(helper.ReturnType)); + } + #endregion + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/packages.config b/csharp/Adapter/Microsoft.Spark.CSharp/packages.config index 8f5143eb..d95f59da 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/packages.config +++ b/csharp/Adapter/Microsoft.Spark.CSharp/packages.config @@ -1,7 +1,7 @@  - - + + - + \ No newline at end of file diff --git a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML index 0d192a55..f7d5b481 100644 --- a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML +++ b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML @@ -3513,7 +3513,7 @@ Close the socket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -3612,12 +3612,13 @@ Close the ISocket connections and releases all associated resources. 
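The `RegisterFunction(string name, MethodInfo f)` overload added above in SqlContext.cs and UdfRegistration.cs registers a UDF from a static method located by reflection, deriving the SQL return type from the method's return type and rejecting instance methods. A minimal usage sketch; the MyUdfs class, its method, the "people" table and the sqlContext variable are illustrative assumptions, not code from this change:

```csharp
using System.Reflection;
using Microsoft.Spark.CSharp.Sql;

public static class MyUdfs
{
    // Must be static: the UdfRegistration overload shown above throws for instance methods.
    public static int Twice(int x) { return x * 2; }
}

public static class UdfRegistrationExample
{
    // sqlContext is assumed to be an already-created Mobius SqlContext
    // with a table named "people" registered.
    public static void Register(SqlContext sqlContext)
    {
        MethodInfo method = typeof(MyUdfs).GetMethod("Twice");

        // MethodInfo-based registration added in this change; the SQL return type
        // is derived from the method's return type via Functions.GetReturnType.
        sqlContext.RegisterFunction("twice", method);

        // Once registered, the UDF is callable from Spark SQL like any built-in function.
        DataFrame result = sqlContext.Sql("SELECT twice(age) AS doubledAge FROM people");
        result.Show();
    }
}
```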
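The reworked DecimalType above also changes how decimal schemas round-trip through JSON: JsonValue now emits "decimal" or "decimal(p,s)" instead of throwing, FixedDecimal tolerates a missing space after the comma, and FromJson delegates to ParseDataTypeFromJson. A small sketch under those assumptions, with illustrative values:

```csharp
using System;
using Microsoft.Spark.CSharp.Sql;

public static class DecimalTypeJsonExample
{
    public static void RoundTrip()
    {
        // JsonValue now yields "decimal(10,2)" instead of throwing, so the compact
        // JSON form of the type can be produced via DataType.Json.
        var original = new DecimalType(10, 2);
        string json = original.Json;             // "\"decimal(10,2)\""
        Console.WriteLine(json);

        // The relaxed FixedDecimal regex accepts "decimal(10,2)" (no space after the
        // comma), so the JSON parses back into a DecimalType with the same precision/scale.
        DataType parsed = DataType.ParseDataTypeFromJson(json);
        Console.WriteLine(parsed.SimpleString);  // "decimal"

        // A DecimalType constructed without precision/scale serializes to plain "decimal".
        Console.WriteLine(new DecimalType().Json);
    }
}
```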
- + Establishes a connection to a remote host that is specified by an IP address and a port number The IP address of the remote host The port number of the remote host + The secret to connect, can be null @@ -3770,7 +3771,7 @@ Close the ISocket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -3912,7 +3913,7 @@ Close the ISocket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -5190,12 +5191,13 @@ row count - + Displays rows of the DataFrame in tabular form Number of rows to display - default 20 Indicates if strings more than 20 characters long will be truncated + If set to True, print output rows vertically (one line per column value). @@ -5627,10 +5629,11 @@ the 100 new partitions will claim 10 of the current partitions. - + Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`) + Persist storage type @@ -6040,6 +6043,15 @@ DataFrame if no paths are passed in. + + + Loads a AVRO file (one object per line) and returns the result as a DataFrame. + + This function goes through the input once to determine the input schema. If you know the + schema in advance, use the version that specifies the schema to avoid the extra scan. + + input path + Interface used to write a DataFrame to external storage systems (e.g. file systems, @@ -6145,6 +6157,13 @@ Format("parquet").Save(path) + + + Saves the content of the DataFrame in AVRO format at the specified path. + This is equivalent to: + Format("com.databricks.spark.avro").Save(path) + + Dataset is a strongly typed collection of domain-specific objects that can be transformed @@ -6193,13 +6212,14 @@ Returns all column names as an array. - + Displays the top 20 rows of Dataset in a tabular form. Strings more than 20 characters will be truncated, and all cells will be aligned right. Number of rows - default is 20 Indicates if rows with more than 20 characters to be truncated + If set to true, prints output rows vertically (one line per column value). diff --git a/csharp/Adapter/documentation/Mobius_API_Documentation.md b/csharp/Adapter/documentation/Mobius_API_Documentation.md index c9e40654..7ee0e9ee 100644 --- a/csharp/Adapter/documentation/Mobius_API_Documentation.md +++ b/csharp/Adapter/documentation/Mobius_API_Documentation.md @@ -638,7 +638,7 @@ ####Methods -
NameDescription
RegisterTempTableRegisters this DataFrame as a temporary table using the given name. The lifetime of this temporary table is tied to the SqlContext that was used to create this DataFrame.
CountNumber of rows in the DataFrame
ShowDisplays rows of the DataFrame in tabular form
ShowSchemaPrints the schema information of the DataFrame
CollectReturns all of Rows in this DataFrame
ToRDDConverts the DataFrame to RDD of Row
ToJSONReturns the content of the DataFrame as RDD of JSON strings
ExplainPrints the plans (logical and physical) to the console for debugging purposes
SelectSelects a set of columns specified by column name or Column. df.Select("colA", df["colB"]) df.Select("*", df["colB"] + 10)
SelectSelects a set of columns. This is a variant of `select` that can only select existing columns using column names (i.e. cannot construct expressions). df.Select("colA", "colB")
SelectExprSelects a set of SQL expressions. This is a variant of `select` that accepts SQL expressions. df.SelectExpr("colA", "colB as newName", "abs(colC)")
WhereFilters rows using the given condition
FilterFilters rows using the given condition
GroupByGroups the DataFrame using the specified columns, so we can run aggregation on them.
RollupCreate a multi-dimensional rollup for the current DataFrame using the specified columns, so we can run aggregation on them.
CubeCreate a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
AggAggregates on the DataFrame for the given column-aggregate function mapping
JoinJoin with another DataFrame - Cartesian join
JoinJoin with another DataFrame - Inner equi-join using given column name
JoinJoin with another DataFrame - Inner equi-join using given column name
JoinJoin with another DataFrame, using the specified JoinType
IntersectIntersect with another DataFrame. This is equivalent to `INTERSECT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, intersect(self, other)
UnionAllUnion with another DataFrame WITHOUT removing duplicated rows. This is equivalent to `UNION ALL` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, unionAll(self, other)
SubtractReturns a new DataFrame containing rows in this frame but not in another frame. This is equivalent to `EXCEPT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, subtract(self, other)
DropReturns a new DataFrame with a column dropped. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, drop(self, col)
DropNaReturns a new DataFrame omitting rows with null values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
NaReturns a DataFrameNaFunctions for working with missing data.
FillNaReplace null values, alias for ``na.fill()`
DropDuplicatesReturns a new DataFrame with duplicate rows removed, considering only the subset of columns. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropDuplicates(self, subset=None)
Replace``1Returns a new DataFrame replacing a value with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1Returns a new DataFrame replacing values with other values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1Returns a new DataFrame replacing values with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
RandomSplitRandomly splits this DataFrame with the provided weights. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, randomSplit(self, weights, seed=None)
ColumnsReturns all column names as a list. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, columns(self)
DTypesReturns all column names and their data types. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dtypes(self)
SortReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortWithinPartitionsReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
SortWithinPartitionReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
AliasReturns a new DataFrame with an alias set. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
WithColumnReturns a new DataFrame by adding a column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumn(self, colName, col)
WithColumnRenamedReturns a new DataFrame by renaming an existing column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumnRenamed(self, existing, new)
CorrCalculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, corr(self, col1, col2, method=None)
CovCalculate the sample covariance of two columns as a double value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, cov(self, col1, col2)
FreqItemsFinding frequent items for columns, possibly with false positives. Using the frequent element count algorithm described in "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, freqItems(self, cols, support=None) Note: This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting DataFrame.
CrosstabComputes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, crosstab(self, col1, col2)
DescribeComputes statistics for numeric columns. This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical columns.
LimitReturns a new DataFrame by taking the first `n` rows. The difference between this function and `head` is that `head` returns an array while `limit` returns a new DataFrame.
HeadReturns the first `n` rows.
FirstReturns the first row.
TakeReturns the first `n` rows in the DataFrame.
DistinctReturns a new DataFrame that contains only the unique rows from this DataFrame.
CoalesceReturns a new DataFrame that has exactly `numPartitions` partitions. Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.
PersistPersist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
UnpersistMark the DataFrame as non-persistent, and remove all blocks for it from memory and disk.
CachePersist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
RepartitionReturns a new DataFrame that has exactly `numPartitions` partitions.
RepartitionReturns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
RepartitionReturns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
SampleReturns a new DataFrame by sampling a fraction of rows.
FlatMap``1Returns a new RDD by first applying a function to all rows of this DataFrame, and then flattening the results.
Map``1Returns a new RDD by applying a function to all rows of this DataFrame.
MapPartitions``1Returns a new RDD by applying a function to each partition of this DataFrame.
ForeachPartitionApplies a function f to each partition of this DataFrame.
ForeachApplies a function f to all rows.
WriteInterface for saving the content of the DataFrame out into external storage.
SaveAsParquetFileSaves the contents of this DataFrame as a parquet file, preserving the schema. Files that are written out using this method can be read back in as a DataFrame using the `parquetFile` function in SQLContext.
InsertIntoAdds the rows from this RDD to the specified table, optionally overwriting the existing data.
SaveAsTableCreates a table from the the contents of this DataFrame based on a given data source, SaveMode specified by mode, and a set of options. Note that this currently only works with DataFrames that are created from a HiveContext as there is no notion of a persisted catalog in a standard SQL context. Instead you can write an RDD out to a parquet file, and then register that file as a table. This "table" can then be the target of an `insertInto`. Also note that while this function can persist the table metadata into Hive's metastore, the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
SaveSaves the contents of this DataFrame based on the given data source, SaveMode specified by mode, and a set of options.
Returns a new DataFrame that drops rows containing any null values.
Returns a new DataFrame that drops rows containing null values. If `how` is "any", then drop rows containing any null values. If `how` is "all", then drop rows only if every column is null for that row.
Returns a new [[DataFrame]] that drops rows containing null values in the specified columns. If `how` is "any", then drop rows containing any null values in the specified columns. If `how` is "all", then drop rows only if every specified column is null for that row.
Returns a new DataFrame that drops rows containing any null values in the specified columns.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values values in the specified columns.
Returns a new DataFrame that replaces null values in numeric columns with `value`.
Returns a new DataFrame that replaces null values in string columns with `value`.
Returns a new DataFrame that replaces null values in specified numeric columns. If a specified column is not a numeric column, it is ignored.
Returns a new DataFrame that replaces null values in specified string columns. If a specified column is not a numeric column, it is ignored.
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. The value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`. For example, the following replaces null values in column "A" with string "unknown", and null values in column "B" with numeric value 1.0. import com.google.common.collect.ImmutableMap; df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height". df.replace("height", ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". df.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. df.replace("*", ImmutableMap.of("UNKNOWN", "unnamed"));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". df.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". df.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed"));
Specifies the input data source format.
Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading.
Adds an input option for the underlying data source.
Adds input options for the underlying data source.
Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system).
Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores).
Construct a [[DataFrame]] representing the database table accessible via JDBC URL, url named table and connection properties.
Construct a DataFrame representing the database table accessible via JDBC URL url named table. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Construct a DataFrame representing the database table accessible via JDBC URL url named table using connection properties. The `predicates` parameter gives a list expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
Adds an output option for the underlying data source.
Adds output options for the underlying data source.
Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment.
Saves the content of the DataFrame at the specified path.
Saves the content of the DataFrame as the specified table.
Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data to an existing table, format or options will be ignored.
Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame need to be the same as that of the existing table, and format or options will be ignored.
Saves the content of the DataFrame to a external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path)
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("parquet").Save(path)
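The reader and writer surface listed above gains AVRO counterparts in this change; the Adapter documentation hunks earlier in the diff describe the new Avro load/save as equivalent to going through Format("com.databricks.spark.avro"). A hedged sketch of that usage; the paths and the sqlContext variable are assumptions, and only the Read/Write/Format/Mode/Load/Save calls already described in the documentation are relied on:

```csharp
using Microsoft.Spark.CSharp.Sql;

public static class AvroExample
{
    // sqlContext is assumed to be an existing Mobius SqlContext; paths are illustrative.
    public static void ReadAndWriteAvro(SqlContext sqlContext)
    {
        // Per the documentation added in this change, loading AVRO is equivalent to
        // going through the generic reader with the spark-avro data source.
        DataFrame people = sqlContext.Read()
            .Format("com.databricks.spark.avro")
            .Load("hdfs://path/to/people.avro");

        // Writing in AVRO format is documented as equivalent to
        // Format("com.databricks.spark.avro").Save(path).
        people.Write()
            .Format("com.databricks.spark.avro")
            .Mode(SaveMode.Overwrite)
            .Save("hdfs://path/to/output.avro");
    }
}
```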
+
NameDescription
RegisterTempTableRegisters this DataFrame as a temporary table using the given name. The lifetime of this temporary table is tied to the SqlContext that was used to create this DataFrame.
CountNumber of rows in the DataFrame
ShowDisplays rows of the DataFrame in tabular form
ShowSchemaPrints the schema information of the DataFrame
CollectReturns all of Rows in this DataFrame
ToRDDConverts the DataFrame to RDD of Row
ToJSONReturns the content of the DataFrame as RDD of JSON strings
ExplainPrints the plans (logical and physical) to the console for debugging purposes
SelectSelects a set of columns specified by column name or Column. df.Select("colA", df["colB"]) df.Select("*", df["colB"] + 10)
SelectSelects a set of columns. This is a variant of `select` that can only select existing columns using column names (i.e. cannot construct expressions). df.Select("colA", "colB")
SelectExprSelects a set of SQL expressions. This is a variant of `select` that accepts SQL expressions. df.SelectExpr("colA", "colB as newName", "abs(colC)")
WhereFilters rows using the given condition
FilterFilters rows using the given condition
GroupByGroups the DataFrame using the specified columns, so we can run aggregation on them.
RollupCreate a multi-dimensional rollup for the current DataFrame using the specified columns, so we can run aggregation on them.
CubeCreate a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
AggAggregates on the DataFrame for the given column-aggregate function mapping
JoinJoin with another DataFrame - Cartesian join
JoinJoin with another DataFrame - Inner equi-join using given column name
JoinJoin with another DataFrame - Inner equi-join using given column name
JoinJoin with another DataFrame, using the specified JoinType
IntersectIntersect with another DataFrame. This is equivalent to `INTERSECT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, intersect(self, other)
UnionAllUnion with another DataFrame WITHOUT removing duplicated rows. This is equivalent to `UNION ALL` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, unionAll(self, other)
SubtractReturns a new DataFrame containing rows in this frame but not in another frame. This is equivalent to `EXCEPT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, subtract(self, other)
DropReturns a new DataFrame with a column dropped. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, drop(self, col)
DropNaReturns a new DataFrame omitting rows with null values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
NaReturns a DataFrameNaFunctions for working with missing data.
FillNaReplace null values, alias for ``na.fill()`
DropDuplicatesReturns a new DataFrame with duplicate rows removed, considering only the subset of columns. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropDuplicates(self, subset=None)
Replace``1Returns a new DataFrame replacing a value with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1Returns a new DataFrame replacing values with other values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1Returns a new DataFrame replacing values with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
RandomSplitRandomly splits this DataFrame with the provided weights. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, randomSplit(self, weights, seed=None)
ColumnsReturns all column names as a list. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, columns(self)
DTypesReturns all column names and their data types. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dtypes(self)
SortReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortWithinPartitionsReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
SortWithinPartitionReturns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
AliasReturns a new DataFrame with an alias set. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
WithColumnReturns a new DataFrame by adding a column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumn(self, colName, col)
WithColumnRenamedReturns a new DataFrame by renaming an existing column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumnRenamed(self, existing, new)
CorrCalculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, corr(self, col1, col2, method=None)
CovCalculate the sample covariance of two columns as a double value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, cov(self, col1, col2)
FreqItemsFinding frequent items for columns, possibly with false positives. Using the frequent element count algorithm described in "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, freqItems(self, cols, support=None) Note: This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting DataFrame.
CrosstabComputes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, crosstab(self, col1, col2)
DescribeComputes statistics for numeric columns. These include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical columns.
LimitReturns a new DataFrame by taking the first `n` rows. The difference between this function and `head` is that `head` returns an array while `limit` returns a new DataFrame.
HeadReturns the first `n` rows.
FirstReturns the first row.
TakeReturns the first `n` rows in the DataFrame.
DistinctReturns a new DataFrame that contains only the unique rows from this DataFrame.
CoalesceReturns a new DataFrame that has exactly `numPartitions` partitions. Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.
PersistPersist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
UnpersistMark the DataFrame as non-persistent, and remove all blocks for it from memory and disk.
CachePersist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
RepartitionReturns a new DataFrame that has exactly `numPartitions` partitions.
RepartitionReturns a new [[DataFrame]] partitioned by the given partitioning columns into the given number of partitions. The resulting DataFrame is hash partitioned. The number of partitions is optional; if not specified, the current partitions are kept.
RepartitionReturns a new [[DataFrame]] partitioned by the given partitioning columns into the given number of partitions. The resulting DataFrame is hash partitioned. The number of partitions is optional; if not specified, the current partitions are kept.
SampleReturns a new DataFrame by sampling a fraction of rows.
FlatMap``1Returns a new RDD by first applying a function to all rows of this DataFrame, and then flattening the results.
Map``1Returns a new RDD by applying a function to all rows of this DataFrame.
MapPartitions``1Returns a new RDD by applying a function to each partition of this DataFrame.
ForeachPartitionApplies a function f to each partition of this DataFrame.
ForeachApplies a function f to all rows.
WriteInterface for saving the content of the DataFrame out into external storage.
SaveAsParquetFileSaves the contents of this DataFrame as a parquet file, preserving the schema. Files that are written out using this method can be read back in as a DataFrame using the `parquetFile` function in SQLContext.
InsertIntoAdds the rows from this RDD to the specified table, optionally overwriting the existing data.
SaveAsTableCreates a table from the contents of this DataFrame based on a given data source, SaveMode specified by mode, and a set of options. Note that this currently only works with DataFrames that are created from a HiveContext as there is no notion of a persisted catalog in a standard SQL context. Instead you can write an RDD out to a parquet file, and then register that file as a table. This "table" can then be the target of an `insertInto`. Also note that while this function can persist the table metadata into Hive's metastore, the table will NOT be accessible from Hive until SPARK-7550 is resolved.
SaveSaves the contents of this DataFrame based on the given data source, SaveMode specified by mode, and a set of options.
Returns a new DataFrame that drops rows containing any null values.
Returns a new DataFrame that drops rows containing null values. If `how` is "any", then drop rows containing any null values. If `how` is "all", then drop rows only if every column is null for that row.
Returns a new [[DataFrame]] that drops rows containing null values in the specified columns. If `how` is "any", then drop rows containing any null values in the specified columns. If `how` is "all", then drop rows only if every specified column is null for that row.
Returns a new DataFrame that drops rows containing any null values in the specified columns.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values in the specified columns.
Returns a new DataFrame that replaces null values in numeric columns with `value`.
Returns a new DataFrame that replaces null values in string columns with `value`.
Returns a new DataFrame that replaces null values in specified numeric columns. If a specified column is not a numeric column, it is ignored.
Returns a new DataFrame that replaces null values in specified string columns. If a specified column is not a string column, it is ignored.
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. The value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`. For example, the following replaces null values in column "A" with string "unknown", and null values in column "B" with numeric value 1.0. import com.google.common.collect.ImmutableMap; df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height". df.replace("height", ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". df.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. df.replace("*", ImmutableMap.of("UNKNOWN", "unnamed"));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". df.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". df.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed"));
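A minimal sketch of how the drop/fill/replace behavior described above might look from C#; the `Na()` accessor and the exact `Drop`/`Fill`/`Replace` overloads are assumptions modeled on these descriptions (and on the corresponding Spark API), not verified signatures from this diff.

```csharp
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Sql;

static class NullHandlingSketch
{
    // sqlContext is assumed to be an already-created Mobius SqlContext.
    public static void Run(SqlContext sqlContext)
    {
        var df = sqlContext.Read().Json("people.json"); // placeholder path

        // Drop any row that contains at least one null value (hypothetical accessor/overload).
        var noNulls = df.Na().Drop();

        // Keep rows only if they contain at least 2 non-null values in the listed columns.
        var atLeastTwo = df.Na().Drop(2, new[] { "name", "age", "city" });

        // Fill nulls per column: the string column gets "unknown", the numeric column gets 0.0.
        var filled = df.Na().Fill(new Dictionary<string, object>
        {
            { "name", "unknown" },
            { "age", 0.0 }
        });

        // Replace matching values: every 1.0 in "height" becomes 2.0.
        var replaced = df.Na().Replace("height", new Dictionary<double, double> { { 1.0, 2.0 } });

        replaced.Show();
    }
}
```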
Specifies the input data source format.
Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading.
Adds an input option for the underlying data source.
Adds input options for the underlying data source.
Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system).
Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores).
Construct a [[DataFrame]] representing the database table `table`, accessible via JDBC URL `url` and connection properties.
Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url`. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url` using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in.
Loads an Avro file and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
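Read together, the loader descriptions above describe a builder-style reader. Here is a hedged sketch of its use; the paths, option keys, and JDBC connection details are placeholders, and the exact `Jdbc` parameter types are assumed rather than taken from this diff.

```csharp
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Sql;

static class DataFrameReaderSketch
{
    public static void Run(SqlContext sqlContext)
    {
        // Generic form: pick a format, add options, then load from a path.
        var jsonViaLoad = sqlContext.Read()
                                    .Format("json")
                                    .Option("samplingRatio", "1.0") // illustrative option key
                                    .Load("hdfs:///data/people.json");

        // Format-specific helpers.
        var fromJson = sqlContext.Read().Json("hdfs:///data/people.json");
        var fromParquet = sqlContext.Read().Parquet("hdfs:///data/people.parquet");

        // JDBC: url, table name, and connection properties (all values are placeholders).
        var fromJdbc = sqlContext.Read().Jdbc(
            "jdbc:sqlserver://localhost;databaseName=test",
            "dbo.People",
            new Dictionary<string, string> { { "user", "test" }, { "password", "test" } });

        fromParquet.Show();
    }
}
```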
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
Adds an output option for the underlying data source.
Adds output options for the underlying data source.
Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment.
Saves the content of the DataFrame at the specified path.
Saves the content of the DataFrame as the specified table.
Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data to an existing table, format or options will be ignored.
Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored.
Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path)
Saves the content of the DataFrame in Parquet format at the specified path. This is equivalent to: Format("parquet").Save(path)
Saves the content of the DataFrame in Avro format at the specified path. This is equivalent to: Format("com.databricks.spark.avro").Save(path)
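The writer side mirrors the reader. A hedged sketch follows; the `SaveMode` enum usage and the `PartitionBy` parameter form are assumed from the descriptions above, and paths and table names are placeholders.

```csharp
using Microsoft.Spark.CSharp.Sql;

static class DataFrameWriterSketch
{
    public static void Run(DataFrame df)
    {
        // Generic form: behavior on existing data, output format, partition layout, target path.
        df.Write()
          .Mode(SaveMode.Overwrite)
          .Format("parquet")
          .PartitionBy("year", "month") // Hive-style layout; Parquet only, per the note above
          .Save("hdfs:///out/people_parquet");

        // Shorthand helpers, equivalent to Format(...).Save(path).
        df.Write().Mode(SaveMode.Append).Json("hdfs:///out/people_json");
        df.Write().Parquet("hdfs:///out/people_parquet_again");

        // Avro goes through the spark-avro data source package, as described above.
        df.Write().Format("com.databricks.spark.avro").Save("hdfs:///out/people_avro");

        // Persist as a catalog table (requires a Hive-backed context, per the SaveAsTable note).
        df.Write().Mode(SaveMode.ErrorIfExists).SaveAsTable("people_table");
    }
}
```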
--- @@ -688,7 +688,7 @@ ####Methods -
| Name | Description |
| --- | --- |
| Format | Specifies the input data source format. |
| Schema | Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading. |
| Option | Adds an input option for the underlying data source. |
| Options | Adds input options for the underlying data source. |
| Load | Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system). |
| Load | Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores). |
| Jdbc | Construct a [[DataFrame]] representing the database table `table`, accessible via JDBC URL `url` and connection properties. |
| Jdbc | Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url`. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Jdbc | Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url` using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
| Parquet | Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in. |
+
| Name | Description |
| --- | --- |
| Format | Specifies the input data source format. |
| Schema | Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading. |
| Option | Adds an input option for the underlying data source. |
| Options | Adds input options for the underlying data source. |
| Load | Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system). |
| Load | Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores). |
| Jdbc | Construct a [[DataFrame]] representing the database table `table`, accessible via JDBC URL `url` and connection properties. |
| Jdbc | Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url`. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Jdbc | Construct a DataFrame representing the database table `table`, accessible via JDBC URL `url` using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
| Parquet | Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in. |
| Avro | Loads an Avro file and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
--- @@ -705,7 +705,7 @@ ####Methods -
| Name | Description |
| --- | --- |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Format | Specifies the underlying output data source. Built-in options include "parquet", "json", etc. |
| Option | Adds an output option for the underlying data source. |
| Options | Adds output options for the underlying data source. |
| PartitionBy | Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment. |
| Save | Saves the content of the DataFrame at the specified path. |
| Save | Saves the content of the DataFrame as the specified table. |
| InsertInto | Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data to an existing table, format or options will be ignored. |
| SaveAsTable | Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored. |
| Jdbc | Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path) |
| Parquet | Saves the content of the DataFrame in Parquet format at the specified path. This is equivalent to: Format("parquet").Save(path) |
+
| Name | Description |
| --- | --- |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Format | Specifies the underlying output data source. Built-in options include "parquet", "json", etc. |
| Option | Adds an output option for the underlying data source. |
| Options | Adds output options for the underlying data source. |
| PartitionBy | Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment. |
| Save | Saves the content of the DataFrame at the specified path. |
| Save | Saves the content of the DataFrame as the specified table. |
| InsertInto | Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data to an existing table, format or options will be ignored. |
| SaveAsTable | Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored. |
| Jdbc | Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (defaults to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path) |
| Parquet | Saves the content of the DataFrame in Parquet format at the specified path. This is equivalent to: Format("parquet").Save(path) |
| Avro | Saves the content of the DataFrame in Avro format at the specified path. This is equivalent to: Format("com.databricks.spark.avro").Save(path) |
--- diff --git a/csharp/AdapterTest/AccumulatorTest.cs b/csharp/AdapterTest/AccumulatorTest.cs index 24ccfb57..75fb938e 100644 --- a/csharp/AdapterTest/AccumulatorTest.cs +++ b/csharp/AdapterTest/AccumulatorTest.cs @@ -33,7 +33,7 @@ public void TestInitialize() // get accumulator server port and connect to accumuator server int serverPort = (sc.SparkContextProxy as MockSparkContextProxy).AccumulatorServerPort; sock = SocketFactory.CreateSocket(); - sock.Connect(IPAddress.Loopback, serverPort); + sock.Connect(IPAddress.Loopback, serverPort, null); } [TearDown] diff --git a/csharp/AdapterTest/AdapterTest.csproj b/csharp/AdapterTest/AdapterTest.csproj index c32ed7aa..cbea5478 100644 --- a/csharp/AdapterTest/AdapterTest.csproj +++ b/csharp/AdapterTest/AdapterTest.csproj @@ -35,22 +35,25 @@ 4 + + ..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + ..\packages\Moq.4.2.1510.2205\lib\net40\Moq.dll True - - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll True - + ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll - + ..\packages\Razorvine.Serpent.1.12.0.0\lib\net40\Razorvine.Serpent.dll diff --git a/csharp/AdapterTest/DataFrameTest.cs b/csharp/AdapterTest/DataFrameTest.cs index d54a9c31..34a6dfbc 100644 --- a/csharp/AdapterTest/DataFrameTest.cs +++ b/csharp/AdapterTest/DataFrameTest.cs @@ -12,6 +12,7 @@ using Microsoft.Spark.CSharp.Proxy; using NUnit.Framework; using Moq; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest { @@ -65,10 +66,10 @@ public void TestDataFrameCount() [Test] public void TestShow() { - mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny())).Returns("Show"); + mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny(), It.IsAny())).Returns("Show"); var dataFrame = new DataFrame(mockDataFrameProxy.Object, null); dataFrame.Show(); - mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once); + mockDataFrameProxy.Verify(m => m.GetShowString(20, 20, false), Times.Once); } [Test] @@ -135,9 +136,9 @@ public void TestDataFrameCollect() var expectedRows = new Row[] {new MockRow(), new MockRow()}; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123,null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); var dataFrame = new DataFrame(mockDataFrameProxy.Object, null); @@ -838,9 +839,9 @@ public void TestHead() var expectedRows = new Row[] {new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow()}; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); 
mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); @@ -868,9 +869,9 @@ public void TestFirst() var expectedRows = new Row[] { new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow() }; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); @@ -892,9 +893,9 @@ public void TestTake() var expectedRows = new Row[] { new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow() }; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); diff --git a/csharp/AdapterTest/DatasetTest.cs b/csharp/AdapterTest/DatasetTest.cs index 7ee59db9..b9000410 100644 --- a/csharp/AdapterTest/DatasetTest.cs +++ b/csharp/AdapterTest/DatasetTest.cs @@ -38,12 +38,12 @@ public void TestCleanUp() public void TestShow() { Mock mockDataFrameProxy = new Mock(); - mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny())).Returns("Show"); + mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny(), It.IsAny())).Returns("Show"); mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); var dataset = new Dataset(mockDatasetProxy.Object); dataset.Show(); - mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once); + mockDataFrameProxy.Verify(m => m.GetShowString(20, 20, false), Times.Once); } [Test] diff --git a/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs b/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs index a68d4082..60e84fbc 100644 --- a/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs +++ b/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs @@ -9,6 +9,7 @@ using System.Net; using System.Net.Sockets; using System.IO; +using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Sql; using Razorvine.Pickle; using Microsoft.Spark.CSharp.Proxy; @@ -64,7 +65,7 @@ public string GetExecutedPlan() throw new NotImplementedException(); } - public string GetShowString(int numberOfRows, bool truncate) + public string GetShowString(int numberOfRows, int truncate, bool vertical) { throw new NotImplementedException(); } @@ -240,7 +241,12 @@ public IDataFrameProxy Sample(bool withReplacement, double fraction, long seed) throw new NotImplementedException(); } - public IDataFrameWriterProxy Write() + public IDataFrameProxy Broadcast() + { + throw new NotImplementedException(); + } + + public IDataFrameWriterProxy Write() { throw new NotImplementedException(); } diff --git 
a/csharp/AdapterTest/Mocks/MockRDDCollector.cs b/csharp/AdapterTest/Mocks/MockRDDCollector.cs index 2ec5c627..e9c8c5c7 100644 --- a/csharp/AdapterTest/Mocks/MockRDDCollector.cs +++ b/csharp/AdapterTest/Mocks/MockRDDCollector.cs @@ -4,12 +4,13 @@ using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest.Mocks { class MockRDDCollector : IRDDCollector { - public IEnumerable Collect(int port, SerializedMode serializedMode, Type type) + public IEnumerable Collect(SocketInfo port, SerializedMode serializedMode, Type type) { throw new NotImplementedException(); } diff --git a/csharp/AdapterTest/Mocks/MockRddProxy.cs b/csharp/AdapterTest/Mocks/MockRddProxy.cs index 03b01427..9188ea40 100644 --- a/csharp/AdapterTest/Mocks/MockRddProxy.cs +++ b/csharp/AdapterTest/Mocks/MockRddProxy.cs @@ -15,6 +15,7 @@ using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Interop.Ipc; using NUnit.Framework; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest.Mocks { @@ -60,7 +61,7 @@ public IRDDProxy Union(IRDDProxy javaRddReferenceOther) return union; } - public int CollectAndServe() + public SocketInfo CollectAndServe() { return MockSparkContextProxy.RunJob(this); } diff --git a/csharp/AdapterTest/Mocks/MockRow.cs b/csharp/AdapterTest/Mocks/MockRow.cs index bfa5b73b..a6a9a86e 100644 --- a/csharp/AdapterTest/Mocks/MockRow.cs +++ b/csharp/AdapterTest/Mocks/MockRow.cs @@ -8,6 +8,13 @@ namespace AdapterTest.Mocks { public class MockRow : Row { + public override dynamic[] Values + { + get + { + throw new NotImplementedException(); + } + } public override int Size() { diff --git a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs index 609e591c..da8b853c 100644 --- a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs +++ b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs @@ -195,7 +195,7 @@ public IUDFProxy CreateUserDefinedCSharpFunction(string name, byte[] command, st throw new NotImplementedException(); } - internal static int RunJob(IRDDProxy rdd) + internal static SocketInfo RunJob(IRDDProxy rdd) { var mockRdd = (rdd as MockRddProxy); IEnumerable result = mockRdd.pickle ? 
mockRdd.result.Cast() : @@ -222,10 +222,12 @@ internal static int RunJob(IRDDProxy rdd) ns.Flush(); } }); - return (listener.LocalEndPoint as IPEndPoint).Port; + + SocketInfo socketInfo = new SocketInfo((listener.LocalEndPoint as IPEndPoint).Port, null); + return socketInfo; } - public int RunJob(IRDDProxy rdd, IEnumerable partitions) + public SocketInfo RunJob(IRDDProxy rdd, IEnumerable partitions) { return RunJob(rdd); } diff --git a/csharp/AdapterTest/SocketWrapperTest.cs b/csharp/AdapterTest/SocketWrapperTest.cs index 3c7fac3d..63c2ef8d 100644 --- a/csharp/AdapterTest/SocketWrapperTest.cs +++ b/csharp/AdapterTest/SocketWrapperTest.cs @@ -86,9 +86,9 @@ private void SocketTest(ISocketWrapper serverSocket) Assert.Throws(() => clientSock.GetStream()); Assert.Throws(() => clientSock.Receive()); Assert.Throws(() => clientSock.Send(null)); - Assert.Throws(() => clientSock.Connect(IPAddress.Any, 1024)); + Assert.Throws(() => clientSock.Connect(IPAddress.Any, 1024, null)); - clientSock.Connect(IPAddress.Loopback, port); + clientSock.Connect(IPAddress.Loopback, port, null); // Valid invalid operation var byteBuf = ByteBufPool.Default.Allocate(); diff --git a/csharp/AdapterTest/TestWithMoqDemo.cs b/csharp/AdapterTest/TestWithMoqDemo.cs index 337794b5..706413c3 100644 --- a/csharp/AdapterTest/TestWithMoqDemo.cs +++ b/csharp/AdapterTest/TestWithMoqDemo.cs @@ -80,7 +80,7 @@ public void TestInitialize() ns.Flush(); } }); - return (listener.LocalEndPoint as IPEndPoint).Port; + return new SocketInfo((listener.LocalEndPoint as IPEndPoint).Port, null); }); _mockRddProxy.Setup(m => m.RDDCollector).Returns(new RDDCollector()); diff --git a/csharp/AdapterTest/packages.config b/csharp/AdapterTest/packages.config index c3a926b6..c7cc11eb 100644 --- a/csharp/AdapterTest/packages.config +++ b/csharp/AdapterTest/packages.config @@ -1,10 +1,11 @@  + - + + - \ No newline at end of file diff --git a/csharp/Repl/Repl.csproj b/csharp/Repl/Repl.csproj index 35d8bd68..faf98f4f 100644 --- a/csharp/Repl/Repl.csproj +++ b/csharp/Repl/Repl.csproj @@ -34,6 +34,9 @@ false + + ..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + False ..\packages\Microsoft.Net.Compilers.1.1.1\tools\Microsoft.CodeAnalysis.dll @@ -50,11 +53,13 @@ False ..\packages\Microsoft.Net.Compilers.1.1.1\tools\Microsoft.CodeAnalysis.Scripting.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll + - False ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll - + ..\packages\Razorvine.Serpent.1.12.0.0\lib\net40\Razorvine.Serpent.dll diff --git a/csharp/Repl/packages.config b/csharp/Repl/packages.config index 76ea838a..7c1ac611 100644 --- a/csharp/Repl/packages.config +++ b/csharp/Repl/packages.config @@ -1,8 +1,8 @@  - + - - - + + + \ No newline at end of file diff --git a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs index 5f4e5b49..cb6bac8c 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs @@ -1867,5 +1867,72 @@ internal static void DFSaveSample() SparkCLRSamples.FileSystemHelper.DeleteDirectory(path, true); Console.WriteLine("Remove directory: {0}", path); } + + /// + /// Single UDF Sample + /// + [Sample] + internal static void SingleUDFSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + 
sqlContext.RegisterFunction("UDF", (int x, int y) => { return x + y; }); + + var rowSet = sqlContext.Sql("SELECT * FROM peopleDataFrame where UDF(age, 20) > 60"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count() ,2); + } + } + + /// + /// Single UDF Sample with duplicate values + /// + [Sample] + internal static void SingleUDFWithDupSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + sqlContext.RegisterFunction("UDF", (int x, int y) => { return x + y; }); + + var rowSet = sqlContext.Sql("SELECT * FROM peopleDataFrame where UDF(age, age) < 50"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count(), 1); + } + } + + /// + /// Multiple UDFs sample + /// + [Sample] + internal static void MultipleUDFSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + sqlContext.RegisterFunction("UDF1", (int x, int y) => { return x + y; }); + sqlContext.RegisterFunction("UDF2", (string name, string id) => { return name + ":" + id; }); + + var rowSet = sqlContext.Sql("SELECT id, name, UDF1(age, 20) AS UDF1, UDF2(name, id) AS UDF2 FROM peopleDataFrame where UDF1(age, 20) > 60"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count(), 2); + } + } } } diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs index 1f25fa26..f9b5af55 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs @@ -66,8 +66,10 @@ static void Main(string[] args) if (Configuration.IsValidationEnabled && !status) { - Environment.Exit(1); + Environment.Exit(2); } + + Environment.Exit(1); } // Creates and returns a context diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj index 880feb27..d28e1d69 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj +++ b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj @@ -33,9 +33,11 @@ 4 - - ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll - True + + ..\..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + + + ..\..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll diff --git a/csharp/Samples/Microsoft.Spark.CSharp/packages.config b/csharp/Samples/Microsoft.Spark.CSharp/packages.config index 4abe7e92..fc5be339 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/packages.config +++ b/csharp/Samples/Microsoft.Spark.CSharp/packages.config @@ -1,5 +1,6 @@  - + + \ No newline at end of file diff --git a/csharp/Tests.Common/Tests.Common.csproj b/csharp/Tests.Common/Tests.Common.csproj index 361031e7..a2ca2c97 100644 --- a/csharp/Tests.Common/Tests.Common.csproj +++ b/csharp/Tests.Common/Tests.Common.csproj @@ -36,11 +36,10 @@ 4 - - False - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll - + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll + ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll diff --git a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs 
b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs new file mode 100644 index 00000000..02228494 --- /dev/null +++ b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Proxy.Ipc; + +namespace Microsoft.Spark.CSharp.Utils.FileSystem +{ + /// + /// See https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileStatus.html + /// + public class HdfsFileStatus + { + public long Length => _status.Value.Length; + public long ModificationTime => _status.Value.Time; + public string Owner => _status.Value.Owner; + public string Path => _status.Value.Path; + public bool IsFile => _status.Value.IsFile; + public bool IsDirectory => _status.Value.IsDirectory; + public bool IsSymlink => _status.Value.IsSymlink; + + private Lazy _status; + + internal HdfsFileStatus(JvmObjectReference obj) + { + _status = new Lazy(()=>new Status(obj)); + } + + private class Status + { + public long Length; + public long Time; + public string Owner; + public string Path; + public bool IsFile; + public bool IsDirectory; + public bool IsSymlink; + + public Status(JvmObjectReference obj) + { + Length = (long) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getLen"); + Time = (long)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getModificationTime"); + Owner = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getOwner"); + IsFile = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isFile"); + IsDirectory = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isDirectory"); + IsSymlink = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isSymlink"); + var pr = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getPath")); + Path = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pr, "getName"); + } + } + } +} diff --git a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs index 52d20c3b..c88c93b1 100644 --- a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs +++ b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs @@ -4,8 +4,11 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy.Ipc; +using Microsoft.Spark.CSharp.Utils.FileSystem; namespace Microsoft.Spark.CSharp.Utils { @@ -18,7 +21,7 @@ public class HdfsFileSystemHelper : IFileSystemHelper public HdfsFileSystemHelper() { - var jvmConfReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.conf.Configuration"); + var jvmConfReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.conf.Configuration"); jvmHdfsReference = new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.hadoop.fs.FileSystem", "get", jvmConfReference)); } @@ -39,16 +42,25 @@ public IEnumerable EnumerateFiles(string path) for (var i = 0; i < statusList.Count; i++) { var subPathJvmReference = new 
JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(statusList[i], "getPath")); - files[i] = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(subPathJvmReference, "getName"); + files[i] = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(subPathJvmReference, "getName"); } return files; } - /// - /// Build a temp file path under '/tmp' path on HDFS. - /// - public string GetTempFileName() + /// + /// List the names of all the files under the given path. + /// + public IEnumerable ListStatus(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return ((List)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "listStatus", pathJvmReference)).Select(r=>new HdfsFileStatus(r)); + } + + /// + /// Build a temp file path under '/tmp' path on HDFS. + /// + public string GetTempFileName() { return "/tmp/" + Guid.NewGuid().ToString("N"); } @@ -91,5 +103,37 @@ internal bool Delete(string path, bool recursive) var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "delete", pathJvmReference, recursive); } - } + + public bool IsFile(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "isFile", pathJvmReference); + } + + public bool IsDirectory(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "isDirectory", pathJvmReference); + } + + public bool Touch(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "createNewFile", pathJvmReference); + } + + public void CopyFromLocalFile(string src, string dest) + { + var from = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", new Uri(src).AbsoluteUri); + var to = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", dest); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "copyFromLocalFile", from, to); + } + + public void CopyToLocalFile(string src, string dest) + { + var to = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", new Uri(dest).AbsoluteUri); + var from = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", src); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "copyToLocalFile", from, to); + } + } } diff --git a/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj b/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj index 60657c71..d089d7db 100644 --- a/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj +++ b/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj @@ -40,6 +40,7 @@ + diff --git a/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs b/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs index f73e90b8..6fb9e1a0 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs @@ -111,7 +111,8 @@ private void StartDaemonServer(ISocketWrapper listener) bool sparkReuseWorker = false; string envVar = 
Environment.GetEnvironmentVariable("SPARK_REUSE_WORKER"); // this envVar is set in JVM side - if ((envVar != null) && envVar.Equals("1")) + var secret = Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_SECRET"); + if ((envVar != null) && envVar.Equals("1")) { sparkReuseWorker = true; } @@ -130,7 +131,7 @@ private void StartDaemonServer(ISocketWrapper listener) SerDe.Write(s, trId); // write taskRunnerId to JVM side s.Flush(); } - TaskRunner taskRunner = new TaskRunner(trId, socket, sparkReuseWorker); + TaskRunner taskRunner = new TaskRunner(trId, socket, sparkReuseWorker, secret); waitingTaskRunners.Add(taskRunner); taskRunnerRegistry[trId] = taskRunner; trId++; diff --git a/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs b/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs index fb88e431..fb398567 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs @@ -3,7 +3,9 @@ using System; using System.IO; +using System.Net; using System.Runtime.CompilerServices; +using System.Text; using System.Threading; using Microsoft.Spark.CSharp.Configuration; using Microsoft.Spark.CSharp.Interop.Ipc; @@ -13,106 +15,116 @@ [assembly: InternalsVisibleTo("WorkerTest")] namespace Microsoft.Spark.CSharp { - /// - /// TaskRunner is used to run Spark task assigned by JVM side. It uses a TCP socket to - /// communicate with JVM side. This socket may be reused to run multiple Spark tasks. - /// - internal class TaskRunner - { - private static ILoggerService logger; - private static ILoggerService Logger - { - get - { - if (logger != null) return logger; - logger = LoggerServiceFactory.GetLogger(typeof(TaskRunner)); - return logger; - } - } + /// + /// TaskRunner is used to run Spark task assigned by JVM side. It uses a TCP socket to + /// communicate with JVM side. This socket may be reused to run multiple Spark tasks. 
+ /// + internal class TaskRunner + { + private static ILoggerService logger; + private static ILoggerService Logger + { + get + { + if (logger != null) return logger; + logger = LoggerServiceFactory.GetLogger(typeof(TaskRunner)); + return logger; + } + } - private readonly ISocketWrapper socket; // Socket to communicate with JVM - private volatile bool stop; - private readonly bool socketReuse; // whether the socket can be reused to run multiple Spark tasks + private readonly ISocketWrapper socket; // Socket to communicate with JVM + private volatile bool stop; + private readonly bool socketReuse; // whether the socket can be reused to run multiple Spark tasks + private string secret; - /// - /// Task runner Id - /// - public int TaskId { get; private set; } + /// + /// Task runner Id + /// + public int TaskId { get; private set; } - public TaskRunner(int trId, ISocketWrapper socket, bool socketReuse) - { - TaskId = trId; - this.socket = socket; - this.socketReuse = socketReuse; - } + public TaskRunner(int trId, ISocketWrapper socket, bool socketReuse, string secret) + { + TaskId = trId; + this.socket = socket; + this.socketReuse = socketReuse; + this.secret = secret; + } - public void Run() - { - Logger.LogInfo("TaskRunner [{0}] is running ...", TaskId); + public void Run() + { + Logger.LogInfo("TaskRunner [{0}] is running ...", TaskId); - try - { - while (!stop) - { - using (var inputStream = socket.GetInputStream()) - using (var outputStream = socket.GetOutputStream()) - { - byte[] bytes = SerDe.ReadBytes(inputStream, sizeof(int)); - if (bytes != null) - { - int splitIndex = SerDe.ToInt(bytes); - bool readComplete = Worker.ProcessStream(inputStream, outputStream, splitIndex); - outputStream.Flush(); - if (!readComplete) // if the socket is not read through completely, then it can't be reused - { - stop = true; - // wait for server to complete, otherwise server may get 'connection reset' exception - Logger.LogInfo("Sleep 500 millisecond to close socket ..."); - Thread.Sleep(500); - } - else if (!socketReuse) - { - stop = true; - // wait for server to complete, otherwise server gets 'connection reset' exception - // Use SerDe.ReadBytes() to detect java side has closed socket properly - // ReadBytes() will block until the socket is closed - Logger.LogInfo("waiting JVM side to close socket..."); - SerDe.ReadBytes(inputStream); - Logger.LogInfo("JVM side has closed socket"); - } - } - else - { - stop = true; - Logger.LogWarn("read null splitIndex, socket is closed by JVM"); - } - } - } - } - catch (Exception e) - { - stop = true; - Logger.LogError("TaskRunner [{0}] exeption, will dispose this TaskRunner", TaskId); - Logger.LogException(e); - } - finally - { - try - { - socket.Close(); - } - catch (Exception ex) - { - Logger.LogWarn("close socket exception: {0}", ex); - } - Logger.LogInfo("TaskRunner [{0}] finished", TaskId); - } - } + try + { + while (!stop) + { + using (var inputStream = socket.GetInputStream()) + using (var outputStream = socket.GetOutputStream()) + { + if (!string.IsNullOrEmpty(secret)) + { + SerDe.Write(outputStream, secret); + outputStream.Flush(); + var reply = SerDe.ReadString(inputStream); + Logger.LogDebug("Connect back to JVM: " + reply); + secret = null; + } + byte[] bytes = SerDe.ReadBytes(inputStream, sizeof(int)); + if (bytes != null) + { + int splitIndex = SerDe.ToInt(bytes); + bool readComplete = Worker.ProcessStream(inputStream, outputStream, splitIndex); + outputStream.Flush(); + if (!readComplete) // if the socket is not read through completely, then it 
can't be reused + { + stop = true; + // wait for server to complete, otherwise server may get 'connection reset' exception + Logger.LogInfo("Sleep 500 millisecond to close socket ..."); + Thread.Sleep(500); + } + else if (!socketReuse) + { + stop = true; + // wait for server to complete, otherwise server gets 'connection reset' exception + // Use SerDe.ReadBytes() to detect java side has closed socket properly + // ReadBytes() will block until the socket is closed + Logger.LogInfo("waiting JVM side to close socket..."); + SerDe.ReadBytes(inputStream); + Logger.LogInfo("JVM side has closed socket"); + } + } + else + { + stop = true; + Logger.LogWarn("read null splitIndex, socket is closed by JVM"); + } + } + } + } + catch (Exception e) + { + stop = true; + Logger.LogError("TaskRunner [{0}] exeption, will dispose this TaskRunner", TaskId); + Logger.LogException(e); + } + finally + { + try + { + socket.Close(); + } + catch (Exception ex) + { + Logger.LogWarn("close socket exception: {0}", ex); + } + Logger.LogInfo("TaskRunner [{0}] finished", TaskId); + } + } - public void Stop() - { - Logger.LogInfo("try to stop TaskRunner [{0}]", TaskId); - stop = true; - } - } + public void Stop() + { + Logger.LogInfo("try to stop TaskRunner [{0}]", TaskId); + stop = true; + } + } } diff --git a/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs b/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs new file mode 100644 index 00000000..43cf6b5c --- /dev/null +++ b/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs @@ -0,0 +1,391 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Services; +using Microsoft.Spark.CSharp.Sql; +using Razorvine.Pickle; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.Serialization; +using System.Runtime.Serialization.Formatters.Binary; + +namespace Microsoft.Spark.CSharp +{ + /// + /// This class execute user defined methods. 
+ /// + + internal class UDFCommand + { + private readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); + private ILoggerService logger; + private Stream inputStream; + private Stream outputStream; + private int splitIndex; + private DateTime bootTime; + private string deserializerMode; + private string serializerMode; + private IFormatter formatter; + private Stopwatch commandProcessWatch; + private int isSqlUdf; + private List workerFuncList; + private int stageId; + + public UDFCommand(Stream inputStream, Stream outputStream, int splitIndex, DateTime bootTime, + string deserializerMode, string serializerMode, IFormatter formatter, + Stopwatch commandProcessWatch, int isSqlUdf, List workerFuncList, int stageId) + { + this.inputStream = inputStream; + this.outputStream = outputStream; + this.splitIndex = splitIndex; + this.bootTime = bootTime; + this.deserializerMode = deserializerMode; + this.serializerMode = serializerMode; + this.formatter = formatter; + this.commandProcessWatch = commandProcessWatch; + this.isSqlUdf = isSqlUdf; + this.workerFuncList = workerFuncList; + this.stageId = stageId; + + InitializeLogger(); + } + + private void InitializeLogger() + { + try + { + // if there exists exe.config file, then use log4net + if (File.Exists(AppDomain.CurrentDomain.SetupInformation.ConfigurationFile)) + { + LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); + } + + logger = LoggerServiceFactory.GetLogger(typeof(UDFCommand)); + } + catch (Exception e) + { + Console.WriteLine("InitializeLogger exception {0}, will exit", e); + Environment.Exit(-1); + } + } + + internal void Execute() + { + if (isSqlUdf == 0) + { + ExecuteNonSqlUDF(); + } + else + { + ExecuteSqlUDF(); + } + } + + private void ExecuteNonSqlUDF() + { + int count = 0; + int nullMessageCount = 0; + logger.LogDebug("Beginning to execute non sql func"); + WorkerFunc workerFunc = workerFuncList[0]; + var func = workerFunc.CharpWorkerFunc.Func; + + var funcProcessWatch = Stopwatch.StartNew(); + DateTime initTime = DateTime.UtcNow; + foreach (var message in func(splitIndex, GetIterator(inputStream, deserializerMode, isSqlUdf))) + { + funcProcessWatch.Stop(); + + if (object.ReferenceEquals(null, message)) + { + nullMessageCount++; + continue; + } + + try + { + WriteOutput(outputStream, serializerMode, message, formatter); + } + catch (Exception ex) + { + logger.LogError("WriteOutput() failed at iteration {0}, execption {1}", count, ex); + throw; + } + + count++; + funcProcessWatch.Start(); + } + + logger.LogInfo("Output entries count: " + count); + logger.LogDebug("Null messages count: " + nullMessageCount); + + WriteDiagnosticsInfo(outputStream, bootTime, initTime); + + commandProcessWatch.Stop(); + + // log statistics + logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); + logger.LogInfo("stage {0}, command process time: {1}", stageId, commandProcessWatch.ElapsedMilliseconds); + } + + private void ExecuteSqlUDF() + { + int count = 0; + int nullMessageCount = 0; + logger.LogDebug("Beginning to execute sql func"); + + var funcProcessWatch = Stopwatch.StartNew(); + DateTime initTime = DateTime.UtcNow; + + foreach (var row in GetIterator(inputStream, deserializerMode, isSqlUdf)) + { + List messages = new List(); + + foreach (WorkerFunc workerFunc in workerFuncList) + { + List args = new List(); + foreach (int offset in workerFunc.ArgOffsets) + { + args.Add(row[offset]); + } + + foreach (var message in workerFunc.CharpWorkerFunc.Func(splitIndex, new[] { 
args.ToArray()})) + { + funcProcessWatch.Stop(); + + if (object.ReferenceEquals(null, message)) + { + nullMessageCount++; + continue; + } + + messages.Add(message); + } + } + + try + { + dynamic res = messages.ToArray(); + if (messages.Count == 1) + { + res = messages[0]; + } + + WriteOutput(outputStream, serializerMode, res, formatter); + } + catch (Exception ex) + { + logger.LogError("WriteOutput() failed at iteration {0}, exception error {1}", count, ex.Message); + throw; + } + + count++; + funcProcessWatch.Start(); + } + + logger.LogInfo("Output entries count: " + count); + logger.LogDebug("Null messages count: " + nullMessageCount); + + WriteDiagnosticsInfo(outputStream, bootTime, initTime); + + commandProcessWatch.Stop(); + + // log statistics + logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); + logger.LogInfo("stage {0}, command process time: {0}", stageId, commandProcessWatch.ElapsedMilliseconds); + } + + private void WriteOutput(Stream networkStream, string serializerMode, dynamic message, IFormatter formatter) + { + var buffer = GetSerializedMessage(serializerMode, message, formatter); + if (buffer == null) + { + logger.LogError("Buffer is null"); + } + + if (buffer.Length <= 0) + { + logger.LogError("Buffer length {0} cannot be <= 0", buffer.Length); + } + + SerDe.Write(networkStream, buffer.Length); + SerDe.Write(networkStream, buffer); + } + + private byte[] GetSerializedMessage(string serializerMode, dynamic message, IFormatter formatter) + { + byte[] buffer; + + switch ((SerializedMode)Enum.Parse(typeof(SerializedMode), serializerMode)) + { + case SerializedMode.None: + buffer = message as byte[]; + break; + + case SerializedMode.String: + buffer = SerDe.ToBytes(message as string); + break; + + case SerializedMode.Row: + var pickler = new Pickler(); + buffer = pickler.dumps(new ArrayList { message }); + break; + + default: + try + { + var ms = new MemoryStream(); + formatter.Serialize(ms, message); + buffer = ms.ToArray(); + } + catch (Exception ex) + { + logger.LogError("Exception serializing output: " + ex); + logger.LogError("{0} : {1}", message.GetType().Name, message.GetType().FullName); + throw; + } + break; + } + + return buffer; + } + + private void WriteDiagnosticsInfo(Stream networkStream, DateTime bootTime, DateTime initTime) + { + DateTime finishTime = DateTime.UtcNow; + const string format = "MM/dd/yyyy hh:mm:ss.fff tt"; + + logger.LogDebug("bootTime: {0}, initTime: {1}, finish_time: {2}", + bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format)); + + SerDe.Write(networkStream, (int)SpecialLengths.TIMING_DATA); + SerDe.Write(networkStream, ToUnixTime(bootTime)); + SerDe.Write(networkStream, ToUnixTime(initTime)); + SerDe.Write(networkStream, ToUnixTime(finishTime)); + + SerDe.Write(networkStream, 0L); //shuffle.MemoryBytesSpilled + SerDe.Write(networkStream, 0L); //shuffle.DiskBytesSpilled + } + + private long ToUnixTime(DateTime dt) + { + return (long)(dt - UnixTimeEpoch).TotalMilliseconds; + } + + private IEnumerable GetIterator(Stream inputStream, string serializedMode, int isFuncSqlUdf) + { + logger.LogInfo("Serialized mode in GetIterator: " + serializedMode); + IFormatter formatter = new BinaryFormatter(); + var mode = (SerializedMode)Enum.Parse(typeof(SerializedMode), serializedMode); + int messageLength; + Stopwatch watch = Stopwatch.StartNew(); + Row tempRow = null; + + while ((messageLength = SerDe.ReadInt(inputStream)) != (int)SpecialLengths.END_OF_DATA_SECTION) + { + watch.Stop(); + if 
(messageLength > 0 || messageLength == (int)SpecialLengths.NULL) + { + watch.Start(); + byte[] buffer = messageLength > 0 ? SerDe.ReadBytes(inputStream, messageLength) : null; + watch.Stop(); + switch (mode) + { + case SerializedMode.String: + { + if (messageLength > 0) + { + if (buffer == null) + { + logger.LogDebug("Buffer is null. Message length is {0}", messageLength); + } + yield return SerDe.ToString(buffer); + } + else + { + yield return null; + } + break; + } + + case SerializedMode.Row: + { + Debug.Assert(messageLength > 0); + var unpickledObjects = PythonSerDe.GetUnpickledObjects(buffer); + + if (isFuncSqlUdf == 0) + { + foreach (var row in unpickledObjects.Select(item => (item as RowConstructor).GetRow())) + { + yield return row; + } + } + else + { + foreach (var row in unpickledObjects) + { + yield return row; + } + } + + break; + } + + case SerializedMode.Pair: + { + byte[] pairKey = buffer; + byte[] pairValue; + + watch.Start(); + int valueLength = SerDe.ReadInt(inputStream); + if (valueLength > 0) + { + pairValue = SerDe.ReadBytes(inputStream, valueLength); + } + else if (valueLength == (int)SpecialLengths.NULL) + { + pairValue = null; + } + else + { + throw new Exception(string.Format("unexpected valueLength: {0}", valueLength)); + } + watch.Stop(); + + yield return new Tuple(pairKey, pairValue); + break; + } + + case SerializedMode.None: //just return raw bytes + { + yield return buffer; + break; + } + + default: + { + if (buffer != null) + { + var ms = new MemoryStream(buffer); + yield return formatter.Deserialize(ms); + } + else + { + yield return null; + } + break; + } + } + } + watch.Start(); + } + + logger.LogInfo("total receive time: {0}", watch.ElapsedMilliseconds); + } + } +} diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs index 486a1bc7..c034ca6c 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs @@ -2,7 +2,6 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. 
using System; -using System.Collections; using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; @@ -17,8 +16,6 @@ using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Services; -using Microsoft.Spark.CSharp.Sql; -using Razorvine.Pickle; namespace Microsoft.Spark.CSharp { @@ -31,7 +28,6 @@ namespace Microsoft.Spark.CSharp /// public class Worker { - private static readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); private static ILoggerService logger; private static SparkCLRAssemblyHandler assemblyHandler; @@ -81,11 +77,13 @@ private static void RunSimpleWorker() InitializeLogger(); logger.LogInfo("RunSimpleWorker ..."); PrintFiles(); - - int javaPort = int.Parse(Console.ReadLine()); //reading port number written from JVM - logger.LogDebug("Port number used to pipe in/out data between JVM and CLR {0}", javaPort); + //int javaPort = int.Parse(Console.ReadLine()); //reading port number written from JVM + var javaPort = int.Parse(Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT")); + var secret = Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_SECRET"); + logger.LogDebug("Port and secret number used to pipe in/out data between JVM and CLR {0} {1}", javaPort, secret); var socket = InitializeSocket(javaPort); - TaskRunner taskRunner = new TaskRunner(0, socket, false); + //Microsoft.Spark.CSharp.Network.Utils.DoServerAuth(socket, secret); + TaskRunner taskRunner = new TaskRunner(0, socket, false, secret); taskRunner.Run(); } catch (Exception e) @@ -119,7 +117,7 @@ public static void InitializeLogger() private static ISocketWrapper InitializeSocket(int javaPort) { var socket = SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, javaPort); + socket.Connect(IPAddress.Loopback, javaPort, null); return socket; } @@ -138,9 +136,13 @@ public static bool ProcessStream(Stream inputStream, Stream outputStream, int sp //// initialize global state //shuffle.MemoryBytesSpilled = 0 //shuffle.DiskBytesSpilled = 0 + SerDe.ReadInt(inputStream); + SerDe.ReadInt(inputStream); + SerDe.ReadInt(inputStream); + SerDe.ReadLong(inputStream); - // fetch name of workdir - string sparkFilesDir = SerDe.ReadString(inputStream); + // fetch name of workdir + string sparkFilesDir = SerDe.ReadString(inputStream); logger.LogDebug("spark_files_dir: " + sparkFilesDir); //SparkFiles._root_directory = sparkFilesDir //SparkFiles._is_running_on_worker = True @@ -149,7 +151,7 @@ public static bool ProcessStream(Stream inputStream, Stream outputStream, int sp ProcessBroadcastVariables(inputStream); - Accumulator.threadLocalAccumulatorRegistry = new Dictionary(); + Accumulator.threadLocalAccumulatorRegistry = new Dictionary(); var formatter = ProcessCommand(inputStream, outputStream, splitIndex, bootTime); @@ -255,94 +257,117 @@ private static IFormatter ProcessCommand(Stream inputStream, Stream outputStream logger.LogDebug("Is func Sql UDF = {0}", isSqlUdf); IFormatter formatter = new BinaryFormatter(); + UDFCommand command = null; if (isSqlUdf == 0) { - logger.LogDebug("Processing non-UDF command"); - int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); - logger.LogDebug("Command length: " + lengthOfCommandByteArray); + command = ProcessNonUdfCommand(inputStream, outputStream, splitIndex, bootTime, formatter, isSqlUdf); + } + else + { + command = ProcessUdfCommand(inputStream, outputStream, splitIndex, bootTime, formatter, isSqlUdf); + } + + if (command != null) + { + 
command.Execute(); + } + + return formatter; + } + + private static UDFCommand ProcessNonUdfCommand(Stream inputStream, Stream outputStream, int splitIndex, + DateTime bootTime, IFormatter formatter, int isSqlUdf) + { + logger.LogDebug("Processing non-UDF command"); + int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); + logger.LogDebug("Command length: " + lengthOfCommandByteArray); + + UDFCommand command = null; + if (lengthOfCommandByteArray > 0) + { + var commandProcessWatch = new Stopwatch(); + commandProcessWatch.Start(); + + int stageId; + string deserializerMode; + string serializerMode; + CSharpWorkerFunc cSharpWorkerFunc; + ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, + out cSharpWorkerFunc); + + command = new UDFCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, + serializerMode, formatter, commandProcessWatch, isSqlUdf, + new List() { new WorkerFunc(cSharpWorkerFunc, 0, null) }, stageId); - if (lengthOfCommandByteArray > 0) - { - var commandProcessWatch = new Stopwatch(); - commandProcessWatch.Start(); - - int stageId; - string deserializerMode; - string serializerMode; - CSharpWorkerFunc workerFunc; - ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, - out workerFunc); - - ExecuteCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, workerFunc, serializerMode, - formatter, commandProcessWatch, stageId, isSqlUdf); - } - else - { - logger.LogWarn("lengthOfCommandByteArray = 0. Nothing to execute :-("); - } } else { - logger.LogDebug("Processing UDF command"); - var udfCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of UDFs = {0}", udfCount); + logger.LogWarn("lengthOfCommandByteArray = 0. 
Nothing to execute :-("); + } - if (udfCount == 1) + return command; + } + + private static UDFCommand ProcessUdfCommand(Stream inputStream, Stream outputStream, int splitIndex, + DateTime bootTime, IFormatter formatter, int isSqlUdf) + { + logger.LogDebug("Processing UDF command"); + var udfCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of UDFs = {0}", udfCount); + + int stageId = -1; + string deserializerMode = null; + string serializerMode = null; + var commandProcessWatch = new Stopwatch(); + List workerFuncList = new List(); + + for(int udfIter = 0; udfIter < udfCount; udfIter++) + { + CSharpWorkerFunc func = null; + var argCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of args = {0}", argCount); + + List argOffsets = new List(); + for (int argIndex = 0; argIndex < argCount; argIndex++) { - CSharpWorkerFunc func = null; - var argCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of args = {0}", argCount); + var offset = SerDe.ReadInt(inputStream); + logger.LogDebug("UDF argIndex = {0}, Offset = {1}", argIndex, offset); + argOffsets.Add(offset); + } + + var chainedFuncCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of chained func = {0}", chainedFuncCount); - var argOffsets = new List(); + for (int funcIndex = 0; funcIndex < chainedFuncCount; funcIndex++) + { + int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); + logger.LogDebug("UDF command length: " + lengthOfCommandByteArray); - for (int argIndex = 0; argIndex < argCount; argIndex++) + if (lengthOfCommandByteArray > 0) { - var offset = SerDe.ReadInt(inputStream); - logger.LogDebug("UDF argIndex = {0}, Offset = {1}", argIndex, offset); - argOffsets.Add(offset); + CSharpWorkerFunc workerFunc; + ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, + out workerFunc); + + func = func == null ? workerFunc : CSharpWorkerFunc.Chain(func, workerFunc); } - var chainedFuncCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of chained func = {0}", chainedFuncCount); - - var commandProcessWatch = new Stopwatch(); - int stageId = -1; - string deserializerMode = null; - string serializerMode = null; - for (int funcIndex = 0; funcIndex < chainedFuncCount; funcIndex++) + else { - int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); - logger.LogDebug("UDF command length: " + lengthOfCommandByteArray) - ; - - if (lengthOfCommandByteArray > 0) - { - CSharpWorkerFunc workerFunc; - ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, - out workerFunc); - - func = func == null ? workerFunc : CSharpWorkerFunc.Chain(func, workerFunc); - } - else - { - logger.LogWarn("UDF lengthOfCommandByteArray = 0. Nothing to execute :-("); - } + logger.LogWarn("UDF lengthOfCommandByteArray = 0. 
Nothing to execute :-("); } - - Debug.Assert(stageId != -1); - Debug.Assert(deserializerMode != null); - Debug.Assert(serializerMode != null); - Debug.Assert(func != null); - ExecuteCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, func, serializerMode, formatter, - commandProcessWatch, stageId, isSqlUdf); - } - else - { - throw new NotSupportedException(); //TODO - add support for multiple UDFs } + + Debug.Assert(stageId != -1); + Debug.Assert(deserializerMode != null); + Debug.Assert(serializerMode != null); + Debug.Assert(func != null); + + workerFuncList.Add(new WorkerFunc(func, argCount, argOffsets)); } - return formatter; + return new UDFCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, + serializerMode, formatter, commandProcessWatch, isSqlUdf, workerFuncList, stageId); } private static void ReadCommand(Stream networkStream, IFormatter formatter, out int stageId, @@ -388,116 +413,7 @@ private static IFormatter ProcessCommand(Stream inputStream, Stream outputStream "--------------------------------------------------------------------------------------------------------------"); logger.LogDebug(sb.ToString()); } - - private static void ExecuteCommand(Stream inputStream, Stream outputStream, int splitIndex, DateTime bootTime, - string deserializerMode, CSharpWorkerFunc workerFunc, string serializerMode, - IFormatter formatter, Stopwatch commandProcessWatch, int stageId, int isSqlUdf) - { - int count = 0; - int nullMessageCount = 0; - logger.LogDebug("Beginning to execute func"); - var func = workerFunc.Func; - - var funcProcessWatch = Stopwatch.StartNew(); - DateTime initTime = DateTime.UtcNow; - foreach (var message in func(splitIndex, GetIterator(inputStream, deserializerMode, isSqlUdf))) - { - funcProcessWatch.Stop(); - - if (object.ReferenceEquals(null, message)) - { - nullMessageCount++; - continue; - } - - try - { - WriteOutput(outputStream, serializerMode, message, formatter); - } - catch (Exception) - { - logger.LogError("WriteOutput() failed at iteration {0}", count); - throw; - } - - count++; - funcProcessWatch.Start(); - } - - logger.LogInfo("Output entries count: " + count); - logger.LogDebug("Null messages count: " + nullMessageCount); - - //if profiler: - // profiler.profile(process) - //else: - // process() - - WriteDiagnosticsInfo(outputStream, bootTime, initTime); - - commandProcessWatch.Stop(); - - // log statistics - logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); - logger.LogInfo("stage {0}, command process time: {1}", stageId, commandProcessWatch.ElapsedMilliseconds); - } - - private static void WriteOutput(Stream networkStream, string serializerMode, dynamic message, IFormatter formatter) - { - var buffer = GetSerializedMessage(serializerMode, message, formatter); - if (buffer == null) - { - logger.LogError("Buffer is null"); - } - - if (buffer.Length <= 0) - { - logger.LogError("Buffer length {0} cannot be <= 0", buffer.Length); - } - - //Debug.Assert(buffer != null); - //Debug.Assert(buffer.Length > 0); - SerDe.Write(networkStream, buffer.Length); - SerDe.Write(networkStream, buffer); - } - - private static byte[] GetSerializedMessage(string serializerMode, dynamic message, IFormatter formatter) - { - byte[] buffer; - - switch ((SerializedMode)Enum.Parse(typeof(SerializedMode), serializerMode)) - { - case SerializedMode.None: - buffer = message as byte[]; - break; - - case SerializedMode.String: - buffer = SerDe.ToBytes(message as string); - break; - - case SerializedMode.Row: - 
var pickler = new Pickler(); - buffer = pickler.dumps(new ArrayList { message }); - break; - - default: - try - { - var ms = new MemoryStream(); - formatter.Serialize(ms, message); - buffer = ms.ToArray(); - } - catch (Exception) - { - logger.LogError("Exception serializing output"); - logger.LogError("{0} : {1}", message.GetType().Name, message.GetType().FullName); - throw; - } - break; - } - - return buffer; - } - + private static int ReadDiagnosticsInfo(Stream networkStream) { int rddId = SerDe.ReadInt(networkStream); @@ -505,22 +421,7 @@ private static int ReadDiagnosticsInfo(Stream networkStream) int partitionId = SerDe.ReadInt(networkStream); logger.LogInfo("rddInfo: rddId {0}, stageId {1}, partitionId {2}", rddId, stageId, partitionId); return stageId; - } - - private static void WriteDiagnosticsInfo(Stream networkStream, DateTime bootTime, DateTime initTime) - { - DateTime finishTime = DateTime.UtcNow; - const string format = "MM/dd/yyyy hh:mm:ss.fff tt"; - logger.LogDebug("bootTime: {0}, initTime: {1}, finish_time: {2}", - bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format)); - SerDe.Write(networkStream, (int)SpecialLengths.TIMING_DATA); - SerDe.Write(networkStream, ToUnixTime(bootTime)); - SerDe.Write(networkStream, ToUnixTime(initTime)); - SerDe.Write(networkStream, ToUnixTime(finishTime)); - - SerDe.Write(networkStream, 0L); //shuffle.MemoryBytesSpilled - SerDe.Write(networkStream, 0L); //shuffle.DiskBytesSpilled - } + } private static void WriteAccumulatorValues(Stream networkStream, IFormatter formatter) { @@ -564,121 +465,7 @@ public static void PrintFiles() logger.LogDebug("Files available in executor"); logger.LogDebug("Location: {0}{1}{2}", folder, Environment.NewLine, outfiles.ToString()); - } - - private static long ToUnixTime(DateTime dt) - { - return (long)(dt - UnixTimeEpoch).TotalMilliseconds; - } - - private static IEnumerable GetIterator(Stream inputStream, string serializedMode, int isFuncSqlUdf) - { - logger.LogInfo("Serialized mode in GetIterator: " + serializedMode); - IFormatter formatter = new BinaryFormatter(); - var mode = (SerializedMode)Enum.Parse(typeof(SerializedMode), serializedMode); - int messageLength; - Stopwatch watch = Stopwatch.StartNew(); - while ((messageLength = SerDe.ReadInt(inputStream)) != (int)SpecialLengths.END_OF_DATA_SECTION) - { - watch.Stop(); - if (messageLength > 0 || messageLength == (int)SpecialLengths.NULL) - { - watch.Start(); - byte[] buffer = messageLength > 0 ? SerDe.ReadBytes(inputStream, messageLength) : null; - watch.Stop(); - switch (mode) - { - case SerializedMode.String: - { - if (messageLength > 0) - { - if (buffer == null) - { - logger.LogDebug("Buffer is null. 
Message length is {0}", messageLength); - } - yield return SerDe.ToString(buffer); - } - else - { - yield return null; - } - break; - } - - case SerializedMode.Row: - { - Debug.Assert(messageLength > 0); - var unpickledObjects = PythonSerDe.GetUnpickledObjects(buffer); - - if (isFuncSqlUdf == 0) - { - foreach (var row in unpickledObjects.Select(item => (item as RowConstructor).GetRow())) - { - yield return row; - } - } - else - { - foreach (var row in unpickledObjects) - { - yield return row; - } - } - - break; - } - - case SerializedMode.Pair: - { - byte[] pairKey = buffer; - byte[] pairValue; - - watch.Start(); - int valueLength = SerDe.ReadInt(inputStream); - if (valueLength > 0) - { - pairValue = SerDe.ReadBytes(inputStream, valueLength); - } - else if (valueLength == (int)SpecialLengths.NULL) - { - pairValue = null; - } - else - { - throw new Exception(string.Format("unexpected valueLength: {0}", valueLength)); - } - watch.Stop(); - - yield return new Tuple(pairKey, pairValue); - break; - } - - case SerializedMode.None: //just return raw bytes - { - yield return buffer; - break; - } - - default: - { - if (buffer != null) - { - var ms = new MemoryStream(buffer); - yield return formatter.Deserialize(ms); - } - else - { - yield return null; - } - break; - } - } - } - watch.Start(); - } - - logger.LogInfo("total receive time: {0}", watch.ElapsedMilliseconds); - } + } internal class SparkCLRAssemblyHandler { diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj b/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj index 36c9c1f2..2ba45523 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj +++ b/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj @@ -46,6 +46,8 @@ + + diff --git a/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs b/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs new file mode 100644 index 00000000..0c6a6389 --- /dev/null +++ b/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+ +using System.Runtime.Serialization; +using Microsoft.Spark.CSharp.Core; +using System.Collections.Generic; + +namespace Microsoft.Spark.CSharp +{ + internal class WorkerFunc + { + internal CSharpWorkerFunc CharpWorkerFunc { get; } + + internal int ArgsCount { get; } + + internal List ArgOffsets { get; } + + internal WorkerFunc(CSharpWorkerFunc func, int argsCount, List argOffsets) + { + CharpWorkerFunc = func; + ArgsCount = argsCount; + ArgOffsets = argOffsets; + } + } +} diff --git a/csharp/WorkerTest/MultiThreadWorkerTest.cs b/csharp/WorkerTest/MultiThreadWorkerTest.cs index 0f0b307c..6488adeb 100644 --- a/csharp/WorkerTest/MultiThreadWorkerTest.cs +++ b/csharp/WorkerTest/MultiThreadWorkerTest.cs @@ -81,6 +81,7 @@ private int CreateServer(out Process worker, bool sparkReuseWorker) worker.Start(); int serverPort = 0; serverPort = SerDe.ReadInt(worker.StandardOutput.BaseStream); + Environment.SetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT", serverPort.ToString()); StreamReader stdoutReader = worker.StandardOutput; Task.Run(() => { @@ -119,7 +120,7 @@ private int CreateServer(out Process worker, bool sparkReuseWorker) private ISocketWrapper CreateSocket(int serverPort) { var socket =SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, serverPort); + socket.Connect(IPAddress.Loopback, serverPort, null); return socket; } @@ -131,6 +132,10 @@ private void WritePayloadHeaderToWorker(Stream s) { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); SerDe.Write(s, numBroadcastVariables); diff --git a/csharp/WorkerTest/WorkerTest.cs b/csharp/WorkerTest/WorkerTest.cs index 18264375..1c0f6ea8 100644 --- a/csharp/WorkerTest/WorkerTest.cs +++ b/csharp/WorkerTest/WorkerTest.cs @@ -93,6 +93,7 @@ private ISocketWrapper CreateServer(out Process worker) } }; + Environment.SetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT", port.ToString()); lock (syncLock) { output.Clear(); @@ -125,6 +126,10 @@ private void WritePayloadHeaderToWorker(Stream s, int isSqlUdf = 0) { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); SerDe.Write(s, numBroadcastVariables); @@ -631,6 +636,10 @@ public void TestBroadcastVariablesInWorker() { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); @@ -802,6 +811,10 @@ public void TestUdfSerialization() using (var inputStream = new MemoryStream(500)) { SerDe.Write(inputStream, "1.0"); //version + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0L); SerDe.Write(inputStream, ""); //includes directory SerDe.Write(inputStream, 0); //number of included items SerDe.Write(inputStream, 0); //number of broadcast variables diff --git a/csharp/WorkerTest/WorkerTest.csproj b/csharp/WorkerTest/WorkerTest.csproj index 76c9ba87..8fa76dee 100644 --- a/csharp/WorkerTest/WorkerTest.csproj +++ b/csharp/WorkerTest/WorkerTest.csproj @@ -35,9 +35,8 @@ 4 - - False - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll False diff --git a/examples/Batch/WordCount/WordCount.csproj 
b/examples/Batch/WordCount/WordCount.csproj index b655eb8f..1961a0bc 100644 --- a/examples/Batch/WordCount/WordCount.csproj +++ b/examples/Batch/WordCount/WordCount.csproj @@ -32,17 +32,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -84,4 +84,4 @@ --> - + \ No newline at end of file diff --git a/examples/Batch/pi/Pi.csproj b/examples/Batch/pi/Pi.csproj index df0916b5..464f4b5d 100644 --- a/examples/Batch/pi/Pi.csproj +++ b/examples/Batch/pi/Pi.csproj @@ -35,17 +35,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -84,4 +84,4 @@ --> - + \ No newline at end of file diff --git a/examples/Examples.sln b/examples/Examples.sln index 5ba0d238..3eaad7a5 100644 --- a/examples/Examples.sln +++ b/examples/Examples.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 -VisualStudioVersion = 14.0.25123.0 +VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HdfsWordCount", "Streaming\HdfsWordCount\HdfsWordCount.csproj", "{6A2C7CF9-D64E-490D-9841-269EE14F7932}" EndProject diff --git a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj index 30fd07f3..2f38f466 100644 --- a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj +++ b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj @@ -34,14 +34,17 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -80,4 +83,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj index 0040a3eb..c826a80f 100644 --- a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj +++ b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj @@ -33,17 +33,17 @@ 4 - + False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - True + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll - True + + False + 
..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -75,4 +75,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj index 24ecf84f..fb4fc633 100644 --- a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj +++ b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj @@ -34,17 +34,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -82,4 +82,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/SparkXml/SparkXml.csproj b/examples/Sql/SparkXml/SparkXml.csproj index d7701258..622b6a24 100644 --- a/examples/Sql/SparkXml/SparkXml.csproj +++ b/examples/Sql/SparkXml/SparkXml.csproj @@ -34,17 +34,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -82,4 +82,4 @@ --> - + \ No newline at end of file diff --git a/examples/Streaming/EventHub/EventHub.csproj b/examples/Streaming/EventHub/EventHub.csproj index cc6d4e27..934eae56 100644 --- a/examples/Streaming/EventHub/EventHub.csproj +++ b/examples/Streaming/EventHub/EventHub.csproj @@ -34,16 +34,18 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -85,4 +87,4 @@ --> - + \ No newline at end of file diff --git a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj index 34facbb8..c58ceaee 100644 --- a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj +++ b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj @@ -32,21 +32,21 @@ 4 - + False - ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll False @@ -80,4 +80,4 @@ --> - + \ No newline at end of file diff --git 
a/examples/Streaming/Kafka/Kafka.csproj b/examples/Streaming/Kafka/Kafka.csproj index 2bdaa816..68b15a7e 100644 --- a/examples/Streaming/Kafka/Kafka.csproj +++ b/examples/Streaming/Kafka/Kafka.csproj @@ -32,15 +32,17 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - + False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -79,4 +81,4 @@ --> - + \ No newline at end of file diff --git a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj index e91905a4..81f5a19d 100644 --- a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj +++ b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj @@ -66,13 +66,13 @@ - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -98,4 +98,4 @@ --> - + \ No newline at end of file diff --git a/examples/fsharp/WordCount/WordCountFSharp.fsproj b/examples/fsharp/WordCount/WordCountFSharp.fsproj index af96e494..86c3bdaa 100644 --- a/examples/fsharp/WordCount/WordCountFSharp.fsproj +++ b/examples/fsharp/WordCount/WordCountFSharp.fsproj @@ -71,20 +71,17 @@ - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe ..\..\packages\FSharp.Core.4.0.0.1\lib\net40\FSharp.Core.dll True - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -110,4 +107,4 @@ --> - + \ No newline at end of file diff --git a/notes/running-mobius-app.md b/notes/running-mobius-app.md index b430a0d7..ea776a39 100644 --- a/notes/running-mobius-app.md +++ b/notes/running-mobius-app.md @@ -145,7 +145,7 @@ The following sample commands show how to run Mobius examples in local mode. Usi Computes the _approximate_ value of Pi using two appropaches and displays the value. 
### WordCount Example (Batch) -* Run `sparkclr-submit.cmd --exe SparkClrWordCount.exe C:\Git\Mobius\examples\Batch\WordCount\bin\Debug ` +* Run `sparkclr-submit.cmd --exe SparkClrWordCount.exe C:\Git\Mobius\examples\Batch\WordCount\bin\Debug ` `InputFilePath` should be in one of the following formats: * `hdfs://path/to/inputfile` diff --git a/scala/pom.xml b/scala/pom.xml index cb9ce900..ec526cda 100644 --- a/scala/pom.xml +++ b/scala/pom.xml @@ -2,7 +2,7 @@ 4.0.0 com.microsoft.sparkclr spark-clr_2.11 - 2.0.200-SNAPSHOT + 2.3.1-SNAPSHOT Mobius Project C# language binding and extensions to Apache Spark https://github.com/Microsoft/Mobius @@ -35,7 +35,7 @@ 1.5 UTF-8 2.11.8 - 2.0.2 + 2.3.1 2.11 @@ -106,14 +106,19 @@ org.apache.spark spark-hive_2.11 - 2.0.0 + ${spark.version} com.databricks - spark-csv_2.10 - 1.4.0 + spark-csv_2.11 + 1.5.0 + + + com.databricks + spark-avro_2.11 + 4.0.0 diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala index d48e9f3b..57ca3616 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala @@ -12,6 +12,7 @@ import java.util.{List => JList, Map => JMap} import org.apache.hadoop.io.compress.CompressionCodec import org.apache.spark.api.python._ +import org.apache.spark.api.python.PythonAccumulatorV2 import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark._ @@ -34,7 +35,7 @@ class CSharpRDD( cSharpWorkerExecutable: String, unUsedVersionIdentifier: String, broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) + accumulator: PythonAccumulatorV2) extends PythonRDD ( parent, SQLUtils.createCSharpFunction(command, envVars, cSharpIncludes, cSharpWorkerExecutable, @@ -95,7 +96,7 @@ class CSharpRDD( logInfo("Env vars: " + envVars.asScala.mkString(", ")) val runner = new PythonRunner( - Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuse_worker, false, Array(Array(0))) + Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuseWorker) runner.compute(firstParent.iterator(split, context), split.index, context) } diff --git a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala index c01d76a7..79af72c3 100644 --- a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala +++ b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala @@ -8,6 +8,7 @@ package org.apache.spark.sql.api.csharp import java.io.{ByteArrayOutputStream, DataOutputStream} import org.apache.spark.{Accumulator, SparkContext} +import org.apache.spark.api.python.PythonAccumulatorV2 import org.apache.spark.api.csharp.SerDe import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.python.{PythonBroadcast, PythonFunction, SerDeUtil} @@ -51,7 +52,7 @@ object SQLUtils { cSharpWorkerExecutable: String, unUsedVersionIdentifier: String, broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) : PythonFunction = { + accumulator: PythonAccumulatorV2) : PythonFunction = { PythonFunction(command, envVars, cSharpIncludes, cSharpWorkerExecutable, unUsedVersionIdentifier, broadcastVars, accumulator) } diff --git a/scala/src/main/org/apache/spark/util/csharp/Utils.scala b/scala/src/main/org/apache/spark/util/csharp/Utils.scala index 7bb74190..7294cae6 100644 --- a/scala/src/main/org/apache/spark/util/csharp/Utils.scala +++ 
b/scala/src/main/org/apache/spark/util/csharp/Utils.scala @@ -127,17 +127,17 @@ object Utils extends Logging { timer.schedule(new TimerTask() { @Override def run() { - Runtime.getRuntime.halt(status) + if (status!=0) { Runtime.getRuntime.halt(status); } } }, maxDelayMillis) // try to exit nicely - System.exit(status); + if (status!=0) { System.exit(status); } } catch { // exit nastily if we have a problem case ex: Throwable => Runtime.getRuntime.halt(status) } finally { // should never get here - Runtime.getRuntime.halt(status) + if (status!=0) { Runtime.getRuntime.halt(status); } } } @@ -147,7 +147,7 @@ object Utils extends Logging { * @param status the exit status, zero for OK, non-zero for error */ def exit(status: Int): Unit = { - exit(status, 1000) + exit(status, 1000); } private[spark] def listZipFileEntries(file: File): Array[String] = { diff --git a/scripts/sparkclr-submit.cmd b/scripts/sparkclr-submit.cmd index c6e1d501..5f119c87 100644 --- a/scripts/sparkclr-submit.cmd +++ b/scripts/sparkclr-submit.cmd @@ -42,7 +42,7 @@ if not exist "%SPARK_JARS_DIR%" ( set SPARK_JARS_CLASSPATH=%SPARK_JARS_DIR%\* -if not defined SPARKCLR_JAR (set SPARKCLR_JAR=spark-clr_2.11-2.0.200-SNAPSHOT.jar) +if not defined SPARKCLR_JAR (set SPARKCLR_JAR=spark-clr_2.11-2.3.1-SNAPSHOT.jar) echo [sparkclr-submit.cmd] SPARKCLR_JAR=%SPARKCLR_JAR% set SPARKCLR_CLASSPATH=%SPARKCLR_HOME%\lib\%SPARKCLR_JAR% REM SPARKCLR_DEBUGMODE_EXT_JARS environment variable is used to specify external dependencies to use in debug mode @@ -105,4 +105,4 @@ goto :eof @echo Example 2: @echo sparkclr-submit.cmd [--verbose] [--master local] [--deploy-mode client] [--name testapp] --exe csdriver.exe c:\sparkclrapp\driver.zip arg1 arg2 arg3 @echo Example 3: - @echo sparkclr-submit.cmd [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar hdfs://path/to/spark-clr-1.6.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3 + @echo sparkclr-submit.cmd [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar hdfs://path/to/spark-clr_2.11-2.3.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3 diff --git a/scripts/sparkclr-submit.sh b/scripts/sparkclr-submit.sh index 5d94efa3..e4ca34f4 100755 --- a/scripts/sparkclr-submit.sh +++ b/scripts/sparkclr-submit.sh @@ -32,7 +32,7 @@ function usage() { echo "Example 2:" echo "sparkclr-submit.sh [--verbose] [--master local] [--deploy-mode client] [--name testapp] --exe csdriver.exe sparkclrapp/driver.zip arg1 arg2 arg3" echo "Example 3:" - echo "sparkclr-submit.sh [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar --remote-sparkclr-jar hdfs://path/to/spark-clr_2.10-1.6.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3" + echo "sparkclr-submit.sh [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar --remote-sparkclr-jar hdfs://path/to/spark-clr_2.11-2.3.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3" } [ "$SPARK_HOME" = "" ] && spark_home_error @@ -57,7 +57,7 @@ fi export SPARK_JARS_CLASSPATH="$SPARK_JARS_DIR/*" -export SPARKCLR_JAR=spark-clr_2.11-2.0.200-SNAPSHOT.jar +export SPARKCLR_JAR=spark-clr_2.11-2.3.1-SNAPSHOT.jar export SPARKCLR_CLASSPATH="$SPARKCLR_HOME/lib/$SPARKCLR_JAR" # SPARKCLR_DEBUGMODE_EXT_JARS environment variable is used to specify external dependencies to use in debug mode [ ! 
"$SPARKCLR_DEBUGMODE_EXT_JARS" = "" ] && export SPARKCLR_CLASSPATH="$SPARKCLR_CLASSPATH:$SPARKCLR_DEBUGMODE_EXT_JARS"