In [0]:
spark

In [0]:
type(spark)

Out[3]: pyspark.sql.session.SparkSession

In [0]:
dir(spark)

Out[4]: ['Builder',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activeSession',
 '_conf',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromLocalTrusted',
 '_createFromRDD',
 '_create_dataframe',
 '_create_from_pandas_with_arrow',
 '_create_rdd_from_local_trusted',
 '_create_shell_session',
 '_getActiveSessionOrCreate',
 '_get_numpy_record_dtype',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jconf',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_repr_html_',
 '_sc',
 '_wrap_data_schema',
 '_write_to_trusted_path',
 'builder',
 'catalog',
 'conf',
 'createDataFrame',
 'getActiveSession',
 'n

In [0]:
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, opt

In [0]:
data = [(1,'Maheer'),(2, 'wafa')]

df = spark.createDataFrame(data = data, schema = ['Id','Name'])
df.show()
df.printSchema()

+---+------+
| Id|  Name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)



In [0]:
from pyspark.sql.types import *

data = [(1,'Maheer'),(2, 'wafa')]

schema = StructType([StructField(name = 'Id', dataType = IntegerType()),
            StructField(name = 'Name', dataType = StringType())])

type(schema)

df = spark.createDataFrame(data = data, schema = schema)

df.show()
df.printSchema()

+---+------+
| Id|  Name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)



In [0]:
data = [{'id':1,'name':'Maheer'},
        {'id':2,'name':'wafa'}]

df = spark.createDataFrame(data=data)

df.show()
df.printSchema()

+---+------+
| id|  name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [0]:
## Reading csv file

df = spark.read.csv(path = 'dbfs:/FileStore/tables/simplelinearregression-2.csv', header=True,)

df.show()

df.printSchema()

+---+-------+
|Age|Premium|
+---+-------+
| 18|  10000|
| 22|  15000|
| 23|  18000|
| 26|  21000|
| 28|  24000|
| 31|  26500|
| 33|  27000|
+---+-------+

root
 |-- Age: string (nullable = true)
 |-- Premium: string (nullable = true)



In [0]:
df = spark.read.format('csv').option(key="header", value=True).load(path = 'dbfs:/FileStore/tables/simplelinearregression-2.csv')

df.show()

+---+-------+
|Age|Premium|
+---+-------+
| 18|  10000|
| 22|  15000|
| 23|  18000|
| 26|  21000|
| 28|  24000|
| 31|  26500|
| 33|  27000|
+---+-------+



In [0]:
#read multiple csv files from different paths/folders

df1 = spark.read.csv(path = ["dbfs:/FileStore/tables/simplelinearregression-1.csv", "dbfs:/FileStore/tables/simplelinearregression-2.csv"], header=True)

df1.show()

+---+-------+
|Age|Premium|
+---+-------+
| 18|  10000|
| 22|  15000|
| 23|  18000|
| 26|  21000|
| 28|  24000|
| 31|  26500|
| 33|  27000|
| 18|  10000|
| 22|  15000|
| 23|  18000|
| 26|  21000|
| 28|  24000|
| 31|  26500|
| 33|  27000|
+---+-------+



In [0]:
#read all csv files in a folder

df1 = spark.read.csv(path = "dbfs:/FileStore/tables/", header = True)

In [0]:
df1.show()

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|    10,000+|Free|    0|      Everyone|        Art & Design|    7-Jan-18|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|   500,000+|Free|    0|      Everyone|Art & Design;Pret...|   15-Jan-18|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5,000,000+|Free|    0|      Everyone|        Art & Design|    1-Aug-18|             1.2.4|4.0.3 and up|
|Sketch - Draw & P...|ART_AN

In [0]:

from pyspark.sql.types import *

schema = StructType().add("Age", IntegerType(), True).add("Premium", IntegerType(), True, None)

df = spark.read.csv(path = 'dbfs:/FileStore/tables/simplelinearregression-2.csv', schema = schema)

In [0]:

df.show()

df.printSchema()

+----+-------+
| Age|Premium|
+----+-------+
|null|   null|
|  18|  10000|
|  22|  15000|
|  23|  18000|
|  26|  21000|
|  28|  24000|
|  31|  26500|
|  33|  27000|
+----+-------+

root
 |-- Age: integer (nullable = true)
 |-- Premium: integer (nullable = true)



In [0]:
#write dataframe to csv

from pyspark.sql import *

In [0]:
help(DataFrameWriter)

Help on class DataFrameWriter in module pyspark.sql.readwriter:

class DataFrameWriter(OptionUtils)
 |  DataFrameWriter(df: 'DataFrame')
 |  
 |  Interface used to write a :class:`DataFrame` to external storage systems
 |  (e.g. file systems, key-value stores, etc). Use :attr:`DataFrame.write`
 |  to access this.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Support Spark Connect.
 |  
 |  Method resolution order:
 |      DataFrameWriter
 |      OptionUtils
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, df: 'DataFrame')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  bucketBy(self, numBuckets: int, col: Union[str, List[str], Tuple[str, ...]], *cols: Optional[str]) -> 'DataFrameWriter'
 |      Buckets the output by the given columns. If specified,
 |      the output is laid out on the file system similar to Hive's bucketing scheme,
 |      but with a different bucket hash function and is not co

In [0]:
help(DataFrame)

Help on class DataFrame in module pyspark.sql.dataframe:

class DataFrame(pyspark.sql.pandas.map_ops.PandasMapOpsMixin, pyspark.sql.pandas.conversion.PandasConversionMixin)
 |  DataFrame(jdf: py4j.java_gateway.JavaObject, sql_ctx: Union[ForwardRef('SQLContext'), ForwardRef('SparkSession')])
 |  
 |  A distributed collection of data grouped into named columns.
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Support Spark Connect.
 |  
 |  Examples
 |  --------
 |  A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
 |  and can be created using various functions in :class:`SparkSession`:
 |  
 |  >>> people = spark.createDataFrame([
 |  ...     {"deptId": 1, "age": 40, "name": "Hyukjin Kwon", "gender": "M", "salary": 50},
 |  ...     {"deptId": 1, "age": 50, "name": "Takuya Ueshin", "gender": "M", "salary": 100},
 |  ...     {"deptId": 2, "age": 60, "name": "Xinrong Meng", "gender": "F", "salary": 150},
 |  ...     {"deptId": 3, "age": 20, "

In [0]:
from pyspark.sql.types import *

data = [(1,'Maheer'),(2, 'wafa')]

schema = StructType([StructField(name = 'Id', dataType = IntegerType()),
            StructField(name = 'Name', dataType = StringType())])

type(schema)

df = spark.createDataFrame(data = data, schema = schema)

df.show()
df.printSchema()

df.write.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True)

+---+------+
| Id|  Name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)



In [0]:
df.write.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True, mode = 'overwrite')

df.show()

display(spark.read.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True))

+---+------+
| Id|  Name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+



Id,Name
1,Maheer
2,wafa


In [0]:
df.write.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True, mode = 'append')

df.show()

+---+------+
| Id|  Name|
+---+------+
|  1|Maheer|
|  2|  wafa|
+---+------+



In [0]:
display(spark.read.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True))

Id,Name
1,Maheer
1,Maheer
1,Maheer
2,wafa
2,wafa
2,wafa


In [0]:
df.write.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True, mode = 'ignore')

display(spark.read.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True))

Id,Name
1,Maheer
2,wafa


In [0]:
df.write.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True, mode = 'error')

display(spark.read.csv(path = 'dbfs:/FileStore/tables/dataset2', header = True))

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3409072812134660>:1[0m
[0;32m----> 1[0m df[38;5;241m.[39mwrite[38;5;241m.[39mcsv(path [38;5;241m=[39m [38;5;124m'[39m[38;5;124mdbfs:/FileStore/tables/dataset2[39m[38;5;124m'[39m, header [38;5;241m=[39m [38;5;28;01mTrue[39;00m, mode [38;5;241m=[39m [38;5;124m'[39m[38;5;124merror[39m[38;5;124m'[39m)
[1;32m      3[0m display(spark[38;5;241m.[39mread[38;5;241m.[39mcsv(path [38;5;241m=[39m [38;5;124m'[39m[38;5;124mdbfs:/FileStore/tables/dataset2[39m[38;5;124m'[39m, header [38;5;241m=[39m [38;5;28;01mTrue[39;00m))

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m

In [0]:
## error above is expected 