In [1]:
# findspark is initialised here, before the first pyspark import in the notebook.
# NOTE(review): presumably this is what makes the local Spark installation
# importable as `pyspark` — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build a SparkSession (or reuse one that already exists, via getOrCreate)
# with master 'local[1]' and application name 'pyspark_fresh'.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('pyspark_fresh')
    .getOrCreate()
)

In [3]:
# Sample users: one plain dict per user, each with 'user_id' listed before
# 'first_name'. This list is the raw input for the DataFrame examples.
user_list = [
    {'user_id': 1, 'first_name': 'dewil'},
    {'user_id': 2, 'first_name': 'williom'},
    {'user_id': 3, 'first_name': 'smith'},
    {'user_id': 4, 'first_name': 'maithew'},
]

In [4]:
# Without .show(), the cell displays only the DataFrame's inferred schema
# (column names and types), not its rows.
spark.createDataFrame(user_list)

DataFrame[first_name: string, user_id: bigint]

In [5]:
# .show() prints the rows as a text table. Note the columns are listed as
# first_name, user_id — not in the order the keys appear in each dict.
spark.createDataFrame(user_list).show()

+----------+-------+
|first_name|user_id|
+----------+-------+
|     dewil|      1|
|   williom|      2|
|     smith|      3|
|   maithew|      4|
+----------+-------+



## Creating a DataFrame directly from dicts is deprecated — convert each dict to a `Row` first, then create the DataFrame

In [6]:
from pyspark.sql import Row

In [7]:
# Unpack only the dict *values* positionally, so the resulting Rows carry
# the data but no field names.
user_rows = [Row(*record.values()) for record in user_list]

In [8]:
# Inspect the Row objects: each one holds values only, with no field names.
user_rows

[<Row(1, 'dewil')>,
 <Row(2, 'williom')>,
 <Row(3, 'smith')>,
 <Row(4, 'maithew')>]

In [9]:
# With nameless Rows and no schema, the columns are given the placeholder
# names _1 and _2.
spark.createDataFrame(user_rows)

DataFrame[_1: bigint, _2: string]

In [10]:
# Show the rows; the column headers are the placeholder names _1 and _2.
spark.createDataFrame(user_rows).show()

+---+-------+
| _1|     _2|
+---+-------+
|  1|  dewil|
|  2|williom|
|  3|  smith|
|  4|maithew|
+---+-------+



In [11]:
# A schema string of comma-separated "name type" pairs supplies the column
# names (and declared types) that the positional Rows lack.
spark.createDataFrame(user_rows,schema='user_id int,first_name string').show()

+-------+----------+
|user_id|first_name|
+-------+----------+
|      1|     dewil|
|      2|   williom|
|      3|     smith|
|      4|   maithew|
+-------+----------+



In [12]:
# Unpack each dict as keyword arguments (field=value) when building the Row.
# This rebinds `user_rows`, replacing the positional Rows built earlier.
user_rows = [Row(**record) for record in user_list]

In [13]:
# Same call as for the positional Rows; the printed table is identical.
spark.createDataFrame(user_rows,schema='user_id int,first_name string').show()

+-------+----------+
|user_id|first_name|
+-------+----------+
|      1|     dewil|
|      2|   williom|
|      3|     smith|
|      4|   maithew|
+-------+----------+



In [14]:
def demo(*arg):
    """Show how ``*arg`` collects positional arguments.

    Every value passed positionally arrives packed into a single tuple,
    which is printed unchanged. Returns None.
    """
    positional_args = arg  # always a tuple, even for zero or one argument
    print(positional_args)

In [15]:
# A single positional argument still arrives as a tuple: (5,)
demo(5)

(5,)


In [16]:
# Several positional arguments are packed into one tuple: (5, 6, 7)
demo(5,6,7)

(5, 6, 7)


In [17]:
# A list passed positionally counts as ONE argument, so the tuple has a
# single element (the list itself).
demo([2,4,6,8])

([2, 4, 6, 8],)


In [18]:
def demo(**arg):
    """Show how ``**arg`` collects keyword arguments.

    Every ``name=value`` pair passed by the caller arrives in a single dict,
    which is printed unchanged. Positional arguments are not accepted.
    Returns None.

    Note: this rebinds the name ``demo``; the ``*arg`` version defined in an
    earlier cell is no longer reachable after this cell runs.
    """
    keyword_args = arg  # always a dict, empty when nothing is passed
    print(keyword_args)

In [19]:
# demo() now accepts keyword arguments only (**arg), so a positional call is
# expected to fail. The failure is the point of this cell, so catch it and
# print the message instead of letting the exception abort a
# "Restart Kernel -> Run All".
try:
    demo(5)
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: demo() takes 0 positional arguments but 1 was given

In [None]:
# A keyword argument is what the **arg version of demo() accepts.
# NOTE(review): this cell has no execution count and no output — it was not
# run in the saved notebook.
demo(id=1)

In [20]:
# The keyword name becomes the dict key and the list becomes its value.
demo(id=[1,2,3,4,5,6,7,9])

{'id': [1, 2, 3, 4, 5, 6, 7, 9]}


In [21]:
import datetime

# Four sample metadata records. They differ only in "name", "fieldName" and
# "format"; every other field is the same in all of them. The key order
# (id, name, dataTypeName, ..., date, timestamp) is kept identical in every
# record, and each record gets its own date/datetime objects.
data = [
    {
        "id": -1,
        "name": name,
        "dataTypeName": "meta_data",
        "fieldName": field_name,
        "position": 0,
        "renderTypeName": "meta_data",
        "format": fmt,
        "flags": "hidden",
        "date": datetime.date(2023, 10, 20),
        "timestamp": datetime.datetime(2023, 10, 20, 10, 36, 45),
    }
    for name, field_name, fmt in (
        ("sid", ":sid", ''),
        ("id", ":id", 'json'),
        ("position", ":position", 'json'),
        ("created_at", ":created_at", 'json'),
    )
]

In [22]:
# Turn every metadata dict into a Row by unpacking it as keyword arguments,
# so each Row keeps the dict's field names.
rows = [Row(**record) for record in data]

In [23]:
# No schema is passed: the column names come from the Row fields and appear
# in the same order as the dict keys.
spark.createDataFrame(rows).show()

+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| id|      name|dataTypeName|  fieldName|position|renderTypeName|format| flags|      date|          timestamp|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| -1|       sid|   meta_data|       :sid|       0|     meta_data|      |hidden|2023-10-20|2023-10-20 10:36:45|
| -1|        id|   meta_data|        :id|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|  position|   meta_data|  :position|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|created_at|   meta_data|:created_at|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+



In [24]:
# Keep the schema-less DataFrame in a variable so its column types can be
# inspected.
df = spark.createDataFrame(rows)

In [25]:
# List (column, type) pairs. Without an explicit schema the integer columns
# (id, position) come out as bigint.
df.dtypes

[('id', 'bigint'),
 ('name', 'string'),
 ('dataTypeName', 'string'),
 ('fieldName', 'string'),
 ('position', 'bigint'),
 ('renderTypeName', 'string'),
 ('format', 'string'),
 ('flags', 'string'),
 ('date', 'date'),
 ('timestamp', 'timestamp')]

In [26]:
# Schema written as a string of "name TYPE" pairs, one per column.
# id and position are declared INT here, whereas the schema-less DataFrame
# reported them as bigint.
schema = """
id INT,
name STRING,
dataTypeName STRING,
fieldName STRING,
position INT,
renderTypeName STRING,
format STRING,
flags STRING,
date DATE,
timestamp TIMESTAMP
"""

In [27]:
# Build the DataFrame with the explicit schema string instead of letting
# the column types be inferred.
dff = spark.createDataFrame(rows,schema=schema)

In [29]:
from pyspark.sql.types import StructType

In [30]:
# Print the built-in documentation for StructType (exploration aid; the
# captured output is long).
help(StructType)

Help on class StructType in module pyspark.sql.types:

class StructType(DataType)
 |  StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |  
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql.types import *
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField('f1', StringType(), True)
 |  >>> struct1[0]
 |  StructField('f1', StringType(), True)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct2 = StructType([S

In [31]:
from pyspark.sql.types import *

In [36]:
# The same ten columns as the string schema, expressed as a StructType:
# one StructField per (column name, type class) pair, in column order.
# Each type class is instantiated with no arguments.
s_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ('id', IntegerType),
        ('name', StringType),
        ('dataTypeName', StringType),
        ('fieldName', StringType),
        ('position', IntegerType),
        ('renderTypeName', StringType),
        ('format', StringType),
        ('flags', StringType),
        ('date', DateType),
        ('timestamp', TimestampType),
    )
])

In [32]:
# Show the DataFrame that was built with the string schema.
# NOTE(review): this cell's execution count (32) is lower than that of the
# cell placed before it (36) — the cells were run out of order.
dff.show()

+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| id|      name|dataTypeName|  fieldName|position|renderTypeName|format| flags|      date|          timestamp|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| -1|       sid|   meta_data|       :sid|       0|     meta_data|      |hidden|2023-10-20|2023-10-20 10:36:45|
| -1|        id|   meta_data|        :id|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|  position|   meta_data|  :position|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|created_at|   meta_data|:created_at|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+



In [37]:
# Same rows, this time with the StructType schema object; the printed table
# matches the one produced with the string schema.
spark.createDataFrame(rows,schema=s_schema).show()

+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| id|      name|dataTypeName|  fieldName|position|renderTypeName|format| flags|      date|          timestamp|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| -1|       sid|   meta_data|       :sid|       0|     meta_data|      |hidden|2023-10-20|2023-10-20 10:36:45|
| -1|        id|   meta_data|        :id|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|  position|   meta_data|  :position|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|created_at|   meta_data|:created_at|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+



## Create a Spark DataFrame from a pandas DataFrame

In [38]:
# Re-display the raw list of dicts before it is handed to pandas.
data

[{'id': -1,
  'name': 'sid',
  'dataTypeName': 'meta_data',
  'fieldName': ':sid',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': '',
  'flags': 'hidden',
  'date': datetime.date(2023, 10, 20),
  'timestamp': datetime.datetime(2023, 10, 20, 10, 36, 45)},
 {'id': -1,
  'name': 'id',
  'dataTypeName': 'meta_data',
  'fieldName': ':id',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': 'json',
  'flags': 'hidden',
  'date': datetime.date(2023, 10, 20),
  'timestamp': datetime.datetime(2023, 10, 20, 10, 36, 45)},
 {'id': -1,
  'name': 'position',
  'dataTypeName': 'meta_data',
  'fieldName': ':position',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': 'json',
  'flags': 'hidden',
  'date': datetime.date(2023, 10, 20),
  'timestamp': datetime.datetime(2023, 10, 20, 10, 36, 45)},
 {'id': -1,
  'name': 'created_at',
  'dataTypeName': 'meta_data',
  'fieldName': ':created_at',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': 'json',
  'flag

In [39]:
import pandas as pd
# Wrap the list of dicts in a pandas DataFrame and pass that to Spark; the
# printed table matches the one built from Row objects.
spark.createDataFrame(pd.DataFrame(data=data)).show()

+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| id|      name|dataTypeName|  fieldName|position|renderTypeName|format| flags|      date|          timestamp|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+
| -1|       sid|   meta_data|       :sid|       0|     meta_data|      |hidden|2023-10-20|2023-10-20 10:36:45|
| -1|        id|   meta_data|        :id|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|  position|   meta_data|  :position|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
| -1|created_at|   meta_data|:created_at|       0|     meta_data|  json|hidden|2023-10-20|2023-10-20 10:36:45|
+---+----------+------------+-----------+--------+--------------+------+------+----------+-------------------+

