In [2]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import SparkSession, Row

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Each Row names its own fields, so no separate schema argument is passed;
# the column types are whatever Spark infers from the Python values.
rows = [
    Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0)),
    Row(a=2, b=8., c='GFG2', d=date(2000, 6, 2), e=datetime(2000, 6, 2, 12, 0)),
    Row(a=4, b=5., c='GFG3', d=date(2000, 5, 3), e=datetime(2000, 5, 3, 12, 0)),
]
df = spark.createDataFrame(rows)

# Preview the rows, then print the column names and inferred types.
df.show()
df.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

+---+---+----+----------+-------------------+
|  a|  b|   c|         d|                  e|
+---+---+----+----------+-------------------+
|  1|4.0|GFG1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|GFG2|2000-06-02|2000-06-02 12:00:00|
|  4|5.0|GFG3|2000-05-03|2000-05-03 12:00:00|
+---+---+----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



                                                                                

In [3]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Plain tuples carry no field names, so both the column names and their
# types are supplied through a DDL-formatted schema string.
ddl_schema = 'a long, b double, c string, d date, e timestamp'
records = [
    (1, 4., 'GFG1', date(2000, 8, 1), datetime(2000, 8, 1, 12, 0)),
    (2, 8., 'GFG2', date(2000, 6, 2), datetime(2000, 6, 2, 12, 0)),
    (3, 5., 'GFG3', date(2000, 5, 3), datetime(2000, 5, 3, 12, 0)),
]
df = spark.createDataFrame(records, schema=ddl_schema)

# Preview the rows, then print the declared schema.
df.show()
df.printSchema()

+---+---+----+----------+-------------------+
|  a|  b|   c|         d|                  e|
+---+---+----+----------+-------------------+
|  1|4.0|GFG1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|GFG2|2000-06-02|2000-06-02 12:00:00|
|  3|5.0|GFG3|2000-05-03|2000-05-03 12:00:00|
+---+---+----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [4]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Build the sample data column-by-column as a pandas DataFrame first.
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4., 8., 5.],
    'c': ['GFG1', 'GFG2', 'GFG3'],
    'd': [date(2000, 8, 1), date(2000, 6, 2), date(2000, 5, 3)],
    'e': [
        datetime(2000, 8, 1, 12, 0),
        datetime(2000, 6, 2, 12, 0),
        datetime(2000, 5, 3, 12, 0),
    ],
})

# Convert the pandas frame to a Spark DataFrame; column names are taken
# from the pandas columns.
df = spark.createDataFrame(pandas_df)

# The stray bare `df` expression that used to sit here was removed: only a
# cell's final expression is displayed, so it had no effect.
df.show()
df.printSchema()

+---+---+----+----------+-------------------+
|  a|  b|   c|         d|                  e|
+---+---+----+----------+-------------------+
|  1|4.0|GFG1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|GFG2|2000-06-02|2000-06-02 12:00:00|
|  3|5.0|GFG3|2000-05-03|2000-05-03 12:00:00|
+---+---+----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [5]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Distribute the sample tuples as an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize([
    (1, 4., 'GFG1', date(2000, 8, 1), datetime(2000, 8, 1, 12, 0)),
    (2, 8., 'GFG2', date(2000, 6, 2), datetime(2000, 6, 2, 12, 0)),
    (3, 5., 'GFG3', date(2000, 5, 3), datetime(2000, 5, 3, 12, 0))
])

# Only column names are given here; the column types are inferred from the
# tuple values.
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])

# The stray bare `df` expression that used to sit here was removed: only a
# cell's final expression is displayed, so it had no effect.
df.show()
df.printSchema()

+---+---+----+----------+-------------------+
|  a|  b|   c|         d|                  e|
+---+---+----+----------+-------------------+
|  1|4.0|GFG1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|GFG2|2000-06-02|2000-06-02 12:00:00|
|  3|5.0|GFG3|2000-05-03|2000-05-03 12:00:00|
+---+---+----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [6]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Parse the CSV with pandas, then hand the resulting frame to Spark.
# The path is relative to the notebook's working directory.
df = spark.createDataFrame(pd.read_csv('content/train_dataset-1.csv'))

# The stray bare `df` expression that used to sit here was removed: only a
# cell's final expression is displayed, so it had no effect.
df.show()
df.printSchema()

+------+---+--------+-----------+-----------------+-------------+------------+-------------------------+
|Gender|Age|openness|neuroticism|conscientiousness|agreeableness|extraversion|Personality (Class label)|
+------+---+--------+-----------+-----------------+-------------+------------+-------------------------+
|  Male| 17|       7|          4|                7|            3|           2|              extraverted|
|  Male| 19|       4|          5|                4|            6|           6|                  serious|
|Female| 18|       7|          6|                4|            5|           5|               dependable|
|Female| 22|       5|          6|                7|            4|           3|              extraverted|
|Female| 19|       7|          4|                6|            5|           4|                   lively|
|  Male| 18|       5|          7|                7|            6|           4|                   lively|
|Female| 17|       5|          6|                5|    

In [7]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Parse the tab-delimited text file with pandas, then hand the frame to Spark.
# NOTE(review): the recorded schema for this cell lists `d` and `e` as string,
# i.e. the date/time columns are not parsed on load. If date/timestamp types
# are wanted, parse them in pandas before converting -- confirm intent.
df = spark.createDataFrame(pd.read_csv('content/text_file.txt', delimiter="\t"))

# The stray bare `df` expression that used to sit here was removed: only a
# cell's final expression is displayed, so it had no effect.
df.show()
df.printSchema()

+---+---+----+----------+-------------------+
|  a|  b|   c|         d|                  e|
+---+---+----+----------+-------------------+
|  1|4.0|GFG1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|GFG2|2000-06-02|2000-06-02 12:00:00|
|  3|5.0|GFG3|2000-05-03|2000-05-03 12:00:00|
+---+---+----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: string (nullable = true)
 |-- e: string (nullable = true)



In [8]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Parse the JSON file with pandas, then hand the resulting frame to Spark.
# NOTE(review): the recorded schema for this cell lists `b`, `d` and `e` as
# long; `d`/`e` look like epoch values (presumably milliseconds -- confirm)
# rather than date/timestamp columns.
df = spark.createDataFrame(pd.read_json('content/json_data.json'))

# The stray bare `df` expression that used to sit here was removed: only a
# cell's final expression is displayed, so it had no effect.
df.show()
df.printSchema()

+---+---+----+------------+------------+
|  a|  b|   c|           d|           e|
+---+---+----+------------+------------+
|  1|  4|GFG1|965088000000|965131200000|
|  2|  8|GFG2|959904000000|959947200000|
|  3|  5|GFG3|957312000000|957355200000|
+---+---+----+------------+------------+

root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)
 |-- c: string (nullable = true)
 |-- d: long (nullable = true)
 |-- e: long (nullable = true)



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# NOTE(review): if an earlier cell already started a session, getOrCreate()
# hands that one back, so this app name may not take effect -- confirm.
spark = SparkSession.builder.appName("ReadBigJSON").getOrCreate()

# Explicit schema: (column name, Spark type) pairs, all nullable by default.
# Passing it to the reader means the column types are declared, not inferred.
field_types = [
    ("a", StringType),
    ("b", DoubleType),
    ("c", StringType),
    ("d", DoubleType),
    ("e", DoubleType),
]
schema = StructType([StructField(name, dtype()) for name, dtype in field_types])

# Read the JSON file directly with Spark's reader using the schema above.
json_reader = spark.read.schema(schema)
df = json_reader.json('content/json_data_2.json')

df.printSchema()
df.show(10)  # at most the first 10 rows

root
 |-- a: string (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: double (nullable = true)
 |-- e: double (nullable = true)

+---+---+----+----------+-----------+
|  a|  b|   c|         d|          e|
+---+---+----+----------+-----------+
|  1|4.0|GFG1|9.65088E11|9.651312E11|
|  2|8.0|GFG2|9.59904E11|9.599472E11|
|  3|5.0|GFG3|9.57312E11|9.573552E11|
+---+---+----+----------+-----------+



In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# NOTE(review): if an earlier cell already started a session, getOrCreate()
# hands that one back, so this app name may not take effect -- confirm.
spark = SparkSession.builder.appName("ReadBigJSON").getOrCreate()

# Parser settings for the JSON reader, gathered in one place. No schema is
# supplied, so the column types are inferred from the file.
json_options = {
    "multiLine": "false",               # one JSON record per line
    "allowUnquotedFieldNames": "true",  # accept {a: 1} as well as {"a": 1}
    "allowComments": "true",            # tolerate comments inside the JSON
    "allowSingleQuotes": "true",        # accept 'single-quoted' strings
    "mode": "PERMISSIVE",               # keep imperfect records instead of failing
}
df = spark.read.options(**json_options).json("content/json_data_3.json")

df.printSchema()
df.show(10)  # at most the first 10 rows

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: long (nullable = true)
 |-- e: long (nullable = true)
 |-- info: string (nullable = true)

+---+----+---------+------------+------------+--------+
|  a|   b|        c|           d|           e|    info|
+---+----+---------+------------+------------+--------+
|  1|NULL|     NULL|        NULL|        NULL|{"x":10}|
|  2|NULL|     NULL|        NULL|        NULL|       5|
|  3|NULL|     NULL|        NULL|        NULL|    NULL|
|  1| 4.0|     NULL|        NULL|        NULL|    NULL|
|  2|NULL|     NULL|        NULL|        NULL|    NULL|
|  3| 5.0|extra_col|        NULL|        NULL|    NULL|
|  1| 4.0|     GFG1|965088000000|965131200000|    NULL|
|  2| 8.0|     GFG2|959904000000|959947200000|    NULL|
|  3| 5.0|     GFG3|957312000000|957355200000|    NULL|
+---+----+---------+------------+------------+--------+

