In [1]:
# Initialise findspark first; pyspark itself is only imported in the next cell.
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Session handle used by every read below; getOrCreate() hands back an
# existing session when there is one instead of building a second.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Plain read with no options: the header line is loaded as an ordinary row and
# the columns get the generated names _c0.._c2, all typed as string.
df = spark.read.csv(
    "D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv"
)

In [26]:
# Print the schema: column names, types and nullability.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [27]:
# Display only the first two rows.
df.show(2)

+-----------------+-------------------+-----+
|              _c0|                _c1|  _c2|
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
|    United States|            Romania|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [28]:
# Same file, generic reader API: name the format, then pass the path to load().
df2 = (
    spark.read
    .format("csv")
    .load("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)

In [30]:
# Same file again, this time supplying the location through the "path" option
# and calling load() without arguments.
df3 = (
    spark.read
    .format("csv")
    .option("path", "D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
    .load()
)

In [32]:
# header=true: the first line supplies the column names instead of being data.
df = (
    spark.read
    .option("header", "true")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [35]:
# Point the reader at the csv directory rather than a single file, then count
# the rows that were loaded.
df = (
    spark.read
    .option("header", "true")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv")
)
df.count()

1502

In [38]:
# Back to the single 2010 summary file, for comparison with the directory count.
df = (
    spark.read
    .option("header", "true")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)
df.count()

255

In [39]:
# With only the header option set, every column (count included) is a string.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [5]:
# inferSchema=true asks Spark to derive column types from the data; the schema
# printed below shows count as integer rather than string.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [6]:
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Explicit schema: our own column names and types replace the names taken from
# the file header and the types Spark would otherwise infer.
custom_schema = StructType(
    [
        StructField(name="destination", dataType=StringType(), nullable=True),
        StructField(name="source", dataType=StringType(), nullable=True),
        StructField(name="total_flights", dataType=LongType(), nullable=True),
    ]
)

# The header option stays on so the file's first line is not returned as a row.
df = (
    spark.read
    .format("csv")
    .schema(custom_schema)
    .option("header", True)
    .load("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)
df.show(2)

+-------------+-------+-------------+
|  destination| source|total_flights|
+-------------+-------+-------------+
|United States|Romania|            1|
|United States|Ireland|          264|
+-------------+-------+-------------+
only showing top 2 rows



In [11]:
# Read with a custom field separator.
# NOTE(review): the earlier cells parse this same file with the default comma
# separator, so with sep="|" each line presumably lands in a single column --
# confirm that is the intended demonstration.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", "|")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\2010-summary.csv")
)

In [13]:
# Load the small sample file with header and type inference, and display all
# of its rows.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\sample_data.csv")
)
df.show()

+----+-------------+-----+
|name|      address|  age|
+----+-------------+-----+
| abc|123 some road| city|
+----+-------------+-----+



In [15]:
# Read a file whose address value contains the separator ("123 some road, city"
# stays in one column in the output below -- presumably because that value is
# wrapped in double quotes in the file).
#
# Quoted-field handling on read is governed by the `quote` option, set here
# explicitly to its default character. The previous `escapeQuotes` option was
# dropped: Spark documents it as a write-only CSV option, so it had no effect
# on this read and only suggested it was responsible for the result.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", "\"")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\sample_data.csv")
)
df.show()

+----+-------------------+-----+
|name|            address|  age|
+----+-------------------+-----+
| abc|      123 some road| city|
| abc|123 some road, city|   30|
+----+-------------------+-----+



In [18]:
# Using "\n" as the separator keeps each line whole: the result is one column
# (named after the raw header line) holding the unsplit text of every row.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", "\n")
    .csv("D:\\code\\spark\\spark-basics\\data\\flight-data\\csv\\sample_data1.csv")
)
df.show()

+---------+
|col1,col2|
+---------+
|  a,(1,2)|
|  b,(1,2)|
+---------+



In [31]:
# The single column is named after the raw header line ("col1,col2"); give it a
# plain name so it is easier to reference.
df2 = df.withColumnRenamed(
    "col1,col2",
    "data",
)

In [32]:
# Show the frame after its single column was renamed to "data".
df2.show()

+-------+
|   data|
+-------+
|a,(1,2)|
|b,(1,2)|
+-------+



In [33]:
from pyspark.sql.functions import col, expr, split

# Break each "data" value apart on commas, yielding one array column.
df3 = df2.select(
    split("data", ",")
)

In [45]:
# Split each "data" value on a literal "(" and keep the piece after it
# (element 1 of the resulting array).
# The split pattern is a regular expression, so "(" has to be escaped. A raw
# string passes the backslash through explicitly; the plain literal "\(" is an
# invalid Python escape sequence that only works by fallback and triggers a
# DeprecationWarning/SyntaxWarning on newer interpreters. The runtime value of
# the pattern is unchanged.
df2.select(split("data", r"\(")[1]).show()

+----------------------+
|split(data, \(, -1)[1]|
+----------------------+
|                  1,2)|
|                  1,2)|
+----------------------+

