In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session named 'TestDataframe' that runs against the
# 'local' master.
spark = (
    SparkSession.builder
    .appName('TestDataframe')
    .master('local')
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook echoes the SparkSession object created above.
spark

In [3]:
# Build a two-row DataFrame from in-memory tuples, naming the columns
# explicitly, then print its contents.
data1 = [
    (1, 'a', 111),
    (2, 'b', 222),
]
columns = ['id', 'name', 'rno']
cdf = spark.createDataFrame(data1, schema=columns)
cdf.show()

+---+----+---+
| id|name|rno|
+---+----+---+
|  1|   a|111|
|  2|   b|222|
+---+----+---+



In [4]:
# Inspect the Python type of the object returned by createDataFrame.
type(cdf)

pyspark.sql.dataframe.DataFrame

In [5]:
# Print each column's name, type and nullability as a tree.
cdf.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- rno: long (nullable = true)



In [6]:
# Load the dataset: print the current working directory first, then read
# work/test.csv treating the first row as a header and inferring column types.
import os
print(os.getcwd())
df_sp = spark.read.csv('work/test.csv', header=True, inferSchema=True)

/home/jovyan


In [7]:
# Print the schema produced for the CSV (column names, types, nullability).
df_sp.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [8]:
# Read the CSV with its first row used as the header and show the rows.
# `header` is given once, through the csv() keyword; an additional
# .option('header', ...) call on the reader would set the same option again.
df_sp = spark.read.csv('work/test.csv', header=True, inferSchema=True)
df_sp.show()

+----+---+----------+
|name|age|experience|
+----+---+----------+
|  nq| 31|        10|
|  nh| 30|         8|
|  nj| 29|         4|
+----+---+----------+



In [9]:
# Project only the name and age columns and print them.
df_sp.select('name', 'age').show()

+----+---+
|name|age|
+----+---+
|  nq| 31|
|  nh| 30|
|  nj| 29|
+----+---+



In [10]:
# List every column name paired with its Spark SQL type name.
# This is the cell's only expression, so it is what the notebook displays.
df_sp.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [11]:
# Print summary statistics (count, mean, stddev, min, max) for each column.
df_sp.describe().show()

+-------+----+----+-----------------+
|summary|name| age|       experience|
+-------+----+----+-----------------+
|  count|   3|   3|                3|
|   mean|null|30.0|7.333333333333333|
| stddev|null| 1.0|3.055050463303893|
|    min|  nh|  29|                4|
|    max|  nq|  31|               10|
+-------+----+----+-----------------+



In [12]:
# Add a derived column holding each row's experience plus two, and rebind
# df_sp to the resulting DataFrame.
df_sp = df_sp.withColumn(
    'Experience after 2 years',
    df_sp['experience'] + 2,
)

In [13]:
# Print the rows, which now include the 'Experience after 2 years' column.
df_sp.show()

+----+---+----------+------------------------+
|name|age|experience|Experience after 2 years|
+----+---+----------+------------------------+
|  nq| 31|        10|                      12|
|  nh| 30|         8|                      10|
|  nj| 29|         4|                       6|
+----+---+----------+------------------------+



In [14]:
# Drop the derived 'Experience after 2 years' column, rebind df_sp to the
# result, and print the remaining columns.
df_sp = df_sp.drop('Experience after 2 years')
df_sp.show()

+----+---+----------+
|name|age|experience|
+----+---+----------+
|  nq| 31|        10|
|  nh| 30|         8|
|  nj| 29|         4|
+----+---+----------+



In [15]:
# Rename the 'name' column to 'Name', rebind df_sp, and print the result.
df_sp = df_sp.withColumnRenamed('name','Name')
df_sp.show()

+----+---+----------+
|Name|age|experience|
+----+---+----------+
|  nq| 31|        10|
|  nh| 30|         8|
|  nj| 29|         4|
+----+---+----------+

