In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook.
# NOTE(review): `sc` conventionally names a SparkContext; here it is bound to a
# SparkSession. The name is kept because the later read cells call `sc.read`.
sc = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's output
sc

In [5]:
# Load test1.csv, using its first line as the column names.
# No schema inference is requested, so every column is read as a string.
df = (
    sc.read
    .option('header', True)
    .csv('test1.csv')
)

In [6]:
# Preview the loaded rows
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
| Manidhar| 38|        14|
|     Siva| 37|        13|
|Manjunath| 28|         8|
+---------+---+----------+



In [7]:
# Check the schema: without inferSchema, Age and Experience come back as strings
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [8]:
# Re-read the file, this time asking Spark to infer column types from the data
df = (
    sc.read
    .option('header', True)
    .csv('test1.csv', inferSchema=True)
)

In [9]:
# Check the schema again: Age and Experience should now be typed as integers
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [10]:
# Same read once more, with both options passed as keyword arguments to csv()
df = sc.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
| Manidhar| 38|        14|
|     Siva| 37|        13|
|Manjunath| 28|         8|
+---------+---+----------+



In [11]:
# The keyword-argument read yields the same inferred schema
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [12]:
# Confirm the reader returned a PySpark DataFrame
type(df)

pyspark.sql.dataframe.DataFrame

In [13]:
# List the column names
df.columns

['Name', 'Age', 'Experience']

In [14]:
# Return the first three rows as a list of Row objects
df.head(3)

[Row(Name='Manidhar', Age=38, Experience=14),
 Row(Name='Siva', Age=37, Experience=13),
 Row(Name='Manjunath', Age=28, Experience=8)]

In [17]:
# Display the frame again (same contents as the previous show())
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
| Manidhar| 38|        14|
|     Siva| 37|        13|
|Manjunath| 28|         8|
+---------+---+----------+



In [20]:
# select() yields another DataFrame, even when only one column is requested
type(df.select('Name'))

pyspark.sql.dataframe.DataFrame

In [22]:
# Project a single column and display it
df.select('Name').show()

+---------+
|     Name|
+---------+
| Manidhar|
|     Siva|
|Manjunath|
+---------+



In [23]:
# Project two columns (names passed as separate arguments) and display them
df.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
| Manidhar|        14|
|     Siva|        13|
|Manjunath|         8|
+---------+----------+



In [24]:
# List (column name, type name) pairs
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [26]:
# Summary statistics: count, mean, stddev, min, max.
# The string column Name is included; its mean and stddev are null.
df.describe().show()

+-------+--------+------------------+------------------+
|summary|    Name|               Age|        Experience|
+-------+--------+------------------+------------------+
|  count|       3|                 3|                 3|
|   mean|    null|34.333333333333336|11.666666666666666|
| stddev|    null| 5.507570547286102|3.2145502536643185|
|    min|Manidhar|                28|                 8|
|    max|    Siva|                38|                14|
+-------+--------+------------------+------------------+



In [28]:
# Add a derived column: each person's experience two years from now.
# NOTE(review): 'Experiance' is misspelled in the column name; it is kept
# verbatim because the later drop() cell uses the same string.
df = df.withColumn('Experiance After 2 year', df.Experience + 2)

In [29]:
# Verify the new column was added
df.show()

+---------+---+----------+-----------------------+
|     Name|Age|Experience|Experiance After 2 year|
+---------+---+----------+-----------------------+
| Manidhar| 38|        14|                     16|
|     Siva| 37|        13|                     15|
|Manjunath| 28|         8|                     10|
+---------+---+----------+-----------------------+



In [30]:
# Drop the derived column again; the result is reassigned to df
df= df.drop('Experiance After 2 year')

In [31]:
# Verify the column is gone
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
| Manidhar| 38|        14|
|     Siva| 37|        13|
|Manjunath| 28|         8|
+---------+---+----------+



In [32]:
# Rename the 'Experience' column to the shorter 'Exp', then display the result
df = df.withColumnRenamed(existing='Experience', new='Exp')
df.show()

+---------+---+---+
|     Name|Age|Exp|
+---------+---+---+
| Manidhar| 38| 14|
|     Siva| 37| 13|
|Manjunath| 28|  8|
+---------+---+---+

