# DataFrame Rows and Columns

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session for this notebook, labelled 'RowCol'.
session_builder = SparkSession.builder.appName('RowCol')
spark1 = session_builder.getOrCreate()

### Column Objects

In [2]:
# Read the sample people file; no schema is passed, so Spark infers the column types.
people_path = 'people.json'
df = spark1.read.json(people_path)

#### Check which columns the DataFrame has

In [3]:
# Column names come back as a plain Python list of strings.
df.columns

['age', 'name']

#### Type of a column object

In [4]:
# Indexing a DataFrame by column name yields a Column object, not the data itself.
age_col = df['age']
type(age_col)

pyspark.sql.column.Column

#### Extract a column as a DataFrame

In [5]:
# select() returns a new single-column DataFrame; only its schema is echoed here.
df.select('age')

DataFrame[age: bigint]

In [6]:
# show() prints the selected column as a text table; missing ages appear as null.
age_only = df.select('age')
age_only.show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



### Row Objects

In [7]:
# head(2) returns the first two records as a Python list of Row objects.
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [8]:
# head(2) gives a plain Python list, so ordinary indexing picks out one Row.
first_two_rows = df.head(2)
first_two_rows[0]

Row(age=None, name='Michael')

#### Data type of a row

In [9]:
# Each element of the list returned by head() is a Row object.
first_row = df.head(2)[0]
type(first_row)

pyspark.sql.types.Row

#### Rows can be converted to dictionaries

In [11]:
# asDict() turns the Row's fields into an ordinary {column name: value} dict.
first_row = df.head(2)[0]
row_dict = first_row.asDict()

In [12]:
# Display the dictionary built from the first row.
row_dict

{'age': None, 'name': 'Michael'}

### Create new columns from existing columns

In [13]:
# withColumn() returns a new DataFrame with the extra column; df is left unchanged.
age_doubled = df.withColumn('doubleAge', df['age'] * 2)
age_doubled.show()

+----+-------+---------+
| age|   name|doubleAge|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



#### Rename existing columns

In [16]:
# Renaming produces a new DataFrame rather than editing df in place.
age_renamed = df.withColumnRenamed('age', 'newAge')
age_renamed.show()

+------+-------+
|newAge|   name|
+------+-------+
|  null|Michael|
|    30|   Andy|
|    19| Justin|
+------+-------+



#### Operations with multiple columns

In [17]:
# Dividing the integer age column by 2 yields a fractional column.
half_age = df['age'] / 2
df1 = df.withColumn('halfAge', half_age)

In [18]:
# Display df1, which now carries the halfAge column.
df1.show()

+----+-------+-------+
| age|   name|halfAge|
+----+-------+-------+
|null|Michael|   null|
|  30|   Andy|   15.0|
|  19| Justin|    9.5|
+----+-------+-------+



In [20]:
# A column expression can combine several columns; rows with null inputs stay null.
age_plus_half = df1['age'] + df1['halfAge']
df1 = df1.withColumn('newAge', age_plus_half)

In [21]:
# Display df1 again to see the added newAge column.
df1.show()

+----+-------+-------+------+
| age|   name|halfAge|newAge|
+----+-------+-------+------+
|null|Michael|   null|  null|
|  30|   Andy|   15.0|  45.0|
|  19| Justin|    9.5|  28.5|
+----+-------+-------+------+



#### Data types of new columns are inferred automatically