In [1]:
import pyspark 

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('Dataframe').getOrCreate()

In [4]:
spark

In [7]:
df_spark=spark.read.option('header','true').csv("test1.csv")

In [10]:
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [13]:
df_spark.summary()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [14]:
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [15]:
#since we know that we do have integer values in out dataframe but pyspark reads them as a string categoty, so we change this string type to integer type 

In [16]:
df_pyspark=spark.read.option('header','true',).csv("test1.csv",inferSchema=True)

In [22]:
df_pyspark

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [26]:
#Most optimal way of printing out a schema 

df=spark.read.csv('test1.csv',header=True,inferSchema=True)
df

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [27]:
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [28]:
type(df)

pyspark.sql.dataframe.DataFrame

In [29]:
df.columns

['Name', 'age', 'Experience', 'Salary']

In [31]:
df.head(4) #list of data 

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000),
 Row(Name='Paul', age=24, Experience=3, Salary=20000)]

In [32]:
df.select("Name").show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [34]:
type(df.select("Name"))

pyspark.sql.dataframe.DataFrame

In [36]:
df.select(["Name",'Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [40]:
df.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [41]:
df.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [45]:
df.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [50]:
### Adding columns in datasframe
df= df.withColumn("Experience after 2 years",df['Experience']+2).show() #Not an inplace option like in pandas , we need to save it in a variable 

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience after 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sudhanshu| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 23|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [59]:
### Deleting columns 
df_new = df.drop("Experience after 2 years").show()

AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
###Columns renaming 
df.withColumnrenamed('Name', 'New Name').show()