In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 70kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 42.7MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=65f844c1295b59bba91b4a2ebec785e345379f0a4e3f6fd84a54f55b472659a4
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


#### Creating a PySpark session

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build a session named 'DataFrame_Practice', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame_Practice')
    .getOrCreate()
)

In [6]:
# Evaluate the session object so the notebook renders its representation.
spark

In [9]:
import pandas as pd

In [7]:
# Sample records, one inner list per person.
data = [
    ['Tom', 54, 120000],
    ['Harry', 23, 40000],
    ['Charles', 45, 90000],
    ['Wanda', 32, 50000],
]

In [10]:
# Wrap the sample records in a pandas DataFrame with named columns.
df = pd.DataFrame(data, columns=['Name', 'Age', 'Salary'])

In [11]:
# Write the sample data to disk so Spark can read it back.
# NOTE(review): the pandas index is written too, as a column with an empty
# header; that is the `_c0` column (values 0-3) seen in every Spark output
# below. Passing index=False would avoid it, but would change those outputs.
df.to_csv('test_df.csv')

#### Reading the dataset

In [15]:
# Read the CSV, treating the first line as column names.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test_df.csv')
)

In [13]:
# Read the same file again and print it directly, without keeping a reference.
(
    spark.read
    .option('header', 'true')
    .csv('test_df.csv')
    .show()
)

+---+-------+---+------+
|_c0|   Name|Age|Salary|
+---+-------+---+------+
|  0|    Tom| 54|120000|
|  1|  Harry| 23| 40000|
|  2|Charles| 45| 90000|
|  3|  Wanda| 32| 50000|
+---+-------+---+------+



In [16]:
# Check the schema: read without inferSchema, every column comes back as string.
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Salary: string (nullable = true)



#### Two ways to read CSV

In [40]:
# First way: set the header through option() and pass inferSchema to csv().
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test_df.csv', inferSchema=True)
)

In [19]:
# With inferSchema=True the numeric columns are now typed as integer.
df_pyspark.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [53]:
# Second way: pass header and inferSchema as keyword arguments to csv().
df_pyspark = spark.read.csv(
    'test_df.csv',
    header=True,
    inferSchema=True,
)

In [25]:
# The keyword-argument form yields the same inferred schema as the option() form.
df_pyspark.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [26]:
# Confirm this is a Spark DataFrame (pyspark.sql.dataframe.DataFrame), not a pandas one.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

#### Get column names

In [27]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['_c0', 'Name', 'Age', 'Salary']

In [28]:
# head() with no argument returns the first record as a single Row.
df_pyspark.head()

Row(_c0=0, Name='Tom', Age=54, Salary=120000)

In [30]:
# head(n) returns a list containing the first n Row objects.
df_pyspark.head(3)

[Row(_c0=0, Name='Tom', Age=54, Salary=120000),
 Row(_c0=1, Name='Harry', Age=23, Salary=40000),
 Row(_c0=2, Name='Charles', Age=45, Salary=90000)]

In [42]:
# show() prints the rows as a formatted text table.
df_pyspark.show()

+---+-------+---+------+
|_c0|   Name|Age|Salary|
+---+-------+---+------+
|  0|    Tom| 54|120000|
|  1|  Harry| 23| 40000|
|  2|Charles| 45| 90000|
|  3|  Wanda| 32| 50000|
+---+-------+---+------+



#### Select dataframe column

In [33]:
# select() only returns a new DataFrame (its repr is displayed); no rows are printed.
df_pyspark.select('Name')

DataFrame[Name: string]

In [34]:
# Chain show() onto the selection to print the rows of the chosen column.
df_pyspark.select('Name').show()

+-------+
|   Name|
+-------+
|    Tom|
|  Harry|
|Charles|
|  Wanda|
+-------+



In [35]:
# Several column names can be passed to select() to project multiple columns.
df_pyspark.select('Name','Age').show()

+-------+---+
|   Name|Age|
+-------+---+
|    Tom| 54|
|  Harry| 23|
|Charles| 45|
|  Wanda| 32|
+-------+---+



#### DataTypes

In [36]:
# dtypes lists (column name, type name) pairs.
df_pyspark.dtypes

[('_c0', 'int'), ('Name', 'string'), ('Age', 'int'), ('Salary', 'int')]

In [37]:
# describe() returns another DataFrame of summary statistics; every column in it is a string.
df_pyspark.describe()

DataFrame[summary: string, _c0: string, Name: string, Age: string, Salary: string]

In [38]:
# Print count/mean/stddev/min/max per column; mean and stddev are null for the string column Name.
df_pyspark.describe().show()

+-------+------------------+-------+------------------+-----------------+
|summary|               _c0|   Name|               Age|           Salary|
+-------+------------------+-------+------------------+-----------------+
|  count|                 4|      4|                 4|                4|
|   mean|               1.5|   null|              38.5|          75000.0|
| stddev|1.2909944487358056|   null|13.723459233492601|36968.45502136472|
|    min|                 0|Charles|                23|            40000|
|    max|                 3|  Wanda|                54|           120000|
+-------+------------------+-------+------------------+-----------------+



#### Add or Drop Columns

In [54]:
# Derive a new column holding each salary increased by 5%.
bonus_salary = df_pyspark['Salary'] * 1.05
df_pyspark = df_pyspark.withColumn('Salary with Bonus', bonus_salary)

In [55]:
# The derived 'Salary with Bonus' column now appears next to the original columns.
df_pyspark.show()

+---+-------+---+------+-----------------+
|_c0|   Name|Age|Salary|Salary with Bonus|
+---+-------+---+------+-----------------+
|  0|    Tom| 54|120000|         126000.0|
|  1|  Harry| 23| 40000|          42000.0|
|  2|Charles| 45| 90000|          94500.0|
|  3|  Wanda| 32| 50000|          52500.0|
+---+-------+---+------+-----------------+



In [56]:
# drop() returns a new DataFrame; the result is only displayed here, not assigned back.
df_pyspark.drop('Salary with Bonus').show()

+---+-------+---+------+
|_c0|   Name|Age|Salary|
+---+-------+---+------+
|  0|    Tom| 54|120000|
|  1|  Harry| 23| 40000|
|  2|Charles| 45| 90000|
|  3|  Wanda| 32| 50000|
+---+-------+---+------+



In [57]:
# df_pyspark still has the bonus column because the dropped result was never assigned.
df_pyspark.show()

+---+-------+---+------+-----------------+
|_c0|   Name|Age|Salary|Salary with Bonus|
+---+-------+---+------+-----------------+
|  0|    Tom| 54|120000|         126000.0|
|  1|  Harry| 23| 40000|          42000.0|
|  2|Charles| 45| 90000|          94500.0|
|  3|  Wanda| 32| 50000|          52500.0|
+---+-------+---+------+-----------------+



In [58]:
# Reassign the result so the column removal actually persists on df_pyspark.
df_pyspark = df_pyspark.drop('Salary with Bonus')

In [59]:
# Verify that the bonus column is gone after the reassignment.
df_pyspark.show()

+---+-------+---+------+
|_c0|   Name|Age|Salary|
+---+-------+---+------+
|  0|    Tom| 54|120000|
|  1|  Harry| 23| 40000|
|  2|Charles| 45| 90000|
|  3|  Wanda| 32| 50000|
+---+-------+---+------+



#### Rename Column

In [60]:
# withColumnRenamed() returns a new DataFrame; it is only displayed here, so
# df_pyspark itself keeps the original 'Name' column.
df_pyspark.withColumnRenamed('Name','First Name').show()

+---+----------+---+------+
|_c0|First Name|Age|Salary|
+---+----------+---+------+
|  0|       Tom| 54|120000|
|  1|     Harry| 23| 40000|
|  2|   Charles| 45| 90000|
|  3|     Wanda| 32| 50000|
+---+----------+---+------+

