In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
## start (or reuse) the Spark session, registered under the app name 'Dataframe';
## every later cell reads data through this `spark` handle
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [37]:
## read dataset - the first CSV line is used as column names; no schema inference
## is requested here, so every column comes back as a string (see printSchema below)
df = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv')
)

In [10]:
## print the loaded rows as a text table
df.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Giuseppe| 23|        10|
|   Mario| 45|         8|
|   Fabio| 65|         4|
+--------+---+----------+



In [11]:
## check schema (data types) - prints each column's name, type and nullability;
## with the header-only read above, all three columns are reported as string
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [13]:
## re-read the same file, this time asking Spark to infer column types so that
## numeric columns load as integers instead of strings
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('test2.csv')
)

In [14]:
## verify the inferred types: Age and Experience are now integer, Name stays string
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [15]:
## shorter version - header and inferSchema passed straight to csv() as keyword
## arguments instead of chaining .option() calls
df = spark.read.csv('test2.csv', header=True, inferSchema=True)

In [16]:
## same rows as the earlier reads - only the column types differ
df.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Giuseppe| 23|        10|
|   Mario| 45|         8|
|   Fabio| 65|         4|
+--------+---+----------+



In [17]:
## the short-form read yields the same schema as the chained .option() version
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [19]:
## confirm the object is a Spark DataFrame (pyspark.sql.dataframe.DataFrame)
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
########## inspecting and reshaping the DataFrame ##########

In [20]:
## list the column names as a plain Python list of strings
df.columns

['Name', 'Age', 'Experience']

In [21]:
## first 2 rows - head(n) returns a Python list of Row objects, not a DataFrame
df.head(2)

[Row(Name='Giuseppe', Age=23, Experience=10),
 Row(Name='Mario', Age=45, Experience=8)]

In [23]:
## project a single column; select() returns a new DataFrame, so .show() can be chained
df.select(df['Name']).show()

+--------+
|    Name|
+--------+
|Giuseppe|
|   Mario|
|   Fabio|
+--------+



In [25]:
## project several columns at once by passing each column name as its own argument
df.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|Giuseppe|        10|
|   Mario|         8|
|   Fabio|         4|
+--------+----------+



In [26]:
## check data types - returns a list of (column name, type name) tuples
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [27]:
## describe function - summary table with count, mean, stddev, min and max per column.
## For the string column Name, mean/stddev are null and min/max are the
## alphabetically first/last values.
df.describe().show()

+-------+-----+------------------+-----------------+
|summary| Name|               Age|       Experience|
+-------+-----+------------------+-----------------+
|  count|    3|                 3|                3|
|   mean| null|44.333333333333336|7.333333333333333|
| stddev| null|21.007935008784973|3.055050463303893|
|    min|Fabio|                23|                4|
|    max|Mario|                65|               10|
+-------+-----+------------------+-----------------+



In [38]:
## add a derived column - withColumn returns a new DataFrame, so rebind df to keep it
## NOTE(review): the recorded output below shows 12.0/10.0/6.0 (double), and the
## execution counts (In [37] -> In [38]) show this cell last ran right after the
## header-only read, where Experience is a string. On a top-to-bottom run df comes
## from the inferSchema read instead - expect integer results; confirm with
## Restart & Run All.
df = df.withColumn('Experience after 2 years', df.Experience + 2)

In [39]:
## confirm the 'Experience after 2 years' column was appended
df.show()

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience after 2 years|
+--------+---+----------+------------------------+
|Giuseppe| 23|        10|                    12.0|
|   Mario| 45|         8|                    10.0|
|   Fabio| 65|         4|                     6.0|
+--------+---+----------+------------------------+



In [40]:
## remove the derived column again - drop returns a new DataFrame, so rebind df
## to make the change stick
df = df.drop('Experience after 2 years')

In [41]:
## confirm the derived column is gone and the original three columns remain
df.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Giuseppe| 23|        10|
|   Mario| 45|         8|
|   Fabio| 65|         4|
+--------+---+----------+



In [44]:
## rename a column - the result is a new DataFrame, so it is assigned back to df
df = df.withColumnRenamed(existing='Name', new='New Name')

In [45]:
## confirm the first column header now reads 'New Name'
df.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|Giuseppe| 23|        10|
|   Mario| 45|         8|
|   Fabio| 65|         4|
+--------+---+----------+



In [46]:
## undo the rename so the column is called 'Name' again
df = df.withColumnRenamed(existing='New Name', new='Name')

In [47]:
## confirm the column header is back to 'Name'
df.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Giuseppe| 23|        10|
|   Mario| 45|         8|
|   Fabio| 65|         4|
+--------+---+----------+

