In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Dataframes').getOrCreate()

In [3]:
spark

In [4]:
## read the dataset
df_pyspark = spark.read.option('header','true').csv('game_of_thrones.csv', inferSchema=True)

In [5]:
## Check the schema
df_pyspark.printSchema()

root
 |-- Season: integer (nullable = true)
 |-- No. of Episode (Season): integer (nullable = true)
 |-- No. of Episode (Overall): integer (nullable = true)
 |-- Title of the Episode: string (nullable = true)
 |-- Running Time (Minutes): integer (nullable = true)
 |-- Directed by: string (nullable = true)
 |-- Written by: string (nullable = true)
 |-- Original Air Date: string (nullable = true)
 |-- U.S. Viewers (Millions): double (nullable = true)
 |-- Music by: string (nullable = true)
 |-- Cinematography by: string (nullable = true)
 |-- Editing by: string (nullable = true)
 |-- IMDb Rating: double (nullable = true)
 |-- Rotten Tomatoes Rating (Percentage): integer (nullable = true)
 |-- Metacritic Ratings: double (nullable = true)
 |-- Ordered: string (nullable = true)
 |-- Filming Duration: string (nullable = true)
 |-- Novel(s) Adapted: string (nullable = true)
 |-- Synopsis: string (nullable = true)



In [6]:
df_pyspark = spark.read.csv('game_of_thrones.csv', header=True, inferSchema=True)
df_pyspark.show()

+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------------------------+------------------+--------------+-------------------+-----------------+--------------------+
|Season|No. of Episode (Season)|No. of Episode (Overall)|Title of the Episode|Running Time (Minutes)|   Directed by|          Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|       Editing by|IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|   Filming Duration| Novel(s) Adapted|            Synopsis|
+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------

In [7]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [8]:
df_pyspark.head(3)

[Row(Season=1, No. of Episode (Season)=1, No. of Episode (Overall)=1, Title of the Episode='Winter Is Coming', Running Time (Minutes)=61, Directed by='Tim Van Patten', Written by='David Benioff, D. B. Weiss', Original Air Date='17-Apr-2011', U.S. Viewers (Millions)=2.22, Music by='Ramin Djawadi', Cinematography by='Alik Sakharov', Editing by='Oral Norrie Ottey', IMDb Rating=8.9, Rotten Tomatoes Rating (Percentage)=100, Metacritic Ratings=9.1, Ordered='March 2, 2010', Filming Duration='Second half of 2010', Novel(s) Adapted='A Game of Thrones', Synopsis='"North of the Seven Kingdoms of Westeros, Night\'s Watch soldiers are attacked by supernatural White Walkers. One soldier escapes but is captured at Castle Winterfell. Eddard ""Ned"" Stark'),
 Row(Season=1, No. of Episode (Season)=2, No. of Episode (Overall)=2, Title of the Episode='The Kingsroad', Running Time (Minutes)=55, Directed by='Tim Van Patten', Written by='David Benioff, D. B. Weiss', Original Air Date='24-Apr-2011', U.S. View

In [9]:
df_pyspark.show()

+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------------------------+------------------+--------------+-------------------+-----------------+--------------------+
|Season|No. of Episode (Season)|No. of Episode (Overall)|Title of the Episode|Running Time (Minutes)|   Directed by|          Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|       Editing by|IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|   Filming Duration| Novel(s) Adapted|            Synopsis|
+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------

In [10]:
df_pyspark.select(['Title of the Episode','IMDb Rating']).show()

+--------------------+-----------+
|Title of the Episode|IMDb Rating|
+--------------------+-----------+
|    Winter Is Coming|        8.9|
|       The Kingsroad|        8.6|
|           Lord Snow|        8.5|
|Cripples, Bastard...|        8.6|
|The Wolf and the ...|        9.0|
|      A Golden Crown|        9.1|
|  You Win or You Die|        9.1|
|      The Pointy End|        8.9|
|              Baelor|        9.6|
|      Fire and Blood|        9.4|
| The North Remembers|        8.6|
|     The Night Lands|        8.4|
|What Is Dead May ...|        8.7|
|     Garden of Bones|        8.6|
|The Ghost of Harr...|        8.6|
|The Old Gods and ...|        8.9|
| A Man Without Honor|        8.8|
|The Prince of Win...|        8.6|
|          Blackwater|        9.6|
|     Valar Morghulis|        9.3|
+--------------------+-----------+
only showing top 20 rows



In [11]:
df_pyspark['Title of the Episode']

Column<'Title of the Episode'>

In [12]:
df_pyspark.dtypes

[('Season', 'int'),
 ('No. of Episode (Season)', 'int'),
 ('No. of Episode (Overall)', 'int'),
 ('Title of the Episode', 'string'),
 ('Running Time (Minutes)', 'int'),
 ('Directed by', 'string'),
 ('Written by', 'string'),
 ('Original Air Date', 'string'),
 ('U.S. Viewers (Millions)', 'double'),
 ('Music by', 'string'),
 ('Cinematography by', 'string'),
 ('Editing by', 'string'),
 ('IMDb Rating', 'double'),
 ('Rotten Tomatoes Rating (Percentage)', 'int'),
 ('Metacritic Ratings', 'double'),
 ('Ordered', 'string'),
 ('Filming Duration', 'string'),
 ('Novel(s) Adapted', 'string'),
 ('Synopsis', 'string')]

In [13]:
df_pyspark.describe().show()

+-------+-----------------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------+-----------------+-----------------------+-------------+-----------------+-------------+------------------+-----------------------------------+------------------+--------------+--------------------+--------------------+--------------------+
|summary|           Season|No. of Episode (Season)|No. of Episode (Overall)|Title of the Episode|Running Time (Minutes)|   Directed by|    Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|   Editing by|       IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|    Filming Duration|    Novel(s) Adapted|            Synopsis|
+-------+-----------------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------+-----------------+-----------------------+-------------+------------

In [14]:
## Adding Columns in data frame
df_pyspark = df_pyspark.withColumn('Running Time (Hours)',df_pyspark['Running Time (Minutes)']/60)

In [15]:
df_pyspark.show()

+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------------------------+------------------+--------------+-------------------+-----------------+--------------------+--------------------+
|Season|No. of Episode (Season)|No. of Episode (Overall)|Title of the Episode|Running Time (Minutes)|   Directed by|          Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|       Editing by|IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|   Filming Duration| Novel(s) Adapted|            Synopsis|Running Time (Hours)|
+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----

In [16]:
## Drop the columns
df_pyspark = df_pyspark.drop('Running Time (Hours)')

In [17]:
df_pyspark.show()

+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------------------------+------------------+--------------+-------------------+-----------------+--------------------+
|Season|No. of Episode (Season)|No. of Episode (Overall)|Title of the Episode|Running Time (Minutes)|   Directed by|          Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|       Editing by|IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|   Filming Duration| Novel(s) Adapted|            Synopsis|
+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------

In [18]:
## Rename the columns
df_pyspark.withColumnRenamed('Title of the Episode','Name of the Episode').show()

+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------------------------+------------------+--------------+-------------------+-----------------+--------------------+
|Season|No. of Episode (Season)|No. of Episode (Overall)| Name of the Episode|Running Time (Minutes)|   Directed by|          Written by|Original Air Date|U.S. Viewers (Millions)|     Music by|Cinematography by|       Editing by|IMDb Rating|Rotten Tomatoes Rating (Percentage)|Metacritic Ratings|       Ordered|   Filming Duration| Novel(s) Adapted|            Synopsis|
+------+-----------------------+------------------------+--------------------+----------------------+--------------+--------------------+-----------------+-----------------------+-------------+-----------------+-----------------+-----------+-----------------