In [1]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("MyApp") \
    .config("spark.driver.host", "localhost") \
    .config("spark.driver.bindAddress", "localhost") \
    .getOrCreate()

In [3]:
df=spark.read.csv(r"D:\Christ\Python\Datasets\gapminder_data_graphs.csv",inferSchema=True)
df.show()

+-----------+---------+----+--------+---------+-----------+----+--------+
|        _c0|      _c1| _c2|     _c3|      _c4|        _c5| _c6|     _c7|
+-----------+---------+----+--------+---------+-----------+----+--------+
|    country|continent|year|life_exp|hdi_index|co2_consump| gdp|services|
|Afghanistan|     Asia|1998|    53.3|    0.344|     0.0522|NULL|    24.4|
|Afghanistan|     Asia|1999|    54.7|    0.348|     0.0402|NULL|    24.6|
|Afghanistan|     Asia|2000|    54.7|     0.35|      0.037|NULL|    24.7|
|Afghanistan|     Asia|2001|    54.8|    0.353|     0.0376|NULL|    24.7|
|Afghanistan|     Asia|2002|    55.5|    0.384|     0.0471| 333|    25.6|
|Afghanistan|     Asia|2003|    56.5|    0.393|     0.0509| 346|    25.9|
|Afghanistan|     Asia|2004|    57.1|    0.409|     0.0368| 336|    26.1|
|Afghanistan|     Asia|2005|    57.6|    0.418|     0.0515| 360|    26.5|
|Afghanistan|     Asia|2006|      58|    0.429|     0.0622| 368|    26.9|
|Afghanistan|     Asia|2007|    58.5| 

In [15]:
print("\nColumn Names:")
print(df.columns)


Column Names:
['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7']


In [8]:
print("DataFrame Schema:")
df.printSchema()

DataFrame Schema:
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [4]:
selected_cols = df.select("_c0", "_c1")
selected_cols.show()

+-----------+---------+
|        _c0|      _c1|
+-----------+---------+
|    country|continent|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
|Afghanistan|     Asia|
+-----------+---------+
only showing top 20 rows



In [5]:
df_with_new_col = df.withColumn("new_column", df._c0 + 100)
df_with_new_col.show()

+-----------+---------+----+--------+---------+-----------+----+--------+----------+
|        _c0|      _c1| _c2|     _c3|      _c4|        _c5| _c6|     _c7|new_column|
+-----------+---------+----+--------+---------+-----------+----+--------+----------+
|    country|continent|year|life_exp|hdi_index|co2_consump| gdp|services|      NULL|
|Afghanistan|     Asia|1998|    53.3|    0.344|     0.0522|NULL|    24.4|      NULL|
|Afghanistan|     Asia|1999|    54.7|    0.348|     0.0402|NULL|    24.6|      NULL|
|Afghanistan|     Asia|2000|    54.7|     0.35|      0.037|NULL|    24.7|      NULL|
|Afghanistan|     Asia|2001|    54.8|    0.353|     0.0376|NULL|    24.7|      NULL|
|Afghanistan|     Asia|2002|    55.5|    0.384|     0.0471| 333|    25.6|      NULL|
|Afghanistan|     Asia|2003|    56.5|    0.393|     0.0509| 346|    25.9|      NULL|
|Afghanistan|     Asia|2004|    57.1|    0.409|     0.0368| 336|    26.1|      NULL|
|Afghanistan|     Asia|2005|    57.6|    0.418|     0.0515| 360| 

In [6]:
renamed_df = df.withColumnRenamed("_c0", "column_1")
renamed_df.show()

+-----------+---------+----+--------+---------+-----------+----+--------+
|   column_1|      _c1| _c2|     _c3|      _c4|        _c5| _c6|     _c7|
+-----------+---------+----+--------+---------+-----------+----+--------+
|    country|continent|year|life_exp|hdi_index|co2_consump| gdp|services|
|Afghanistan|     Asia|1998|    53.3|    0.344|     0.0522|NULL|    24.4|
|Afghanistan|     Asia|1999|    54.7|    0.348|     0.0402|NULL|    24.6|
|Afghanistan|     Asia|2000|    54.7|     0.35|      0.037|NULL|    24.7|
|Afghanistan|     Asia|2001|    54.8|    0.353|     0.0376|NULL|    24.7|
|Afghanistan|     Asia|2002|    55.5|    0.384|     0.0471| 333|    25.6|
|Afghanistan|     Asia|2003|    56.5|    0.393|     0.0509| 346|    25.9|
|Afghanistan|     Asia|2004|    57.1|    0.409|     0.0368| 336|    26.1|
|Afghanistan|     Asia|2005|    57.6|    0.418|     0.0515| 360|    26.5|
|Afghanistan|     Asia|2006|      58|    0.429|     0.0622| 368|    26.9|
|Afghanistan|     Asia|2007|    58.5| 

In [7]:
dropped_df = df.drop("_c0")
dropped_df.show()

+---------+----+--------+---------+-----------+----+--------+
|      _c1| _c2|     _c3|      _c4|        _c5| _c6|     _c7|
+---------+----+--------+---------+-----------+----+--------+
|continent|year|life_exp|hdi_index|co2_consump| gdp|services|
|     Asia|1998|    53.3|    0.344|     0.0522|NULL|    24.4|
|     Asia|1999|    54.7|    0.348|     0.0402|NULL|    24.6|
|     Asia|2000|    54.7|     0.35|      0.037|NULL|    24.7|
|     Asia|2001|    54.8|    0.353|     0.0376|NULL|    24.7|
|     Asia|2002|    55.5|    0.384|     0.0471| 333|    25.6|
|     Asia|2003|    56.5|    0.393|     0.0509| 346|    25.9|
|     Asia|2004|    57.1|    0.409|     0.0368| 336|    26.1|
|     Asia|2005|    57.6|    0.418|     0.0515| 360|    26.5|
|     Asia|2006|      58|    0.429|     0.0622| 368|    26.9|
|     Asia|2007|    58.5|    0.447|     0.0838| 409|    27.7|
|     Asia|2008|    59.2|    0.447|      0.152| 415|    28.2|
|     Asia|2009|    59.9|     0.46|      0.238| 492|    29.5|
|     As

In [9]:
sorted_df = df.orderBy("_c0")
sorted_df_desc = df.orderBy(df._c0.desc()) 
sorted_df.show()

+-----------+----+----+----+-----+------+----+----+
|        _c0| _c1| _c2| _c3|  _c4|   _c5| _c6| _c7|
+-----------+----+----+----+-----+------+----+----+
|Afghanistan|Asia|2018|62.7|0.509| 0.254| 569|38.4|
|Afghanistan|Asia|1998|53.3|0.344|0.0522|NULL|24.4|
|Afghanistan|Asia|1999|54.7|0.348|0.0402|NULL|24.6|
|Afghanistan|Asia|2000|54.7| 0.35| 0.037|NULL|24.7|
|Afghanistan|Asia|2001|54.8|0.353|0.0376|NULL|24.7|
|Afghanistan|Asia|2002|55.5|0.384|0.0471| 333|25.6|
|Afghanistan|Asia|2003|56.5|0.393|0.0509| 346|25.9|
|Afghanistan|Asia|2004|57.1|0.409|0.0368| 336|26.1|
|Afghanistan|Asia|2005|57.6|0.418|0.0515| 360|26.5|
|Afghanistan|Asia|2006|  58|0.429|0.0622| 368|26.9|
|Afghanistan|Asia|2007|58.5|0.447|0.0838| 409|27.7|
|Afghanistan|Asia|2008|59.2|0.447| 0.152| 415|28.2|
|Afghanistan|Asia|2009|59.9| 0.46| 0.238| 492|29.5|
|Afghanistan|Asia|2010|60.5|0.472|  0.29| 547|30.7|
|Afghanistan|Asia|2011|  61|0.477| 0.406| 533|31.7|
|Afghanistan|Asia|2012|61.4|0.489| 0.345| 580|32.8|
|Afghanistan

In [10]:
grouped_df = df.groupBy("_c0").count()
grouped_df.show()

+----------------+-----+
|             _c0|count|
+----------------+-----+
|            Chad|   21|
|        Paraguay|   21|
|          Russia|   21|
|           Yemen|   21|
|Congo, Dem. Rep.|   21|
|         Senegal|   21|
|          Sweden|   21|
|          Guyana|   21|
|     Philippines|   21|
|        Djibouti|   21|
|           Tonga|   21|
|        Malaysia|   21|
|       Singapore|   21|
|            Fiji|   21|
|          Turkey|   21|
|          Malawi|   21|
|            Iraq|   21|
|         Germany|   21|
|         Comoros|   21|
|     Afghanistan|   21|
+----------------+-----+
only showing top 20 rows



In [11]:
multi_filter = df.filter((df._c0 > 100) & (df._c1 < 500))
multi_filter.show()

+---+---+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|
+---+---+---+---+---+---+---+---+
+---+---+---+---+---+---+---+---+



In [12]:
distinct_vals = df.select("_c0").distinct()
distinct_vals.show()

+----------------+
|             _c0|
+----------------+
|            Chad|
|        Paraguay|
|          Russia|
|           Yemen|
|Congo, Dem. Rep.|
|         Senegal|
|          Sweden|
|          Guyana|
|     Philippines|
|        Djibouti|
|           Tonga|
|        Malaysia|
|       Singapore|
|            Fiji|
|          Turkey|
|          Malawi|
|            Iraq|
|         Germany|
|         Comoros|
|     Afghanistan|
+----------------+
only showing top 20 rows



In [13]:
sampled_df = df.sample(fraction=0.1)
sampled_df.show()

+-----------+-------------+----+----+-----+------+-----+----+
|        _c0|          _c1| _c2| _c3|  _c4|   _c5|  _c6| _c7|
+-----------+-------------+----+----+-----+------+-----+----+
|Afghanistan|         Asia|2001|54.8|0.353|0.0376| NULL|24.7|
|Afghanistan|         Asia|2003|56.5|0.393|0.0509|  346|25.9|
|Afghanistan|         Asia|2016|  62|0.502| 0.245|  575|36.9|
|    Albania|       Europe|2004|75.8|0.696|  1.34| 2520|36.4|
|    Albania|       Europe|2008|77.6|0.728|  1.46| 3300|37.6|
|    Algeria|       Africa|2006|73.3| 0.69|  2.98| 3760|55.4|
|    Algeria|       Africa|2016|75.7|0.743|  3.64| 4220|  59|
|    Algeria|       Africa|2017|75.9|0.745|  3.56| 4190|59.2|
|     Angola|       Africa|2000|52.8|  0.4| 0.581| 2450|53.9|
|     Angola|       Africa|2008|58.8|0.501|  1.18| 4080|47.6|
|  Argentina|South America|2000|74.2|0.781|  3.85|10700|76.6|
|  Argentina|South America|2004|74.9|0.787|  4.08|10400|75.8|
|  Argentina|South America|2011|  76|0.835|  4.61|14200|75.4|
|  Argen

In [14]:
df.describe().show()

+-------+-----------+---------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|summary|        _c0|      _c1|              _c2|              _c3|               _c4|              _c5|               _c6|               _c7|
+-------+-----------+---------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|  count|       3676|     3676|             3676|             3676|              3564|             3672|              3634|              3676|
|   mean|       NULL|     NULL|           2008.0|69.84930612244898|0.6748635980914944|4.712731272132951| 11966.05367464905| 51.24870476190484|
| stddev|       NULL|     NULL|6.056124726802348|8.886562763476881|0.1648336098993854|6.567434962786295|17105.787952854793|18.312500738495686|
|    min|Afghanistan|   Africa|             1998|             32.5|             0.255|           0.0159|              1000|              10.2|

In [16]:
filtered_df=df.filter(df._c2>2000)
filtered_df.show()

+-----------+------+----+----+-----+------+----+----+
|        _c0|   _c1| _c2| _c3|  _c4|   _c5| _c6| _c7|
+-----------+------+----+----+-----+------+----+----+
|Afghanistan|  Asia|2001|54.8|0.353|0.0376|NULL|24.7|
|Afghanistan|  Asia|2002|55.5|0.384|0.0471| 333|25.6|
|Afghanistan|  Asia|2003|56.5|0.393|0.0509| 346|25.9|
|Afghanistan|  Asia|2004|57.1|0.409|0.0368| 336|26.1|
|Afghanistan|  Asia|2005|57.6|0.418|0.0515| 360|26.5|
|Afghanistan|  Asia|2006|  58|0.429|0.0622| 368|26.9|
|Afghanistan|  Asia|2007|58.5|0.447|0.0838| 409|27.7|
|Afghanistan|  Asia|2008|59.2|0.447| 0.152| 415|28.2|
|Afghanistan|  Asia|2009|59.9| 0.46| 0.238| 492|29.5|
|Afghanistan|  Asia|2010|60.5|0.472|  0.29| 547|30.7|
|Afghanistan|  Asia|2011|  61|0.477| 0.406| 533|31.7|
|Afghanistan|  Asia|2012|61.4|0.489| 0.345| 580|32.8|
|Afghanistan|  Asia|2013|61.9|0.496|  0.28| 592|33.8|
|Afghanistan|  Asia|2014|61.9|  0.5| 0.253| 588|34.8|
|Afghanistan|  Asia|2015|61.9|  0.5| 0.262| 578|35.8|
|Afghanistan|  Asia|2016|  6