In [5]:
import os

import findspark

# Locate the Spark installation. An explicit SPARK_HOME environment variable
# takes precedence so the notebook can run on machines where Spark is not
# installed at the original author's path; otherwise fall back to that path.
DEFAULT_SPARK_HOME = r"C:\Spark\spark-3.2.3-bin-hadoop3.2"
findspark.init(os.environ.get("SPARK_HOME", DEFAULT_SPARK_HOME))

In [6]:
import urllib.request
from zipfile import ZipFile
from io import TextIOWrapper
import csv
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType

In [7]:
# Download the World Bank WDI archive and read WDIData.csv into `rows`,
# a list in which every element is one CSV record (a list of strings).
url = "https://databank.worldbank.org/data/download/WDI_CSV.zip"
# With no target filename, urlretrieve stores the download in a temporary
# file and returns that file's path as the first tuple element.
filehandle, _ = urllib.request.urlretrieve(url)

target_file = "WDIData.csv"
with ZipFile(filehandle, "r") as zf:
    # Open the member by name: if the archive does not contain it, this raises
    # KeyError instead of silently parsing whichever member happens to be first.
    with zf.open(target_file, "r") as infile:
        # newline="" lets the csv module do its own line-ending handling, so
        # line breaks embedded in quoted fields are preserved correctly.
        # NOTE: decoding stays plain "utf-8", which keeps a leading BOM in the
        # first header cell; later cells in this notebook see that exact header
        # text, so the decoding is intentionally left unchanged here.
        reader = csv.reader(TextIOWrapper(infile, encoding="utf-8", newline=""))
        rows = list(reader)

In [8]:
# The first parsed record holds the column names; everything after it is data.
headers, data = rows[0], rows[1:]

In [9]:
# Describe every CSV column as a nullable string field, one field per header,
# and wrap the fields in a StructType to use as the DataFrame schema.
structure = [
    StructField(name=column_name, dataType=StringType(), nullable=True)
    for column_name in headers
]
schema = StructType(structure)

In [10]:
# Obtain the SparkSession for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("world_data_bank")
    .getOrCreate()
)

In [11]:
# Build the DataFrame from the parsed rows using the all-string schema, then
# discard the column whose name is the empty string (presumably produced by a
# trailing delimiter in the header row — confirm against the raw file).
df = (
    spark.createDataFrame(data=data, schema=schema)
    .drop("")
)
df.printSchema()

root
 |-- ﻿"Country Name": string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Indicator Name: string (nullable = true)
 |-- Indicator Code: string (nullable = true)
 |-- 1960: string (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (n

In [12]:
# Give the four identifier columns snake_case names.
# The first header may arrive with a leading byte-order mark (U+FEFF) and
# literal double quotes around it. Matching on a cleaned copy of each name
# avoids embedding that invisible character in a string literal, where it is
# easily lost and where withColumnRenamed would then silently do nothing.
renames = {
    "Country Name": "country_name",
    "Country Code": "country_code",
    "Indicator Name": "indicator_name",
    "Indicator Code": "indicator_code",
}
for column_name in df.columns:
    cleaned = column_name.lstrip("\ufeff").strip('"')
    if cleaned in renames:
        df = df.withColumnRenamed(column_name, renames[cleaned])
df.printSchema()

root
 |-- country_name: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- indicator_name: string (nullable = true)
 |-- indicator_code: string (nullable = true)
 |-- 1960: string (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (null

In [15]:
# Preview the first three rows of the renamed DataFrame.
df.show(3)

+--------------------+------------+--------------------+-----------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----+
|        country_name|country_code|      indicator_name|   indicator_code|1960|1961|1962|1963|1964|1965|1966|1967|1968|1969|1970|1971|1972|1973|1974|1975|1976|1977|1978|1979|1980|1981|1982|1983|1984|1985|1986|1987|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998|1999|            2000|            2001|            2002|            2003|            2004|    