In [1]:
#create spark session
import findspark
findspark.init()

from pyspark.sql import SparkSession

def create_session(appName):
    """Return the SparkSession for this application, creating it if needed.

    Args:
        appName: Application name passed to the session builder.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.

    Raises:
        Exception: Whatever the builder raised. The error is printed and then
            re-raised so the caller never receives ``None`` in place of a
            session.
    """
    try:
        return SparkSession.builder.appName(appName).getOrCreate()
    except Exception as e:
        print("Error getting or creating Spark Session", str(e))
        # Propagate the failure: swallowing it would make this function return
        # None and defer the crash to the first use of the session.
        raise
        
# Session shared by every cell below; 'fifa19' is the Spark application name.
spark = create_session(appName="fifa19")

In [2]:
# Relative path of the FIFA 19 CSV file loaded by the read cell that follows.
dataset_path = 'dataset/fifa19.csv'

In [3]:
# Load the CSV into a DataFrame, using its first line as the column names.
fifa_df = spark.read.option("header", True).csv(dataset_path)

In [4]:
# Keep a handful of identifying columns and display 5 sample rows untruncated.
selected_columns = ["ID", "Name", "Age", "Nationality"]
df = fifa_df.select(*selected_columns)
df.limit(5).show(truncate=False)

+------+-----------------+---+-----------+
|ID    |Name             |Age|Nationality|
+------+-----------------+---+-----------+
|158023|L. Messi         |31 |Argentina  |
|20801 |Cristiano Ronaldo|33 |Portugal   |
|190871|Neymar Jr        |26 |Brazil     |
|193080|De Gea           |27 |Spain      |
|192985|K. De Bruyne     |27 |Belgium    |
+------+-----------------+---+-----------+



In [5]:
# Print the schema of the selected columns to inspect their data types.
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Nationality: string (nullable = true)



In [7]:
#cast the ID and Age columns from string to integer
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# ID and Age were loaded as strings; replace each with an integer-typed column.
int_type = IntegerType()
df = (
    df
    .withColumn("ID", col("ID").cast(int_type))
    .withColumn("Age", col("Age").cast(int_type))
)

# Confirm the new column types.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Nationality: string (nullable = true)

