In [1]:
#create spark session
import findspark
findspark.init()

from pyspark.sql import SparkSession

def create_session(appName):
    """Create and return a SparkSession, creating one if it does not exist"""
    try:
        spark = SparkSession.builder \
            .appName(appName) \
            .getOrCreate()
        return spark
    except Exception as e:
        print("Error getting or creating Spark Session", str(e))
        
spark = create_session('fifa19')

In [2]:
dataset_path = 'dataset/fifa19.csv'

In [3]:
#read dataset
fifa_df = spark.read.csv(dataset_path, header=True)

In [4]:
#print 5 sample
df = fifa_df.select("ID", "Name", "Age", "Nationality")
df.limit(5).show(truncate=False)

+------+-----------------+---+-----------+
|ID    |Name             |Age|Nationality|
+------+-----------------+---+-----------+
|158023|L. Messi         |31 |Argentina  |
|20801 |Cristiano Ronaldo|33 |Portugal   |
|190871|Neymar Jr        |26 |Brazil     |
|193080|De Gea           |27 |Spain      |
|192985|K. De Bruyne     |27 |Belgium    |
+------+-----------------+---+-----------+



In [5]:
#print schema
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Nationality: string (nullable = true)



In [6]:
#modify data struct
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

df = df.withColumn("ID", col("ID").cast(IntegerType()))\
    .withColumn( "Age", col("Age").cast(IntegerType()))
    
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Nationality: string (nullable = true)



In [7]:
from pyspark.sql.functions import desc, asc
df.orderBy(desc("Age")).show()
print('##################################################################################################################################')
df.orderBy(asc("Age")).show()


+------+---------------+---+-----------------+
|    ID|           Name|Age|      Nationality|
+------+---------------+---+-----------------+
|140029|       O. Pérez| 45|           Mexico|
| 51963|      T. Warner| 44|Trinidad & Tobago|
| 53748|  K. Pilkington| 44|          England|
|140183|    S. Narazaki| 42|            Japan|
|156092|      J. Villar| 41|         Paraguay|
|208927|   H. Sulaimani| 41|     Saudi Arabia|
|  3665|       B. Nivet| 41|           France|
| 18745|       M. Tyler| 41|          England|
|142998|       C. Muñoz| 41|        Argentina|
|176540|    B. Castillo| 40|         Colombia|
|100899|    S. Phillips| 40|          England|
| 49511|       F. Kippe| 40|           Norway|
|  1179|      G. Buffon| 40|            Italy|
|214372|        W. Díaz| 40|         Colombia|
| 14907|    A. Bizzarri| 40|        Argentina|
|156480|    Y. Nakazawa| 40|            Japan|
|153260|         Hilton| 40|           Brazil|
| 28157|P. van der Vlag| 40|      Netherlands|
|232543|     

In [8]:
#groupBy command need to use together with a statistic  function.
#Here, I will show you how to calculate the mean and standard deviation of each group using 'groupBy' and 'apply'.
#Here, we will use the groupBy

group_df = df.groupBy("Nationality").count()
group_df.show()

+--------------------+-----+
|         Nationality|count|
+--------------------+-----+
|                Chad|    2|
|              Russia|   79|
|            Paraguay|   85|
|             Senegal|  130|
|              Sweden|  397|
|         Philippines|    2|
|             Eritrea|    2|
|                Fiji|    1|
|              Turkey|  303|
|                Iraq|    7|
|             Germany| 1198|
|      St Kitts Nevis|    3|
|             Comoros|    6|
|         Ivory Coast|  100|
|              France|  914|
|              Greece|  102|
|              Kosovo|   33|
|Central African Rep.|    3|
|            DR Congo|   52|
|          Montserrat|    4|
+--------------------+-----+
only showing top 20 rows



In [9]:
#select data by condition using where  clause
fifa_df.select("Name", "Club").where(fifa_df.Club.like("%Barcelona%")).show(truncate=False)

+---------------+------------+
|Name           |Club        |
+---------------+------------+
|L. Messi       |FC Barcelona|
|L. Suárez      |FC Barcelona|
|M. ter Stegen  |FC Barcelona|
|Sergio Busquets|FC Barcelona|
|Coutinho       |FC Barcelona|
|S. Umtiti      |FC Barcelona|
|Jordi Alba     |FC Barcelona|
|I. Rakitić     |FC Barcelona|
|Piqué          |FC Barcelona|
|A. Vidal       |FC Barcelona|
|O. Dembélé     |FC Barcelona|
|Sergi Roberto  |FC Barcelona|
|Arthur         |FC Barcelona|
|Malcom         |FC Barcelona|
|C. Lenglet     |FC Barcelona|
|Rafinha        |FC Barcelona|
|J. Cillessen   |FC Barcelona|
|Nélson Semedo  |FC Barcelona|
|Denis Suárez   |FC Barcelona|
|Munir          |FC Barcelona|
+---------------+------------+
only showing top 20 rows



In [10]:
#substr to extract a substring based on string index
#str    abcdef.png
#index  0123456789
#rev_id -10-9-8-7-6-5-4-3-2-1
#start at id =-3 mean 'p'
#len 3 mean 'png'
photo_ext = fifa_df.select("Photo", fifa_df.Photo.substr(-3,3).alias("File_extension"))
photo_ext.show()

+--------------------+--------------+
|               Photo|File_extension|
+--------------------+--------------+
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
|https://cdn.sofif...|           png|
+--------------------+--------------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import regexp_extract

# Define a function to extract the filename
def get_filename(url_col):
  """
  Extracts the filename from a URL string.

  Args:
    url_col: The column containing the URL strings.

  Returns:
    A new column containing the extracted filenames.
  """
  # Extract the part after the last forward slash (/)
  filename_pattern = r".*/([^/]+)$"
  return regexp_extract(url_col, filename_pattern, 1)

# Apply the function to extract filenames
df = fifa_df.withColumn("filename", get_filename(col("Photo")))

# Show the DataFrame with the new column
df.select("Photo", "filename").show(truncate=False)


+----------------------------------------------+----------+
|Photo                                         |filename  |
+----------------------------------------------+----------+
|https://cdn.sofifa.org/players/4/19/158023.png|158023.png|
|https://cdn.sofifa.org/players/4/19/20801.png |20801.png |
|https://cdn.sofifa.org/players/4/19/190871.png|190871.png|
|https://cdn.sofifa.org/players/4/19/193080.png|193080.png|
|https://cdn.sofifa.org/players/4/19/192985.png|192985.png|
|https://cdn.sofifa.org/players/4/19/183277.png|183277.png|
|https://cdn.sofifa.org/players/4/19/177003.png|177003.png|
|https://cdn.sofifa.org/players/4/19/176580.png|176580.png|
|https://cdn.sofifa.org/players/4/19/155862.png|155862.png|
|https://cdn.sofifa.org/players/4/19/200389.png|200389.png|
|https://cdn.sofifa.org/players/4/19/188545.png|188545.png|
|https://cdn.sofifa.org/players/4/19/182521.png|182521.png|
|https://cdn.sofifa.org/players/4/19/182493.png|182493.png|
|https://cdn.sofifa.org/players/4/19/168

In [13]:
#replace file extension
from pyspark.sql.functions import col, regexp_replace
df = df.withColumn(
    "filename_without_ext",
    regexp_replace(col("filename"), r"\.[^.]+$","")
)

# Show the DataFrame with both original and modified filenames
df.select("Photo", "filename", "filename_without_ext").show(truncate=False)

+----------------------------------------------+----------+--------------------+
|Photo                                         |filename  |filename_without_ext|
+----------------------------------------------+----------+--------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|158023.png|158023              |
|https://cdn.sofifa.org/players/4/19/20801.png |20801.png |20801               |
|https://cdn.sofifa.org/players/4/19/190871.png|190871.png|190871              |
|https://cdn.sofifa.org/players/4/19/193080.png|193080.png|193080              |
|https://cdn.sofifa.org/players/4/19/192985.png|192985.png|192985              |
|https://cdn.sofifa.org/players/4/19/183277.png|183277.png|183277              |
|https://cdn.sofifa.org/players/4/19/177003.png|177003.png|177003              |
|https://cdn.sofifa.org/players/4/19/176580.png|176580.png|176580              |
|https://cdn.sofifa.org/players/4/19/155862.png|155862.png|155862              |
|https://cdn.sofifa.org/play

In [20]:
#df1 = fifa_df.Club.isin("Barcelona", "Juventus")
df1 = fifa_df.select("Name", "Club").where(fifa_df.Club.isin(["Barcelona", "Juventus"]))
#df2 = fifa_df.Name.startswith("B").where(fifa_df.Name.endswith("a"))
df2 = fifa_df.select("Name").where(fifa_df.Name.startswith("B")).where(fifa_df.Name.endswith("a"))
df1.show()
df2.show()

+-----------------+--------+
|             Name|    Club|
+-----------------+--------+
|Cristiano Ronaldo|Juventus|
|        P. Dybala|Juventus|
|     G. Chiellini|Juventus|
|      Alex Sandro|Juventus|
|    Douglas Costa|Juventus|
|       L. Bonucci|Juventus|
|        M. Pjanić|Juventus|
|       M. Benatia|Juventus|
|      W. Szczęsny|Juventus|
|       S. Khedira|Juventus|
|       B. Matuidi|Juventus|
|         M. Perin|Juventus|
|      J. Cuadrado|Juventus|
|     M. Mandžukić|Juventus|
|      A. Barzagli|Juventus|
|  F. Bernardeschi|Juventus|
|        D. Rugani|Juventus|
|     João Cancelo|Juventus|
|           E. Can|Juventus|
|    M. De Sciglio|Juventus|
+-----------------+--------+
only showing top 20 rows

+----------------+
|            Name|
+----------------+
|  Bernardo Silva|
|          Bartra|
|           Bruma|
|    Borja García|
|     B. Espinosa|
|      B. Oczipka|
|     Bruno Viana|
|      Balenziaga|
|        Bustinza|
|   B. Santamaria|
|        B. Sagna|
|    Bruno V