# Install Spark on Google Colab


---

In [None]:
# Connecting Drive to Colab
from google.colab import drive
drive.mount('/content/drive')

# Setting up PySpark in Colab
# 1. First task is to download Java.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# 2. Install Apache Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz

!tar -xzvf /content/spark-3.4.0-bin-hadoop3.tgz

# 3. Install and that is the findspark library. It will locate Spark on the system and import it as a regular library.
!pip install -q findspark

# 4. set the environment path. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.0-bin-hadoop3"

# 5. We need to locate Spark in the system. For that, we import findspark and use the findspark.init() method.
import findspark
findspark.init('spark-3.4.0-bin-hadoop3')

# 6. Now, we can import SparkSession from pyspark.sql and create a SparkSession, which is the entry point to Spark.
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

spark

!rm /content/spark-3.4.0-bin-hadoop3.tgz

# ETL - API-Football

In [None]:
# Install packages
!pip install pandas
!pip install numpy
!pip install pprintpp
!pip install requests

In [5]:
# Import packages
import json
import os
import pprint

import pandas as pd
import requests

In [11]:
# docs
# https://rapidapi.com/api-sports/api/api-football/
# https://www.api-football.com/documentation-v3

API_HOST = "api-football-v1.p.rapidapi.com"
API_BASE_URL = f"https://{API_HOST}/v3"

# The RapidAPI key is a secret and must not be stored in the notebook.
# Provide it through the RAPIDAPI_KEY environment variable instead.
# NOTE(review): any key that was previously committed in this cell should be
# rotated in the RapidAPI dashboard.
api_key = os.environ.get("RAPIDAPI_KEY")
if not api_key:
    raise RuntimeError(
        "RAPIDAPI_KEY is not set; export your RapidAPI key before running this cell."
    )

headers = {
    "X-RapidAPI-Key": api_key,
    "X-RapidAPI-Host": API_HOST,
}


def fetch_and_save(url, out_path, request_headers=headers, timeout=30):
    """Download a JSON document from `url` and write it to `out_path`.

    Args:
        url: Full endpoint URL to GET.
        out_path: Path of the JSON file to (over)write.
        request_headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error body is never written to disk as if it were data.
    """
    reply = requests.get(url, headers=request_headers, timeout=timeout)
    reply.raise_for_status()
    payload = reply.json()
    with open(out_path, "w") as write_file:
        json.dump(payload, write_file)
    return payload


# Load leagues
url = f"{API_BASE_URL}/leagues"
response = fetch_and_save(url, "leagues.json")

# Load teams countries
url = f"{API_BASE_URL}/teams/countries"
response = fetch_and_save(url, "teams_countries.json")

In [13]:
# Read the saved leagues payload back from leagues.json
with open('leagues.json','r') as fh:
    leagues = json.load(fh)

# Keep only the id and name of each entry's nested 'league' object
league_list = [
    {
        'league_id': entry['league']['id'],
        'league_name': entry['league']['name'],
    }
    for entry in leagues['response']
]

league_df = pd.DataFrame(league_list)

league_df

Unnamed: 0,league_id,league_name
0,4,Euro Championship
1,21,Confederations Cup
2,61,Ligue 1
3,144,Jupiler Pro League
4,71,Serie A
...,...,...
995,965,AFC U20 Asian Cup
996,969,Primeira Divisão
997,970,CONMEBOL - U17
998,972,Super Cup


In [21]:
# Read the saved team-countries payload back from teams_countries.json
with open('teams_countries.json','r') as fh:
    t_countries = json.load(fh)

# One record per entry, keeping only its name and code
teams_countries_list = [
    {
        'country_name': entry['name'],
        'country_code': entry['code'],
    }
    for entry in t_countries['response']
]

t_country_df = pd.DataFrame(teams_countries_list)

t_country_df

Unnamed: 0,country_name,country_code
0,Afghanistan,
1,Albania,AL
2,Algeria,DZ
3,American-Samoa,
4,Andorra,AD
...,...,...
251,Wales,GB
252,Yemen,
253,Zambia,ZM
254,Zanzibar,
