In [None]:
import os
from datetime import datetime

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# import dotenv
# import os
# dotenv.load_dotenv()

# GitHub account whose followers are collected, and the token used to authenticate.
# Both are read from the environment so real credentials never have to be typed
# into the notebook; the placeholder defaults are used when the variables are unset.
GITHUB_USER = os.environ.get('GITHUB_USER', 'USER_AQU')
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', 'SEU_TOKEN_AQUI')
# Authorization header sent with every GitHub API request made by this notebook.
HEADERS = { "Authorization": f"token {GITHUB_TOKEN}"}

# Reuse the active Spark session if there is one; otherwise create one named "App".
spark = SparkSession.builder \
    .appName("App") \
    .getOrCreate()

In [None]:

def clean_company(company:str) -> str:
    """
    Strip a single leading '@' from a company name.

    Falsy values (None or an empty string) and names that do not start
    with '@' are returned unchanged.
    """
    has_at_prefix = bool(company) and company.startswith('@')
    return company[1:] if has_at_prefix else company

def transform_date(date_str:str) -> str:
    """
    Convert a timestamp string from 'YYYY-MM-DDTHH:MM:SSZ' to 'DD/MM/YYYY'.

    Args:
        date_str (str): Timestamp in the format 'YYYY-MM-DDTHH:MM:SSZ'.

    Returns:
        str: The date formatted as 'DD/MM/YYYY', or None when date_str is
            None or empty.

    Raises:
        ValueError: If date_str is non-empty but does not match the expected format.
    """
    # A missing timestamp is passed through as None rather than letting
    # strptime fail with a TypeError.
    if not date_str:
        return None
    return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').strftime('%d/%m/%Y')

def get_follower_details(followers:list, timeout:float = 10) -> list:
    """
    Fetch the profile of each follower and return the selected fields as dictionaries.

    A follower whose request fails (network error, timeout or a non-200 status)
    is reported with a printed message and skipped, so a single bad request does
    not abort the whole run.

    Args:
        followers (list): Dictionaries describing followers; each must contain a
            'url' key holding the URL requested for that follower's details.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        list: One dictionary per successfully fetched follower, with the keys
            'name', 'company', 'blog', 'email', 'bio', 'public_repos',
            'followers', 'following' and 'created_at'.
    """
    user_data = []
    for follower in followers:
        user_url = follower['url']
        try:
            # Without a timeout, requests waits indefinitely on a stalled connection.
            response = requests.get(user_url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Erro request em follower: {user_url} ({exc})")
            continue

        if response.status_code != 200:
            print(f"Erro request em follower: {user_url} (status {response.status_code})")
            continue

        user_info = response.json()
        print(f"Request follower: {user_info.get('name')}")
        user_data.append({
            'name': user_info.get('name'),
            # Company names may carry a leading '@'; store them without it.
            'company': clean_company(user_info.get('company')),
            'blog': user_info.get('blog'),
            'email': user_info.get('email'),
            'bio': user_info.get('bio'),
            'public_repos': user_info.get('public_repos'),
            'followers': user_info.get('followers'),
            'following': user_info.get('following'),
            # Stored as a 'DD/MM/YYYY' string.
            'created_at': transform_date(user_info.get('created_at'))
        })
    return user_data

In [None]:

# Request 1: list the followers of GITHUB_USER.
followers_url = f"https://api.github.com/users/{GITHUB_USER}/followers"

# The endpoint is paginated (30 entries per page by default), so ask for the
# largest page size and keep following the "next" link until none is left.
followers = []
page_url = followers_url
page_params = {"per_page": 100}
while page_url:
    response = requests.get(page_url, headers=HEADERS, params=page_params, timeout=10)
    if response.status_code != 200:
        # Stop with a clear message instead of leaving spark_df undefined for later cells.
        raise RuntimeError(f"Erro ao acessar a API: {response.status_code}")
    followers.extend(response.json())
    page_url = response.links.get("next", {}).get("url")
    page_params = None  # the "next" URL already carries its own query string

print(f"Request feito com sucesso em {GITHUB_USER}")

# Request 2: fetch the profile details of every follower.
user_data = get_follower_details(followers)

# Schema of the followers DataFrame; every column is nullable.
schema = StructType([
    StructField('name', StringType(), True),
    StructField('company', StringType(), True),
    StructField('blog', StringType(), True),
    StructField('email', StringType(), True),
    StructField('bio', StringType(), True),
    StructField('public_repos', IntegerType(), True),
    StructField('followers', IntegerType(), True),
    StructField('following', IntegerType(), True),
    StructField('created_at', StringType(), True)
])

spark_df = spark.createDataFrame(user_data, schema=schema)

## Salvando arquivo CSV

In [None]:
# Output path for the followers CSV export.
file_path = "/FileStore/tables/github_followers.csv"
# Overwrite any previous export so this cell can be re-run; the default save
# mode raises an error when the path already exists.
spark_df.write.mode("overwrite").csv(file_path, header=True)

## Lendo arquivo CSV

In [None]:
# Read the exported CSV back. multiLine=True keeps quoted fields that contain
# line breaks (e.g. a multi-line bio) in one record instead of splitting the row.
df = spark.read.csv(file_path, header=True, inferSchema=True, multiLine=True)
df.show()