In [None]:
!pip install google-cloud-storage
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from google.cloud import storage
import pandas as pd

: 

In [None]:
def download_csv(csv_name):
    """Download one or more CSV files from the project bucket into ``../data``.

    Every file is read from ``data/<name>`` in the ``de2022_f1_jmc`` bucket and
    written to ``../data/<name>`` on the local file system.

    Args:
        csv_name: A single file name, or a list/tuple of file names.
    """
    bucket_path = "de2022_f1_jmc"
    client = storage.Client()
    bucket = client.bucket(bucket_path)
    # Wrap a single name in a list so both call styles share one download loop
    # and the blob path and the destination path are always built from the
    # same name.
    names = csv_name if isinstance(csv_name, (list, tuple)) else [csv_name]
    for name in names:
        bucket.blob(f'data/{name}').download_to_filename(f"../data/{name}")

# Fetch the four input CSV files into ../data before anything reads them.
download_csv([
    "races.csv",
    "drivers.csv",
    "results.csv",
    "F1_tweets.csv",
])

# Local paths of the downloaded files.
f1_races_path = "../data/races.csv"
# NOTE(review): "divers" looks like a typo for "drivers"; the name is kept
# because the data-loading code reads this exact variable.
f1_divers_path = "../data/drivers.csv"
f1_score_path = "../data/results.csv"
f1_tweet_data_path = "../data/F1_tweets.csv"

In [None]:
# Build (or reuse) the Spark session for this notebook, running on a local master.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .appName("app")
    .getOrCreate()
)

def get_data(file_name):
    """Read the CSV at ``file_name`` through the notebook's Spark session.

    The reader is called with ``header='true'`` and its result is returned
    unchanged.
    """
    reader = spark.read
    return reader.csv(file_name, header='true')

# Load the three source tables; get_data reads each CSV with its header row.
races = get_data(f1_races_path)
scores = get_data(f1_score_path)
drivers = get_data(f1_divers_path)

# One row per 2021 result with position 1: race id and name, driver surname,
# the race date (to_date) and the date of the race whose id is one lower
# (from_date).
df = (
    scores
    .join(races.alias("races"), scores.raceId == races.raceId, "left")
    .join(drivers, scores.driverId == drivers.driverId, "left")
    # NOTE(review): the "previous" race is matched on raceId - 1, which assumes
    # race ids are consecutive in calendar order -- TODO confirm against the data.
    .join(races.alias("prevRace"), col("prevRace.raceId") == scores.raceId - 1, "left")
    .filter(col("races.year") == 2021)
    .filter(scores.position == 1)
    # Sort only after the joins and filters: later joins need not preserve an
    # ordering applied before them, and sorting here touches only the kept rows.
    .orderBy(col("races.raceId"))
    .select(
        scores.raceId,
        col("races.name"),
        drivers.surname,
        to_date(col("races.date"), "yyyy-MM-dd").alias("to_date"),
        to_date(col("prevRace.date"), "yyyy-MM-dd").alias("from_date"),
    )
)


In [None]:
# Cloud Storage bucket the connector uses for temporary BigQuery export data.
bucket = "de_jads_temp-401"
spark.conf.set('temporaryGcsBucket', bucket)

# Point the fs.gs.* Hadoop settings at the GCS connector classes.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# Write df to the racesSchedule table in overwrite mode.
(
    df.write
    .format('bigquery')
    .option('table', 'de2022-362622.assignmentDatasets.racesSchedule')
    .mode("overwrite")
    .save()
)