In [2]:
!pip install pyspark
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [3]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

In [4]:
# Initialize connection settings.
#
# Secrets are read from the Colab secret store when running inside Colab.
# Anywhere else they fall back to environment variables, which load_dotenv()
# populates from azure_connection.env when that file is present.
load_dotenv("azure_connection.env")

try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab: rely on the environment / .env file instead.
    userdata = None


def _get_secret(name):
    """Return the secret called `name`.

    Looks the value up in Colab's userdata store when available, otherwise in
    the process environment.

    Raises:
        RuntimeError: if the value is missing or empty, so the failure is
            reported here rather than as a storage authentication error later.
    """
    value = userdata.get(name) if userdata is not None else os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Missing required secret {name!r}: define it as a Colab secret "
            "or in azure_connection.env"
        )
    return value


storage_account_name = _get_secret('AZURE_ACCOUNT_NAME')
storage_account_key = _get_secret('AZURE_STORAGE_KEY')
storage_container_name = "kaggle-datasets"
parquet_blob_name = "github-dataset-full.parquet"

In [5]:
# Build the Spark session with the storage account key and the Azure
# connector packages supplied as session configuration.
account_key_conf = (
    f"spark.hadoop.fs.azure.account.key.{storage_account_name}"
    ".blob.core.windows.net"
)
azure_packages = ",".join([
    "org.apache.hadoop:hadoop-azure:3.3.2",
    "com.microsoft.azure:azure-storage:8.6.6",
])

spark = (
    SparkSession.builder
    .appName("Read Parquet from Azure Blob Storage")
    .config(account_key_conf, storage_account_key)
    .config("spark.jars.packages", azure_packages)
    .getOrCreate()
)

# Restrict Spark logging to ERROR so cell output is not flooded with log lines
spark.sparkContext.setLogLevel("ERROR")

In [6]:
# Step 3 (optional): also set the storage account key on the running session,
# in case it was not already applied when the session was built.
account_key_setting = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(account_key_setting, storage_account_key)

In [13]:
# Step 4: Build the wasbs:// URL of the source Parquet dataset
# (container@account-host, followed by the blob name).
blob_host = f"{storage_account_name}.blob.core.windows.net"
parquet_path = f"wasbs://{storage_container_name}@{blob_host}/{parquet_blob_name}"

In [14]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType

# Explicit schema for the source dataset. Every field is declared nullable;
# the (name, type) pairs below are listed in the schema's column order.

# Element struct of the "commit_list" array
commit_fields = [
    ("author_id", LongType()),
    ("commit_at", StringType()),
    ("committer_id", LongType()),
    ("generate_at", StringType()),
    ("message", StringType()),
    ("repo_description", StringType()),
    ("repo_id", LongType()),
    ("repo_name", StringType()),
    ("repo_owner_id", LongType()),
]
commit_list_type = ArrayType(
    StructType([StructField(name, dtype, True) for name, dtype in commit_fields])
)

# Element struct of the "repo_list" array
repo_fields = [
    ("created_at", StringType()),
    ("default_branch", StringType()),
    ("description", StringType()),
    ("fork", BooleanType()),
    ("forks_count", LongType()),
    ("full_name", StringType()),
    ("has_wiki", BooleanType()),
    ("id", LongType()),
    ("language", StringType()),
    ("license", StringType()),
    ("open_issues", LongType()),
    ("owner_id", LongType()),
    ("pushed_at", StringType()),
    ("size", LongType()),
    ("stargazers_count", LongType()),
    ("updated_at", StringType()),
]
repo_list_type = ArrayType(
    StructType([StructField(name, dtype, True) for name, dtype in repo_fields])
)

# Top-level columns
top_level_fields = [
    ("bio", StringType()),
    ("blog", StringType()),
    ("commit_list", commit_list_type),
    ("commits", LongType()),
    ("company", StringType()),
    ("created_at", StringType()),
    ("email", StringType()),
    ("follower_list", ArrayType(LongType(), True)),
    ("followers", LongType()),
    ("following", LongType()),
    ("following_list", ArrayType(LongType(), True)),
    ("hirable", BooleanType()),
    ("id", LongType()),
    ("is_suspicious", BooleanType()),
    ("location", StringType()),
    ("login", StringType()),
    ("name", StringType()),
    ("public_gists", LongType()),
    ("public_repos", LongType()),
    ("repo_list", repo_list_type),
    ("type", StringType()),
    ("updated_at", StringType()),
]

schema = StructType(
    [StructField(name, dtype, True) for name, dtype in top_level_fields]
)

In [15]:

# Step 5: Load the Parquet dataset using the explicit schema defined above
source_reader = spark.read.schema(schema)
df = source_reader.parquet(parquet_path)

In [11]:
# Step 6: Preview the structure of df — prints each column's name, type and
# nullability, including the fields nested inside the array columns.
df.printSchema()

root
 |-- bio: string (nullable = true)
 |-- blog: string (nullable = true)
 |-- commit_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: long (nullable = true)
 |    |    |-- commit_at: string (nullable = true)
 |    |    |-- committer_id: long (nullable = true)
 |    |    |-- generate_at: string (nullable = true)
 |    |    |-- message: string (nullable = true)
 |    |    |-- repo_description: string (nullable = true)
 |    |    |-- repo_id: long (nullable = true)
 |    |    |-- repo_name: string (nullable = true)
 |    |    |-- repo_owner_id: long (nullable = true)
 |-- commits: long (nullable = true)
 |-- company: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- email: string (nullable = true)
 |-- follower_list: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- followers: long (nullable = true)
 |-- following: long (nullable = true)
 |-- following_list: array (nullable = true)
 | 

In [16]:
# Display the first 10 rows of df as a text table
df.show(10)

+----+----------------+--------------------+-------+----------+-------------------+-----+--------------------+---------+---------+--------------------+-------+--------+-------------+--------+--------------------+-----------+------------+------------+--------------------+----+-------------------+
| bio|            blog|         commit_list|commits|   company|         created_at|email|       follower_list|followers|following|      following_list|hirable|      id|is_suspicious|location|               login|       name|public_gists|public_repos|           repo_list|type|         updated_at|
+----+----------------+--------------------+-------+----------+-------------------+-----+--------------------+---------+---------+--------------------+-------+--------+-------------+--------+--------------------+-----------+------------+------------+--------------------+----+-------------------+
|NULL|                |                NULL|   NULL|      NULL|2015-09-20 19:52:29| NULL|                NULL

In [14]:
# Count the rows in the source DataFrame
df.count()

10649574

In [17]:
# Drop the nested list columns; these will be handled as separate objects.
columns_to_drop = ["commit_list", "repo_list", "follower_list", "following_list"]
GitMainDF = df.drop(*columns_to_drop)

# Inspect the remaining flat columns and a 10-row sample
GitMainDF.printSchema()
GitMainDF.show(n=10)

root
 |-- bio: string (nullable = true)
 |-- blog: string (nullable = true)
 |-- commits: long (nullable = true)
 |-- company: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- email: string (nullable = true)
 |-- followers: long (nullable = true)
 |-- following: long (nullable = true)
 |-- hirable: boolean (nullable = true)
 |-- id: long (nullable = true)
 |-- is_suspicious: boolean (nullable = true)
 |-- location: string (nullable = true)
 |-- login: string (nullable = true)
 |-- name: string (nullable = true)
 |-- public_gists: long (nullable = true)
 |-- public_repos: long (nullable = true)
 |-- type: string (nullable = true)
 |-- updated_at: string (nullable = true)

+----+----------------+-------+----------+-------------------+-----+---------+---------+-------+--------+-------------+--------+--------------------+-----------+------------+------------+----+-------------------+
| bio|            blog|commits|   company|         created_at|email|followers|follow

In [18]:
# Cast every column to its target type (types chosen from validated sample data)
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, TimestampType

# (column, target type) pairs, listed in the output column order
cast_targets = [
    ("hirable", "boolean"),
    ("public_repos", "int"),
    ("is_suspicious", "boolean"),
    ("updated_at", "timestamp"),
    ("id", "long"),
    ("blog", "string"),
    ("followers", "long"),
    ("location", "string"),
    ("type", "string"),
    ("bio", "string"),
    ("commits", "long"),
    ("company", "string"),
    ("public_gists", "int"),
    ("name", "string"),
    ("created_at", "timestamp"),
    ("email", "string"),
    ("following", "long"),
    ("login", "string"),
]

df_casted_GitMain = GitMainDF.select(
    *[F.col(column_name).cast(target_type) for column_name, target_type in cast_targets]
)

# Keep only rows that have an id, then keep a single row per id
rows_with_id = df_casted_GitMain.filter(F.col("id").isNotNull())
GitMain_Casted_DF = rows_with_id.dropDuplicates(["id"])

In [19]:
GitMain_Casted_DF.printSchema()

root
 |-- hirable: boolean (nullable = true)
 |-- public_repos: integer (nullable = true)
 |-- is_suspicious: boolean (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- blog: string (nullable = true)
 |-- followers: long (nullable = true)
 |-- location: string (nullable = true)
 |-- type: string (nullable = true)
 |-- bio: string (nullable = true)
 |-- commits: long (nullable = true)
 |-- company: string (nullable = true)
 |-- public_gists: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- email: string (nullable = true)
 |-- following: long (nullable = true)
 |-- login: string (nullable = true)



In [9]:
# Write the non-nested fields to their own Parquet dataset, overwriting any
# existing output at the same path.
main_parquet_name = "github-dataset-main.parquet"
output_parquet_path = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    f".blob.core.windows.net/{main_parquet_name}"
)

main_writer = GitMain_Casted_DF.write.mode("overwrite")
main_writer.parquet(output_parquet_path)

NameError: name 'GitMain_Casted_DF' is not defined

In [20]:
# Read the new Parquet file back and preview a sample of its rows

from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, BooleanType, TimestampType

# (column, type) pairs in column order; every field is declared nullable
casted_fields = [
    ("hirable", BooleanType()),
    ("public_repos", IntegerType()),
    ("is_suspicious", BooleanType()),
    ("updated_at", TimestampType()),
    ("id", LongType()),
    ("blog", StringType()),
    ("followers", LongType()),
    ("location", StringType()),
    ("type", StringType()),
    ("bio", StringType()),
    ("commits", LongType()),
    ("company", StringType()),
    ("public_gists", IntegerType()),
    ("name", StringType()),
    ("created_at", TimestampType()),
    ("email", StringType()),
    ("following", LongType()),
    ("login", StringType()),
]
casted_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in casted_fields]
)

main_reader = spark.read.schema(casted_schema)
df_read = main_reader.parquet(output_parquet_path)

df_read.printSchema()
df_read.show(n=10)

root
 |-- hirable: boolean (nullable = true)
 |-- public_repos: integer (nullable = true)
 |-- is_suspicious: boolean (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- blog: string (nullable = true)
 |-- followers: long (nullable = true)
 |-- location: string (nullable = true)
 |-- type: string (nullable = true)
 |-- bio: string (nullable = true)
 |-- commits: long (nullable = true)
 |-- company: string (nullable = true)
 |-- public_gists: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- email: string (nullable = true)
 |-- following: long (nullable = true)
 |-- login: string (nullable = true)

+-------+------------+-------------+-------------------+---+--------------------+---------+--------------------+----+--------------------+-------+----------------+------------+-----------------+-------------------+--------------------+---------+---------+
|hirable|public_repos|is_susp

In [26]:
# Count the rows in the re-read dataset (compare against the source df.count())
df_read.count()

10649574