In [1]:
!pip install pyspark
!pip install python-dotenv
!pip install snowflake-connector-python

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Collecting snowflake-connector-python
  Downloading snowflake_connector_python-3.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.8/70.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting boto3>=1.24 (from snowflake-connector-python)
  Downloading boto3-1.38.12-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore>=1.24 (from snowflake-connector-python)
  Downloading botocore-1.38.12-py3-none-any.whl.metadata (5.7 kB)
Collecting tomlkit (from snowflake-connector-python)
  Downloading tomlkit-0.13.2-py3-non

In [2]:
# Step 1: Initialize variables

import snowflake.connector
from google.colab import userdata

# Replace the placeholders with your actual Snowflake credentials
conn = snowflake.connector.connect(
    user = userdata.get('SNOWFLAKE_USER'),
    password = userdata.get('SNOWFLAKE_PASSWORD'),
    account= userdata.get('SNOWFLAKE_ACCOUNT'),
    warehouse='COMPUTE_WH',
    database='BIGDATA_GITHUB',
    schema='RAW',
    role='ACCOUNTADMIN'
)

In [3]:
# Step 2: Create Snowflake stage for parquet file created by analytics

cur = conn.cursor()
url = 'azure://matthewleffler1.blob.core.windows.net/kaggle-datasets/clean_data/Influence_Top/'

try:
    cur.execute("BEGIN;")
    cur.execute(f"""
        CREATE OR REPLACE STAGE BIGDATA_GITHUB.ANALYTICS.azure_parquet_stage_Influence_Top
          URL = '{url}'
          CREDENTIALS = (
            AZURE_SAS_TOKEN = '{userdata.get('AZURE_SAS_TOKEN')}'
          )
          FILE_FORMAT = (TYPE = PARQUET);
        """)
    cur.execute("COMMIT;")
    print(f"Successfullt created stage.")
except Exception as e:
    cur.execute("ROLLBACK;")
    print(f"Error creating database object: {e}")
finally:
    cur.close()

Successfullt created stage.


In [4]:
# Step 3: Create Snowflake table defination based on parquet analytics

cur = conn.cursor()
table_name = "BIGDATA_GITHUB.ANALYTICS.Influence_Top_100"

try:
    cur.execute("BEGIN;")
    cur.execute(f"""
      CREATE OR REPLACE TABLE {table_name} (
      user_id          BIGINT,
      user_login       STRING,
      follower_count   BIGINT,
      total_stars      BIGINT,
      total_commits    BIGINT,
      total_forks      BIGINT,
      influence_score  FLOAT
      );
          """)
    cur.execute("COMMIT;")
    print("Table created successfully.")
except Exception as e:
    cur.execute("ROLLBACK;")
    print(f"Error creating database object: {e}")
finally:
    cur.close()

Table created successfully.


In [5]:
# Step 4: Load data into Snowflake table

cur = conn.cursor()

try:
    cur.execute("BEGIN;")
    cur.execute(f"""
      COPY INTO {table_name}
      FROM @azure_parquet_stage_Influence_Top
      FILE_FORMAT = (TYPE = PARQUET)
      MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
      ON_ERROR = CONTINUE;
          """)
    cur.execute("COMMIT;")
    print(f"Data loaded into {table_name} successfully.")
except Exception as e:
    cur.execute("ROLLBACK;")
    print(f"Error loading data: {e}")
finally:
    cur.close()

Data loaded into BIGDATA_GITHUB.ANALYTICS.Influence_Top_100 successfully.
