In [0]:
# Import necessary libraries  
import pandas as pd  
from pyspark.sql import SparkSession  
from pyspark.sql.types import StructType, StructField, StringType, IntegerType 

In [0]:
%sh
# Install Git only when it is not already available on the driver
if ! command -v git >/dev/null 2>&1; then
    apt-get install -y git
fi

# Clone the repository (replace with the URL of your repository).
# Skip the clone when a previous run already created the checkout, so this
# cell can be re-run without "destination path already exists" errors.
if [ ! -d /tmp/battery/.git ]; then
    git clone https://github.com/ignavinuales/Battery_RUL_Prediction /tmp/battery
fi

# Move to the directory the repository was cloned into and check its contents.
# The && ensures we never list some other directory if the cd fails.
cd /tmp/battery && ls


In [0]:
import os

# Directory inside the cloned repository that holds the processed HNEI CSV files.
hnei_dir = '/tmp/battery/Datasets/HNEI_Processed'

# Full paths of every file whose name contains "features". os.listdir returns
# entries in arbitrary order, so sort them to keep the row order of the
# combined frame reproducible from run to run.
feature_data = sorted(
    os.path.join(hnei_dir, x) for x in os.listdir(hnei_dir) if 'features' in x
)
if not feature_data:
    # Fail here with a clear message rather than further down the notebook,
    # where an empty frame would produce a much less obvious error.
    raise FileNotFoundError(f"No 'features' files found in {hnei_dir}")

# Initialize a SparkSession with Delta support
spark = (
    SparkSession.builder
    .appName("DeltaTableExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Read each file once and collect the frames; concatenating a single time at
# the end avoids re-copying the accumulated data on every loop iteration.
frames = []
for file in feature_data:
    print(file)
    new_df = pd.read_csv(file)
    # 'Unnamed: 0' is the index column that was written into the CSV.
    new_df = new_df.drop(columns=['Unnamed: 0'])
    # Remember which file each row came from.
    new_df['Source'] = os.path.basename(file)
    frames.append(new_df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(frames, ignore_index=True)
    

def remove_non_alphanumeric_chars(input_str: str) -> str:
    """Turn a label into one made only of alphanumerics and underscores.

    Space characters (' ') are first converted to underscores; after that,
    every character that is neither alphanumeric (per ``str.isalnum``) nor an
    underscore is dropped. Other whitespace such as tabs is therefore removed,
    not converted.

    Args:
        input_str: The text to sanitize, e.g. a raw column name.

    Returns:
        The sanitized string (possibly empty).
    """
    underscored = input_str.replace(' ', '_')

    kept = []
    for ch in underscored:
        if ch == '_' or ch.isalnum():
            kept.append(ch)

    return ''.join(kept)

# Map every existing column label to its sanitized form (spaces become
# underscores, other non-alphanumeric characters are removed).
new_cols = {col: remove_non_alphanumeric_chars(col) for col in df.columns}

# Apply the mapping and show the resulting frame as the cell output.
df = df.rename(columns=new_cols)
df

In [0]:
# Convert the combined pandas frame into a Spark DataFrame so it can be
# written out with the Spark writers further down.
spark_df = spark.createDataFrame(data=df)

# Show the first row as a quick sanity check of the conversion.
spark_df.head()


In [0]:
# Set up the configuration to access your Azure Data Lake Storage Gen2 account.
# Fill in the placeholders below; the identifiers are not sensitive, but the
# client secret is, so it is read from a Databricks secret scope instead of
# being pasted into the notebook source.
storage_account = "<YOUR-ACCOUNT>"
tenant_id = "<TENANT-ID>"
client_id = "<CLIENT-ID>"
client_secret = dbutils.secrets.get(scope="<SECRET-SCOPE>", key="<CLIENT-SECRET-KEY>")

# Every setting is scoped to this storage account's DFS endpoint.
account_host = f"{storage_account}.dfs.core.windows.net"

# OAuth (service principal / client credentials) settings for the ABFS driver.
oauth_settings = {
    f"fs.azure.account.auth.type.{account_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{account_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)


In [0]:
# Define the path where the Delta table will be stored in Azure Data Lake Storage Gen2
delta_table_path = "abfss://mldata@<YOUR-ACCOUNT>.dfs.core.windows.net/battery_cycle_rul_data"

# Write the Spark DataFrame as a Delta table to Azure Data Lake Storage Gen2.
# mode("overwrite") replaces whatever data a previous run stored at this path.
spark_df.write.format("delta").mode("overwrite").save(delta_table_path)

# Register the Delta table in the metastore so it can be queried by name.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second run fails
# because my_delta_table is already registered.
spark.sql(f"CREATE TABLE IF NOT EXISTS my_delta_table USING DELTA LOCATION '{delta_table_path}'")


In [0]:
# Read the registered table back by name and render it with the notebook's
# display helper.
select_all_query = "SELECT * FROM my_delta_table"
hold_df = spark.sql(select_all_query)
display(hold_df)