In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# Start a Spark session for this notebook, or reuse the one already
# active in the kernel (getOrCreate makes re-running this cell safe).
spark = (
    SparkSession.builder
    .appName("CreditCardFraudDetection")
    .getOrCreate()
)

# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as string.
df = spark.read.csv(
    "creditcard.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Discard every row that has a null in any column.
df = df.na.drop()

# Every column other than 'Class' is fed to the model as a feature.
feature_cols = [name for name in df.columns if name != 'Class']

# Stage 1: pack the individual feature columns into one vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_unscaled",
)

# Stage 2: standardise that vector (subtract the mean, divide by the
# standard deviation) and write the result to "features".
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withMean=True,
    withStd=True,
)

pipeline = Pipeline(stages=[assembler, scaler])

In [None]:
# Fit the assemble-and-scale pipeline on df, then apply the fitted
# stages to that same frame.
df_transformed = (
    pipeline
    .fit(df)
    .transform(df)
)

# Keep only the scaled feature vector and the Class column, exposing the
# latter under the name "label".
final_df = df_transformed.select(
    col("features"),
    col("Class").alias("label"),
)

# Collect the selected columns to the driver as a pandas DataFrame.
pandas_df = final_df.toPandas()