In [1]:
# Initialise findspark with the local Spark installation before pyspark is
# imported, then create (or reuse) the notebook's SparkSession.
import findspark

SPARK_INSTALL_DIR = '/home/ubuntu/spark-3.2.1-bin-hadoop2.7'
findspark.init(SPARK_INSTALL_DIR)

import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('DM-GBT')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/12 02:33:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.classification import GBTClassificationModel

# Read the dataset from CSV: the first row is treated as the header and
# column types are inferred from the values.
data = spark.read.csv(
    'BDAS_Iteration_Dataset/final_df.csv',
    header=True,
    inferSchema=True,
)

# Load the saved Gradient Boosted Trees classification model from disk.
trained_GBT_model = GBTClassificationModel.load("Gradient Boosted Trees_model")

In [3]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Create StringIndexers for the categorical predictor variables; each writes
# its numeric index into a "<column>_index" column.
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
marital_indexer = StringIndexer(inputCol="marital", outputCol="marital_index")
parenthood_indexer = StringIndexer(inputCol="parenthood", outputCol="parenthood_index")

# Create a StringIndexer for the target variable (written to "label").
target_indexer = StringIndexer(inputCol="predicted_category", outputCol="label")

# Assemble the indexed predictor variables into a single "features" column.
assembler = VectorAssembler(
    inputCols=["gender_index", "marital_index", "parenthood_index"],
    outputCol="features"
)

# Define a pipeline to execute the transformations in order.
pipeline = Pipeline(stages=[gender_indexer, marital_indexer, parenthood_indexer, target_indexer, assembler])

# `data` is rebound to the transformed frame below, so when this cell is
# re-run it already contains the pipeline's output columns and
# StringIndexer refuses to fit ("Output column ... already exists").
# Dropping those columns first makes the cell safe to re-run; DataFrame.drop
# ignores names that are not present, so nothing is removed on the first run
# (assuming the raw CSV has no columns with these generated names).
pipeline_output_cols = [
    gender_indexer.getOutputCol(),
    marital_indexer.getOutputCol(),
    parenthood_indexer.getOutputCol(),
    target_indexer.getOutputCol(),
    assembler.getOutputCol(),
]
source_data = data.drop(*pipeline_output_cols)

# Fit the pipeline and apply it. `data` keeps its original meaning for the
# rest of the notebook: the input frame plus the generated columns.
model = pipeline.fit(source_data)
data = model.transform(source_data)

# Select the relevant columns for the machine learning model.
final_data = data.select("label", "features")

# Show the first few rows of the final_data DataFrame.
final_data.show()

+-----+-------------+
|label|     features|
+-----+-------------+
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|[1.0,0.0,0.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|[0.0,1.0,0.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,0.0,1.0]|
|  0.0|    (3,[],[])|
+-----+-------------+
only showing top 20 rows



In [9]:
# The importances are tabulated with pandas.
import pandas as pd
from pyspark.ml.feature import VectorAssembler

# Locate the VectorAssembler stage by type instead of assuming it is the
# last stage, so adding or reordering pipeline stages cannot silently pick
# the wrong one.
assembler_stage = next(
    (stage for stage in model.stages if isinstance(stage, VectorAssembler)),
    None,
)
if assembler_stage is None:
    raise ValueError("No VectorAssembler stage found in the fitted pipeline")
feature_names = assembler_stage.getInputCols()

# Get feature importances from the trained GBT model.
# NOTE(review): the model is loaded from disk while the feature names come
# from the pipeline fitted in this notebook; the pairing below is only
# meaningful if the model was trained on features assembled in this same
# order -- confirm against the training notebook.
feature_importance = trained_GBT_model.featureImportances
importance_values = feature_importance.toArray()

# Fail with a clear message if the two sides cannot be paired one-to-one,
# rather than with pandas' generic length-mismatch error.
if len(feature_names) != len(importance_values):
    raise ValueError(
        "Feature count mismatch: pipeline assembles "
        f"{len(feature_names)} columns but the model reports "
        f"{len(importance_values)} importances"
    )

# Create a Pandas DataFrame for feature importances and feature names.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_values
})

# Sort by feature importance, most important first.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importances.
print(feature_importance_df)

            Feature  Importance
1     marital_index    0.367624
0      gender_index    0.345033
2  parenthood_index    0.287343
