In [1]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('DM-LR').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/12 08:44:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/12 08:44:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load data. 
data = spark.read.csv('BDAS_Iteration_Dataset/final_df.csv',inferSchema=True,header=True)

In [3]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Create StringIndexers for categorical predictor variables
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
marital_indexer = StringIndexer(inputCol="marital", outputCol="marital_index")
parenthood_indexer = StringIndexer(inputCol="parenthood", outputCol="parenthood_index")

# Create a StringIndexer for the target variable
target_indexer = StringIndexer(inputCol="predicted_category", outputCol="label")

# Assemble the predictor variables into a single features column
assembler = VectorAssembler(
    inputCols=["gender_index", "marital_index", "parenthood_index"],
    outputCol="features"
)

# Define a pipeline to execute the transformations
pipeline = Pipeline(stages=[gender_indexer, marital_indexer, parenthood_indexer, target_indexer, assembler])

# Fit and transform the data using the pipeline
model = pipeline.fit(data)
data = model.transform(data)

# Select the relevant columns for your machine learning model
final_data = data.select("label", "features")

# Show the first few rows of the final_data DataFrame
final_data.show()

                                                                                

+-----+-------------+
|label|     features|
+-----+-------------+
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|[1.0,0.0,0.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|    (3,[],[])|
|  0.0|[0.0,1.0,0.0]|
|  0.0|[0.0,1.0,1.0]|
|  0.0|[0.0,0.0,1.0]|
|  0.0|    (3,[],[])|
+-----+-------------+
only showing top 20 rows



In [8]:
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Split the data into training and testing sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Create a LogisticRegression model
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Fit the model on the training data
trained_model = lr.fit(train_data)
predictions = trained_model.transform(test_data)

# Create a binary evaluator for AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(predictions)

# Access the coefficients from the loaded model
coefficients = trained_model.coefficients
intercept = trained_model.intercept

# A list of feature names
feature_names = ["gender_index", "marital_index", "parenthood_index"]

# Create a Pandas DataFrame for coefficients and feature names
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Add an intercept row for the intercept term
intercept_row = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [intercept]})
coefficients_df = coefficients_df.append(intercept_row)

# Display the coefficients DataFrame
print(coefficients_df)
print('AUC', auc)

            Feature  Coefficient
0      gender_index     0.628168
1     marital_index     0.387329
2  parenthood_index     0.335476
0         Intercept    -2.007598
AUC 0.6521913510019819


  coefficients_df = coefficients_df.append(intercept_row)
