# Machine Learning Model for Therapist Ratings

This notebook builds and evaluates a Spark ML linear regression model that predicts therapist ratings from review sentiment, location, and specialty.

In [None]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark session
spark = SparkSession.builder.appName("TherapistRatingPrediction").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always released, even when loading, fitting, or saving fails.
try:
    # Load processed data
    data_path = "../data/processed/therapist_data.parquet"
    df = spark.read.parquet(data_path)

    # Display the schema of the data
    df.printSchema()

    # Data preprocessing: map the specialty strings to numeric indices.
    indexer = StringIndexer(inputCol="specialty", outputCol="specialtyIndex")
    model = indexer.fit(df)
    df = model.transform(df)

    # VectorAssembler only accepts numeric, boolean, and vector columns, so a
    # string-typed "location" must be indexed first. A numeric "location" is
    # used unchanged.
    # NOTE(review): the schema is not visible here -- confirm the actual type
    # of "location" against the printSchema() output above.
    location_feature = "location"
    if dict(df.dtypes).get("location") == "string":
        location_indexer = StringIndexer(
            inputCol="location", outputCol="locationIndex"
        )
        df = location_indexer.fit(df).transform(df)
        location_feature = "locationIndex"

    # Assemble features
    feature_columns = [location_feature, "specialtyIndex", "review_sentiment"]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    df = assembler.transform(df)

    # Split the data into training and test sets (fixed seed for repeatability)
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

    # Train the model
    lr = LinearRegression(featuresCol="features", labelCol="rating")
    lr_model = lr.fit(train_data)

    # Evaluate the model on the held-out split
    test_results = lr_model.evaluate(test_data)
    print(f"RMSE: {test_results.rootMeanSquaredError}")
    print(f"R2: {test_results.r2}")

    # Save the model. A plain save() raises if the path already exists, which
    # breaks every re-run of this notebook; overwrite() replaces the old model.
    model_path = "../models/therapist_rating_model"
    lr_model.write().overwrite().save(model_path)
finally:
    # Stop the Spark session
    spark.stop()