In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=fa49b1736b351f79d9348ed0a72ae09aacc09d28378212f04c1a60cf1fb6cda0
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# Mount Google Drive at /gdrive; the later cells read and write files under
# '/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import json
import numpy as np
import pickle

# Load the trained Random Forest model and the scaler applied to its features.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open('/gdrive/My Drive/trip_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

with open('/gdrive/My Drive/scaler_model.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Load station data from JSON file (iterated below as a sequence of records)
data_file_path = '/gdrive/My Drive/station_data.json'
with open(data_file_path, 'r', encoding='utf-8') as data_file:
    station_data = json.load(data_file)

# Build the whole feature matrix up front, one [day_of_week, hour, local_id]
# row per record, so the scaler and the model are each invoked once instead
# of once per record.
feature_rows = [
    [record['day_of_week'], record['hour'], record['local_id']]
    for record in station_data
]

# Skip the transform/predict calls when there are no records to score.
if feature_rows:
    scaled_rows = scaler.transform(feature_rows)

    # .tolist() turns numpy scalars into plain Python numbers; json.dump
    # cannot serialise numpy types such as int64 or float32.
    predictions = np.asarray(rf_model.predict(scaled_rows)).tolist()

    # Attach each prediction to the record it was computed from
    for record, predicted_traffic in zip(station_data, predictions):
        record['predicted_traffic'] = predicted_traffic

# Save the updated data to a new JSON file
output_file_path = '/gdrive/My Drive/station_data_with_predictions.json'
with open(output_file_path, 'w') as output_file:
    json.dump(station_data, output_file)

print(f'Predictions added to {output_file_path}')




Predictions added to /gdrive/My Drive/station_data_with_predictions.json




In [None]:
from pyspark.sql import SparkSession
import time

def process_batch():
    """Rank stations by maintenance priority and write the ranking out as JSON.

    Reads the multiline JSON file at the module-level ``json_file_path`` through
    the module-level ``spark`` session, adds a ``maintenance_priority`` column
    (disabled docks plus disabled vehicles, divided by available docks plus
    available vehicles), sorts by that column in descending order, registers
    the result as the temp view ``station`` and overwrites
    ``/gdrive/My Drive/output.json`` with it.
    """
    stations = spark.read.option("multiline", "true").json(json_file_path)

    disabled_total = stations["num_docks_disabled"] + stations["num_vehicles_disabled"]
    available_total = stations["num_docks_available"] + stations["num_vehicles_available"]

    # NOTE(review): when available_total is 0 the ratio is presumably null under
    # Spark's default settings, and such rows would sort after the non-null
    # ones — confirm this is the intended ranking for those stations.
    ranked = stations.withColumn("maintenance_priority", disabled_total / available_total)

    # Highest priority first
    ranked = ranked.orderBy(ranked["maintenance_priority"].desc())

    # Expose the ranked stations to SQL queries under the name "station"
    ranked.createOrReplaceTempView("station")

    output_path = "/gdrive/My Drive/output.json"
    try:
        # Replace any output left by a previous batch
        ranked.write.mode("overwrite").json(output_path)
    finally:
        # Always runs, even when the write fails
        ranked.unpersist()

# Spark session shared with process_batch()
spark = (
    SparkSession.builder
    .appName("BatchApp")
    .getOrCreate()
)

# Input file that process_batch() reads on every run
json_file_path = "/gdrive/My Drive/station_data_with_predictions.json"

# Pause, in seconds, between the end of one batch and the start of the next
interval_seconds = 15

try:
    # Loop until the cell is interrupted
    while True:
        started_at = time.time()
        process_batch()
        elapsed = time.time() - started_at
        print(f"Time taken for iteration: {elapsed:.2f} seconds")

        # Sleep for the full interval regardless of how long the batch took
        time.sleep(interval_seconds)
except KeyboardInterrupt:
    # Raised when the user interrupts the cell (e.g. Ctrl+C)
    print("Stopping the application")
finally:
    # Stop the session whether the loop ended by interrupt or by an error
    spark.stop()


Time taken for iteration: 14.81 seconds
Time taken for iteration: 1.59 seconds
Time taken for iteration: 1.76 seconds
Stopping the application


In [None]:
# Look up and print the "spark.master" setting of the `spark` session.
# NOTE(review): the batch cell calls spark.stop() in its finally block, so this
# presumably has to run against a live session — confirm the intended cell order.
spark_master = spark.conf.get("spark.master")
print(spark_master)

local[*]
