### Python Script Description

In [18]:
import os
import sys
import subprocess
import re
import pandas as pd

# Source CSV. It must carry a header row naming at least the two columns
# looked up below.
input_path = './datasets/csv_datasets/test001/part-00000-3e9d02f3-2f5d-49cb-bd84-2103a4259e9d-c000.csv'

df = pd.read_csv(input_path, low_memory=False)

directory_path = os.path.dirname(input_path)
file_name = os.path.basename(input_path)

# os.path.join keeps the output next to the input even when the input path has
# no directory component (a '/'-joined f-string would then point at the
# filesystem root).
no_header_input_path = os.path.join(directory_path, f'NOHEADER_{file_name}')

# Fail early with an actionable message instead of a bare KeyError when the
# file's header does not contain the expected column names.
required_columns = ("trip_distance", "total_amount")
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in {input_path}; "
        f"available columns: {list(df.columns)}"
    )

# Zero-based positions of the two columns within the original frame.
x_column = df.columns.get_loc("trip_distance")
y_column = df.columns.get_loc("total_amount")

print(x_column)
print(y_column)

# Write a copy without the header row. index=True emits the row index as the
# first field, so in the written file every data column sits one field to the
# right of the positions printed above.
# NOTE(review): confirm the consumer of this file expects that offset.
df.to_csv(no_header_input_path, index=True, header=False)

# header=None is required here: the file just written has no header row, and
# without it pandas would promote the first data record to column names and
# silently drop it from the data.
df = pd.read_csv(no_header_input_path, header=None, low_memory=False)


KeyError: 'trip_distance'

In [1]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
from IPython.display import display

def _save_scatter(frame, title, color, output_path):
    """Save a trip_distance vs total_amount scatter plot of `frame` as an image.

    Parameters:
        frame: pandas DataFrame with 'trip_distance' and 'total_amount' columns.
        title: text placed above the plot.
        color: matplotlib color used for the points.
        output_path: file the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(frame['trip_distance'], frame['total_amount'], alpha=0.5, color=color)
    ax.set_title(title)
    ax.set_xlabel('Trip Distance (miles)')
    ax.set_ylabel('Total Amount ($)')
    ax.grid(True)
    fig.savefig(output_path)  # Save the plot as a PNG file
    plt.close(fig)  # Close the figure to free memory


# Initialize a Spark session
# local[*] = run local with as many threads as cores available
spark = (
    SparkSession.builder
    .config("spark.driver.host", "localhost")
    .appName("Read Parquet File")
    .master("local[*]")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.network.timeout", "1200s")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

# The full-year file ./datasets/combined_datasets/yellow_tripdata_2023.parquet
# was apparently too large, so a single-month, two-column extract is used.
parquet_file = "./datasets/filtered_fields_datasets/yellow_tripdata_2023_06_distance_amount.parquet"

# Everything that needs the session runs inside try/finally so the session is
# stopped even when reading, collecting or plotting fails part-way through.
try:
    # Read the Parquet file
    df = spark.read.parquet(parquet_file).repartition(10).persist()

    # Show the schema of the DataFrame
    df.printSchema()

    # Get the total number of records
    total_records = df.count()

    print(f"Total number of records: {total_records}")

    # Convert to Pandas DataFrame for plotting (collects every row to the driver)
    pdf = df.toPandas()

    # Plot 1: All data
    _save_scatter(
        pdf,
        'Trip Distance vs Total Amount (All Data)',
        'blue',
        'trip_distance_vs_total_amount_all_data.png',
    )

    # Calculate mean and standard deviation
    mean_trip_distance = pdf['trip_distance'].mean()
    std_trip_distance = pdf['trip_distance'].std()

    mean_total_amount = pdf['total_amount'].mean()
    std_total_amount = pdf['total_amount'].std()

    # Keep only rows where both columns lie strictly within 2 standard
    # deviations of their mean. "Normalized" in the names below means this
    # outlier filter; the values themselves are not rescaled.
    normalized_pdf = pdf[
        (pdf['trip_distance'] > mean_trip_distance - 2 * std_trip_distance) &
        (pdf['trip_distance'] < mean_trip_distance + 2 * std_trip_distance) &
        (pdf['total_amount'] > mean_total_amount - 2 * std_total_amount) &
        (pdf['total_amount'] < mean_total_amount + 2 * std_total_amount)
    ]

    # Plot 2: Normalized data
    _save_scatter(
        normalized_pdf,
        'Trip Distance vs Total Amount (Normalized Data)',
        'green',
        'trip_distance_vs_total_amount_normalized_data.png',
    )
finally:
    spark.stop()

root
 |-- trip_distance: double (nullable = true)
 |-- total_amount: double (nullable = true)

Total number of records: 3307234
