In [1]:
import csv

from pyspark import SparkConf, SparkContext

In [2]:
from datetime import datetime

In [3]:
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once". The master URL and application
# name are the same ones the direct constructor call used.
spark_conf = SparkConf().setMaster('local[*]').setAppName('First Spark App')
sc = SparkContext.getOrCreate(conf=spark_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 02:40:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the passenger CSV through the SparkContext; parse_line later treats
# each element of the resulting RDD as one text line of the file.
csv_path = '/home/maryam/content/lax_passengers_header.csv'
lines = sc.textFile(csv_path)

In [20]:
# Take the first line of the file, assumed to be the CSV header row.
# parse_line compares every line against this value so the header is skipped.
header = lines.first()

In [30]:
# Terminals whose rows are kept: "Terminal 1" .. "Terminal 8" plus the
# Tom Bradley International Terminal. Built once here rather than on every
# parse_line call.
_VALID_TERMINALS = frozenset(
    [f'Terminal {i}' for i in range(1, 9)] + ['Tom Bradley International Terminal']
)


def parse_line(line):
    """Parse one CSV line into a ``(month_year, passenger_count)`` pair.

    Reads ReportPeriod (2nd field), Terminal (3rd field) and Passenger_Count
    (6th field). Fields are split with the ``csv`` module, so quoted values,
    including ones that contain commas, are handled correctly.

    Args:
        line: One raw text line of the CSV file.

    Returns:
        ``(month_year, passenger_count)`` where ``month_year`` is the report
        period formatted as "Month YYYY" and ``passenger_count`` is an int.
        ``None`` when the line is the header, when its terminal is not in
        ``_VALID_TERMINALS``, or when the line cannot be parsed (a message is
        printed in that last case).
    """
    # `header` is the notebook-level variable holding the file's first line.
    # The check sits outside the try block so that a missing `header`
    # (cell not run) surfaces as an error instead of silently dropping rows.
    if line == header:
        return None

    try:
        # An empty line yields no fields; the index below then reports it.
        fields = next(csv.reader([line]), [])
        report_period = fields[1]  # ReportPeriod is the second field
        terminal = fields[2]  # Terminal is the third field
        passenger_count = int(fields[5])  # Passenger_Count is the sixth field

        if terminal not in _VALID_TERMINALS:
            return None

        # Parse the timestamp and keep only "Month YYYY" as the grouping key.
        date_obj = datetime.strptime(report_period, '%m/%d/%Y %I:%M:%S %p')
        return (date_obj.strftime('%B %Y'), passenger_count)
    except (IndexError, ValueError, csv.Error) as e:
        # Too few fields, a non-integer count, a malformed date or malformed CSV.
        print(f"Error parsing line: {line}, Error: {e}")
        return None

In [31]:
# Run parse_line over every raw line, then drop the None placeholders it
# returns for rows that should not be counted.
parsed_lines = (
    lines
    .map(parse_line)
    .filter(lambda record: record is not None)
)

In [32]:
# Sum the passenger counts of all records that share the same "Month YYYY" key
monthly_passenger_counts = parsed_lines.reduceByKey(
    lambda running_total, count: running_total + count
)

In [36]:
# Exclusive lower bound on a month's total passengers for it to be reported.
# Named here so the cutoff can be tuned without editing the filter itself.
MIN_MONTHLY_PASSENGERS = 5_000_000

# Keep only the (month_year, total) pairs whose total exceeds the cutoff
filtered_results = monthly_passenger_counts.filter(
    lambda month_total: month_total[1] > MIN_MONTHLY_PASSENGERS
)

In [37]:
# Collect the filtered (month_year, total) pairs to the driver and print them.
# The order in which collect() returns pairs after the reduceByKey shuffle is
# not guaranteed, so sort chronologically to keep the report deterministic.
# The keys were produced with strftime('%B %Y'), so the same format parses them.
try:
    results = sorted(
        filtered_results.collect(),
        key=lambda month_total: datetime.strptime(month_total[0], '%B %Y'),
    )
    for month_year, total in results:
        print(f"Month-Year: {month_year}, Total Passengers: {total}")
except Exception as e:
    print(f"Error collecting results: {e}")


Month-Year: March 2006, Total Passengers: 5088556
Month-Year: April 2006, Total Passengers: 5085946
Month-Year: June 2006, Total Passengers: 5509853
Month-Year: July 2006, Total Passengers: 5919529
Month-Year: August 2006, Total Passengers: 5745000
Month-Year: December 2006, Total Passengers: 5070491
Month-Year: March 2007, Total Passengers: 5240144
Month-Year: May 2007, Total Passengers: 5256763
Month-Year: June 2007, Total Passengers: 5626291
Month-Year: July 2007, Total Passengers: 6052466
Month-Year: October 2007, Total Passengers: 5069354
Month-Year: March 2008, Total Passengers: 5232233
Month-Year: June 2008, Total Passengers: 5538956
Month-Year: August 2008, Total Passengers: 5714031
Month-Year: June 2010, Total Passengers: 5311142
Month-Year: April 2011, Total Passengers: 5028056
Month-Year: May 2011, Total Passengers: 5476704
Month-Year: July 2011, Total Passengers: 6071335
Month-Year: August 2011, Total Passengers: 5918484
Month-Year: December 2011, Total Passengers: 5025839
