In [None]:
# Install PySpark and Java (Colab needs this for Apache Spark)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark



In [None]:
# Import necessary libraries and start Spark session
from pyspark import SparkContext, SparkConf
from google.colab import drive
import re

In [None]:
# Set up SparkContext
conf = SparkConf().setAppName("Q2_Temperature").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [None]:
# Mount Google Drive to access your files
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the text file from Google Drive.
inp_city = sc.textFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/Dataset/city_temperature.csv")
print(inp_city.collect())  # To see the content of the file (optional)

inp_country = sc.textFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/Dataset/country-list.csv")
print(inp_country.collect())  # To see the content of the file (optional)

Buffered data was truncated after reaching the output size limit.

In [None]:
# Clean and Split City Temperature Data
#Each line has 8 columns: Region,Country,State,City,Month,Day,Year,AvgTemperature
header_city = inp_city.first()
city_data = (
    inp_city.filter(lambda l: l != header_city)
            .map(lambda l: l.split(","))
            .filter(lambda x: len(x) == 8 and x[7].strip() != "")
            .map(lambda x: (x[0].strip(), x[1].strip(), x[2].strip(),
                            x[3].strip(), int(x[4]), int(x[5]), int(x[6]),
                            float(x[7])))
)

In [None]:
#Clean and Split Country List Data
#Columns: country, capital, type

header_country = inp_country.first()
country_data = (
    inp_country.filter(lambda l: l != header_country)
               .map(lambda l: l.split(","))
               .filter(lambda x: len(x) == 3)
               .map(lambda x: (x[0].strip(), x[1].strip(), x[2].strip()))
)


# **Q2A. Find the average of AvgTemperature for each Region.**

In [18]:
region_avg = (
    city_data.map(lambda x: (x[0], (x[7], 1)))
             .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1]))
             .mapValues(lambda v: round(v[0]/v[1], 2))
)
region_avg.coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q2A_output")


# **Q2B. Find the average of AvgTemperature by Month for countries only located in the “Asia” Region**

In [12]:
asia_avg = (
    city_data.filter(lambda x: x[0].lower() == "asia")
             .map(lambda x: (x[4], (x[7], 1)))   # key = Month
             .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1]))
             .mapValues(lambda v: round(v[0]/v[1], 2))
)
asia_avg.coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q2B_output")


# **Q2C. Find the average of AvgTemperature by City only located in the Country “Germany”**

In [13]:
germany_city_avg = (
    city_data.filter(lambda x: x[1].lower() == "germany")
             .map(lambda x: (x[3], (x[7], 1)))   # key = City
             .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1]))
             .mapValues(lambda v: round(v[0]/v[1], 2))
)
germany_city_avg.coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q2C_output")

# **Q2D. For each country, find the capital and average of AvgTemperature of that capital city. Your**
output file should contain: <Country><TAB><Year><TAB><Avg of AvgTemperature of the Country>

In [14]:
# city_data: (Region, Country, State, City, Month, Day, Year, Temp)
# country_data: (Country, Capital, Type)

city_pairs = city_data.map(lambda x: (x[1], (x[3], x[6], x[7])))   # (Country, (City, Year, Temp))
capital_pairs = country_data.map(lambda x: (x[0], x[1]))           # (Country, Capital)

joined = city_pairs.join(capital_pairs)
# joined -> (Country, ((City, Year, Temp), Capital))

capital_avg = (
    joined.filter(lambda kv: kv[1][0][0].lower() == kv[1][1].lower())
          .map(lambda kv: ((kv[0], kv[1][0][1]), (kv[1][0][2], 1)))   # key = (Country, Year)
          .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1]))
          .mapValues(lambda v: round(v[0]/v[1], 2))
)

capital_avg.coalesce(1).saveAsTextFile("/content/drive/MyDrive/ColabNotebooks/CS4371_HW3_Natthiya/q2D_output")
