### TRƯỜNG ĐẠI HỌC CÔNG NGHIỆP
### THÀNH PHỐ HỒ CHÍ MINH
 
### KHOA Công nghệ Thông tin   
## ĐỀ THI GIỮA KỲ
### Môn thi : Nhập môn dữ liệu lớn 
### Lớp/Lớp học phần:  DHKHDL17A
* Thời gian làm bài: 75 phút (Không kể thời gian phát đề)
* Thí sinh được sử dụng tài liệu và tra cứu tại trang wed
  - https://spark.apache.org/
  - https://stackoverflow.com/
  - https://learn.microsoft.com/en-us/sql/t-sql
* Thí sinh làm bài và lưu lại với định dạng mssv_hovaten_gk.ipynb. Ví dụ bạn có mã số sinh viên là: 12131411, họ và tên: Nguyễn Văn A, thì nộp bài với tên: **12131411_NguyenVanA_gk.ipynb**
* Thí sinh sử dụng dữ liệu *emails.csv*, điều chỉnh biến 'dataPath' ở cell đầu tiên lại cho đúng với đường dẫn đến file data.
* Hoàn thành tất cả các vị trí có chữ **# YOUR CODE HERE** để hoàn thành yêu cầu của mỗi hàm.
#### LƯU Ý: KHÔNG THAY ĐỔI NHỮNG CHỖ KHÁC

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.rdd import RDD
from pyspark.sql.types import Row

sc = SparkSession.builder \
    .appName("Email Data Processing") \
    .getOrCreate()

dataPath = "./data/email.csv"

In [2]:
dataPath = os.environ.get("DATA_MIDTERM") or dataPath

In [3]:
#0.5
def loadAndProcessCsv(filePath: str, spark: SparkSession) -> DataFrame:
    '''
    This function loads a CSV file into a Spark DataFrame, caches it, 
    drops rows with null values, and prints the schema. (using option when read to keep format - header, columns)
    
    Args:
        file_path (str): Path to the CSV file.
        spark (SparkSession): Active Spark session.
        
    Returns:
        DataFrame: Processed DataFrame.
    '''

    data = None
    ### BEGIN SOLUTION 
    data = spark.read.format("csv") \
        .option("header", "true") \
        .option("multiLine", "true") \
        .option("escape", "\"") \
        .option("inferSchema", "true") \
        .load(filePath)
    data.cache()
    data = data.dropna()
    data.printSchema()
    ### END SOLUTION
    return data

In [4]:
data = loadAndProcessCsv(dataPath,sc)

root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
data.columns

['file', 'message']

In [6]:
### BEGIN HIDDEN TESTS
dataPathTest = os.environ.get("DATA_MIDTERM_TEST")
def loadAndProcessCsvSolution(filePath: str, spark: SparkSession) -> DataFrame:
    data = spark.read.format("csv") \
        .option("header", "true") \
        .option("multiLine", "true") \
        .option("escape", "\"") \
        .option("inferSchema", "true") \
        .load(filePath)
    data.cache()
    data = data.dropna()
    data.printSchema()
    return data
dataTest = loadAndProcessCsvSolution(dataPathTest, sc)
assert isinstance(dataTest, DataFrame), "loadAndProcessCsv() don't return correct dataType"
assert dataTest.columns == ['file', 'message'], "loadAndProcessCsv does not contain the correct columns"
### END HIDDEN TESTS

root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)



In [7]:
# 0.5đ
def createRDD(data: DataFrame)->RDD[Row]:
    '''
    This function converts a Spark DataFrame into an RDD of Rows.
    
    Args:
        data (DataFrame): Input DataFrame containing data loaded by Spark.
        
    Returns:
        RDD[Row]: An RDD containing all Rows from the input DataFrame.
    '''
    outRDD = None

    ### BEGIN SOLUTION
    outRDD = data.rdd
    ### END SOLUTION
    
    return outRDD

In [8]:
emailRDD = createRDD(data)
assert isinstance(emailRDD, RDD), "createRDD() does not return the correct data type (RDD)"
assert isinstance(emailRDD.first(),Row), "createRDD() contains elements that are not of type Row"

In [9]:
### BEGIN HIDDEN TESTS
def createRRDSolution(dataF):
    return  dataF.rdd
rddTest = createRRDSolution(dataTest)
result = createRDD(dataTest)
assert isinstance(result, RDD), "createRDD() don't return correct dataType"
assert isinstance(result.first(),Row), "createRDD() have element not correct datatype"
assert result.count() == 500, "createRDD() does not contain the correct number of elements"
### END HIDDEN TESTS

In [10]:
import email
from typing import List, Optional
from pyspark.sql.types import Row

def splitEmailAddresses(emailString: str) -> List[Optional[str]]:
    '''
    The function splits a comma-separated string of email addresses into a unique list.
    
    Args:
        emailString: A string containing email addresses separated by commas.
        
    Returns:
        A list of unique email addresses.
    '''
    if emailString:
        addresses = emailString.split(',')
        uniqueAddresses = list(frozenset(map(lambda x: x.strip(), addresses)))
        return uniqueAddresses
    return []

def extractEmailDetailsFromRawText(rawEmail: str) -> Row:
    '''
    The function extracts relevant details from a raw email message string.
    
    Args:
        rawEmail: A string representing the raw email message.
        
    Returns:
        A Row object containing the extracted email details.
    '''
    emailMessage = email.message_from_string(rawEmail)
    emailContentParts = []
    for part in emailMessage.walk():
        if part.get_content_type() == 'text/plain':
            emailContentParts.append(part.get_payload())

    emailContent = ''.join(emailContentParts)

    fromAddresses = splitEmailAddresses(emailMessage.get("From"))
    toAddresses = splitEmailAddresses(emailMessage.get("To"))
    ccEmail = splitEmailAddresses(emailMessage.get("Cc"))
    return Row(
        Date=emailMessage.get("Date"),
        From=fromAddresses, 
        To=toAddresses, 
        Subject=emailMessage.get("Subject"), 
        CC=ccEmail, 
        Content=emailContent
    )

# Extract structured email details from the first email message
firstEmailData = data.first()
structuredEmail = extractEmailDetailsFromRawText(firstEmailData.message)
structuredEmail


Row(Date='Thu, 1 Feb 2001 08:00:00 -0800 (PST)', From=['tana.jones@enron.com'], To=['tracy.ngo@enron.com', 'laurel.adams@enron.com', 'dianne.seib@enron.com', 'brant.reves@enron.com', 'wendy.conwell@enron.com', 'mary.cook@enron.com', 'wendi.lebrocq@enron.com', 'melissa.murphy@enron.com', 'lesli.campbell@enron.com', 'diane.ellstrom@enron.com', 'veronica.espinoza@enron.com', 'samuel.schott@enron.com', 'bernice.rodriguez@enron.com', 'cheryl.johnson@enron.com', 'derek.bailey@enron.com', 'edward.sacks@enron.com', 'kim.theriot@enron.com', 'bradley.diebner@enron.com', 'jason.moore@enron.com', 'dale.neuner@enron.com', 'kevin.meredith@enron.com', 'andrea.guillen@enron.com', 'carol.clair@enron.com', 'frank.sayre@enron.com', 'diane.anderson@enron.com', 'leslie.reeves@enron.com', 'carrie.southard@enron.com', 'mary.gosnell@enron.com', 'mark.taylor@enron.com', 'karen.lambert@enron.com', 'sara.shackleton@enron.com', 'amber.ebow@enron.com', 'georgi.landau@enron.com', 'marilyn.colbert@enron.com', 'rober

In [11]:
#0.5đ
def createStructuredEmailRDD(emailRDD: RDD[Row]) -> RDD[Row]:
    '''
    The function takes an RDD of email messages and converts it into a new RDD containing structured email details.
    
    Args:
        emailRDD: An RDD where each Row contains an email message in raw text format.
        
    Returns:
        A new RDD where each element is a Row with structured email details such as Date, From, To, Subject, CC, and Content.
    '''
    structuredEmailRDD = None
    ### BEGIN SOLUTION
    structuredEmailRDD = emailRDD.map(lambda row: extractEmailDetailsFromRawText(row.message))
    ### END SOLUTION
    return structuredEmailRDD

In [12]:
structuredEmailRDD = createStructuredEmailRDD(emailRDD)
assert isinstance(structuredEmailRDD, RDD), "createStructuredEmailRDD() doesn't return an RDD"
assert isinstance(structuredEmailRDD.first(), Row), "createStructuredEmailRDD() elements are not of type Row"

In [13]:
### BEGIN HIDDEN TESTS
def createStructuredEmailRDDSolution(emailRDD):
    return emailRDD.map(lambda row: extractEmailDetailsFromRawText(row.message))
result =  createStructuredEmailRDD(rddTest)
firstRow = result.first()
assert "Date" in firstRow, "'Date' field is missing in the structured Row"
assert "From" in firstRow, "'From' field is missing in the structured Row"
assert "To" in firstRow, "'To' field is missing in the structured Row"
assert "Subject" in firstRow, "'Subject' field is missing in the structured Row"
assert "CC" in firstRow, "'CC' field is missing in the structured Row"
assert "Content" in firstRow, "'Content' field is missing in the structured Row"

assert firstRow.Date == 'Thu, 1 Feb 2001 08:00:00 -0800 (PST)', "structuredEmailRDD() have error when get content from Row"
### END HIDDEN TESTS

In [14]:
#1.
def countNumberEmail(structuredEmailRDD: RDD[Row], k: int)->int:
    '''
    The function counts the number of emails with more than `k` email addresses in the From field and To field.
    
    Args:
    - structuredEmailRDD: RDD of Row objects, each containing an email's structured data.
    - k: The threshold for the number of emails in the CC field.
    
    Returns:
    - int: The count of emails with more than `k` email addresses in the From field and To field.
    '''
    count = -1
    ### BEGIN SOLUTION
    count =  structuredEmailRDD.map(lambda row: 1 if (len(row.From) + len(row.To)) > k else 0).reduce(lambda a, b: a + b)
    ### END SOLUTION
    return count

In [15]:
countNumberEmail(structuredEmailRDD,50)

533

In [16]:
### BEGIN HIDDEN TESTS
structuredEmailRDDTest = createStructuredEmailRDDSolution(rddTest)
countTest = countNumberEmail(structuredEmailRDDTest,12)
assert countTest == 41, "Count return wrong number email"
### END HIDDEN TESTS

In [17]:
# 1đ
def countUniqueEmailDomains(structuredEmailRDD: RDD[Row], k) -> int:
    '''
    The function counts the number of unique email domains in the "From" and "CC" using map and reduce.
    
    Args:
    - structuredEmailRDD: RDD of Row objects, each containing an email's structured data.
    
    Returns:
    - dict: A dictionary showing the count of emails from each unique domain in the "From" and "CC" field.
        k = 3 => {
                'enron.com': 16452,
                'aol.com': 122,
                'hotmail.com': 101
                }
    '''
    results = {}
    ### BEGIN SOLUTION
    domainCounts = structuredEmailRDD.flatMap(lambda row: [email.split('@')[1] for email in row.From+row.CC]) \
                                    .map(lambda domain: (domain, 1)) \
                                    .reduceByKey(lambda a, b: a + b)
    sortedDomainCounts = domainCounts.sortBy(lambda x: x[1], ascending=False)
    
    topDomain = sortedDomainCounts.take(k)
    results = {i:v for i,v in topDomain}
    ### END SOLUTION
    return results
    

In [18]:
countUniqueEmailDomains(structuredEmailRDD, 10)

{'enron.com': 35316,
 'aol.com': 214,
 'hotmail.com': 180,
 'enron.com>': 109,
 'txu.com': 104,
 'haas.berkeley.edu': 98,
 'duke-energy.com': 92,
 'yahoo.com': 88,
 'caiso.com': 83,
 'nyiso.com': 59}

In [19]:
### BEGIN HIDDEN TESTS
countDomains = countUniqueEmailDomains(structuredEmailRDDTest,3)
assert countDomains == {'enron.com': 839, 'hotmail.com': 7, 'haas.berkeley.edu': 7}, "countUniqueEmailDomains wrong answer"
### END HIDDEN TESTS

In [20]:
# 0.5đ ;
def countEmailsBySender(structuredEmailRDD: RDD[Row], sender: str) -> int:
    '''
    The function counted emails in the dataset to include those sent by a specific sender's email.

    Args:
    - structuredEmailRDD: RDD of Row objects, each containing an email's structured data.
    - recipient: The email address of the sender to count.

    Returns:
    - numEmails: number email sent by sender
    '''
    numEmails =  -1
    ### BEGIN SOLUTION
    filteredEmailsRDD = structuredEmailRDD.filter(lambda emailRow: sender in emailRow.From)
    numEmails = filteredEmailsRDD.count()
     ### END SOLUTION
    return numEmails

In [21]:
countEmailsBySender(structuredEmailRDD, "tana.jones@enron.com")

327

In [22]:
### BEGIN HIDDEN TESTS
countEmailsByRecipient =  countEmailsBySender(structuredEmailRDDTest, "tana.jones@enron.com")
assert countEmailsByRecipient == 5, "filterEmailsByRecipient wrong answer"
### END HIDDEN TESTS

In [23]:
# 1.5d
def getTopKFrequentWordsInContentByRecipient(structuredEmailRDD: RDD[Row], recipient: str, k: int) -> dict:
    '''
    This function filtered emails in the dataset sent to a specific recipient's email,
    and then returns the top k most frequent words found in the content of those filtered emails.

    Args:
    - structuredEmailRDD: RDD of Row objects, where each Row represents an email with structured data, such as subject and content.
    - recipient: Recipient's email which needs to count.
    - k: The number of most frequent words to return.

    Returns:
    - topKWordsDict: A dictionary containing the top k most frequent words found in the content of filtered emails.
                     The keys are the words, and the values are their frequencies, representing how often they appear in the content of the filtered emails.
                    example:
                    {'to': 12,
                     'the': 9,
                     'your': 8,
                     'a': 5,
                     'is': 4,
                     'survey': 4,
                     'and': 4,
                     'you': 4,
                     'of': 3,
                     'very': 3}
    '''
    topKWordsDict = {}
    ### BEGIN SOLUTION
    filteredEmailsRDD = structuredEmailRDD.filter(lambda emailRow: recipient in emailRow.To)
    
    wordCountRDD = filteredEmailsRDD.flatMap(lambda emailRow: emailRow.Content.lower().split()) \
                                    .map(lambda word: (word, 1)) \
                                    .reduceByKey(lambda a, b: a + b)
    sortedWordCount = wordCountRDD.collect()
    sortedWordCount.sort(key=lambda x: x[1], reverse=True)
    topKWordsDict = dict(sortedWordCount[:k])
    ### END SOLUTION
    return topKWordsDict

In [24]:
getTopKFrequentWordsInContentByRecipient(structuredEmailRDD, "tana.jones@enron.com", 10)

{'the': 2385,
 'to': 1503,
 'of': 1046,
 'and': 1005,
 'for': 702,
 'a': 646,
 'in': 626,
 'you': 612,
 'on': 600,
 'is': 552}

In [25]:
### BEGIN HIDDEN TESTS
topkWord = getTopKFrequentWordsInContentByRecipient(structuredEmailRDDTest, "tana.jones@enron.com", 4)
assert topkWord == {'the': 100, 'to': 52, 'of': 48, 'for': 28}, "getTopKFrequentWordsInContentByRecipient wrong answer"
### END HIDDEN TESTS

In [26]:
# SQL Query
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, size, to_date, year, month, weekofyear
sc.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
# Convert RDD to DataFrame
dfEmais = structuredEmailRDD.toDF()
dfEmais = dfEmais.withColumn('DateTime', to_date(col('Date'), "EEE, d MMM yyyy HH:mm:ss Z"))
dfEmais = dfEmais.withColumn('Num_To', size(col('To')))
dfEmais = dfEmais.withColumn('Num_CC', size(col('CC')))

In [27]:
sqlContext = SQLContext(sc)
tableName = "Emails"
dfEmais.createOrReplaceTempView(tableName)



In [28]:
# 0.5đ
def getTopKRowsBySQL(sqlContext: SQLContext, tableName: str, k: int) -> DataFrame:
    '''
    This function queries the first k rows from a given table using SQLContext.

    Args:
    - sqlContext: An SQLContext object that provides the environment to run SQL queries on structured data.
    - tableName: The name of the table from which the rows will be selected.
    - k: The number of rows to return. It must be a positive integer.

    Returns:
    - result: A DataFrame containing the first k rows of data from the specified table.
                If the table contains fewer than k rows, the DataFrame will contain all available rows.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(k, int) and k > 0, f"k must be a positive integer, but got {k}"

    query = f"SELECT * FROM {tableName} LIMIT {k}"
    
    result = sqlContext.sql(query)
    ### END SOLUTION
    return result


def getTopKRowsByDFOperations(dataFrame: DataFrame, k: int) -> DataFrame:
    '''
    This function queries the first k rows from a given table using SQLContext.

    Args:
    - dataFrame: An DataFrame data
    - tableName: The name of the table from which the rows will be selected.
    - k: The number of rows to return. It must be a positive integer.

    Returns:
    - resultDF: A DataFrame containing the first k rows of data from the specified table.
                If the table contains fewer than k rows, the DataFrame will contain all available rows.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"sqlContext is not the correct data type, expected SQLContext but got {type(dataFrame)}"
    assert isinstance(k, int) and k > 0, f"k must be a positive integer, but got {k}"
    result =  dataFrame.limit(k)
    ### END SOLUTION
    return result

In [29]:
getTopKRowsBySQL(sqlContext,tableName,1).show()

+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|                Date|                From|                  To|         Subject|                  CC|             Content|  DateTime|Num_To|Num_CC|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|Thu, 1 Feb 2001 0...|[tana.jones@enron...|[carol.clair@enro...|Deutsche Bank AG|[larry.gagliardi@...|We have received ...|2001-02-01|    64|     3|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+



In [30]:
getTopKRowsByDFOperations(dfEmais,1).show()

+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|                Date|                From|                  To|         Subject|                  CC|             Content|  DateTime|Num_To|Num_CC|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|Thu, 1 Feb 2001 0...|[tana.jones@enron...|[carol.clair@enro...|Deutsche Bank AG|[larry.gagliardi@...|We have received ...|2001-02-01|    64|     3|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+



In [31]:
### BEGIN HIDDEN TESTS
dfEmaisTest = structuredEmailRDDTest.toDF()
dfEmaisTest = dfEmaisTest.withColumn('DateTime', to_date(col('Date'), "EEE, d MMM yyyy HH:mm:ss Z"))
dfEmaisTest = dfEmaisTest.withColumn('Num_To', size(col('To')))
dfEmaisTest = dfEmaisTest.withColumn('Num_CC', size(col('CC')))
tableNameTest = "EmailsTest"
dfEmaisTest.createOrReplaceTempView(tableNameTest)
def getTopKRowsBySQLSolution(sqlContext: SQLContext, tableName: str, k: int) -> DataFrame:
    query = f"SELECT * FROM {tableName} LIMIT {k}"
    result = sqlContext.sql(query)
    return result
assert getTopKRowsBySQL(sqlContext,tableNameTest,5).take(4) == getTopKRowsBySQLSolution(sqlContext,tableNameTest,5).take(4), "Wrong in function getTopKRowsBySQL"
assert getTopKRowsBySQL(sqlContext,tableNameTest,1).take(4) == getTopKRowsBySQLSolution(sqlContext,tableNameTest,1).take(4), "Wrong in function getTopKRowsBySQL"
### END HIDDEN TESTS

In [32]:
### BEGIN HIDDEN TESTS    
assert getTopKRowsByDFOperations(dfEmaisTest,5).take(4) == dfEmaisTest.limit(5).take(4), "Wrong in function getTopKRowsByDFOperations"
assert getTopKRowsByDFOperations(dfEmaisTest,1).take(4) == dfEmaisTest.limit(1).take(4), "Wrong in function getTopKRowsByDFOperations"
### END HIDDEN TESTS

In [33]:
# 1đ, 6.5đ
def countEmailsWithCCGreaterThanKBySQL(sqlContext: SQLContext, tableName: str, k: int) -> int:
    '''
    This function counts the number of emails with more than k email addresses in the CC field using SQL.

    Args:
    - sqlContext: An SQLContext object that provides the environment to run SQL queries on structured data.
    - tableName: The name of the table that contains the email data.
    - k: The threshold number of email addresses in the CC field.

    Returns:
    - count: An integer representing the number of emails where the CC field has more than k email addresses.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(k, int) and k >= 0, f"k must be a non-negative integer, but got {k}"
    query = f"SELECT COUNT(*) as count FROM {tableName} WHERE size(CC) > {k}"
    resultDF = sqlContext.sql(query)
    count = resultDF.collect()[0]['count']
    ### END SOLUTION
    return count
def countEmailsWithCCGreaterThanKByDFOperations(dataFrame: DataFrame, k: int) -> int:
    '''
    This function counts the number of emails with more than k email addresses in the CC field using DataFrame operations.

    Args:
    - dataFrame: A DataFrame containing the email data.
    - k: The threshold number of email addresses in the CC field.

    Returns:
    - count: An integer representing the number of emails where the CC field has more than k email addresses.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    assert isinstance(k, int) and k >= 0, f"k must be a non-negative integer, but got {k}"

    result = dataFrame.filter(size(col('CC')) > k).count()
    ### END SOLUTION
    return result


In [34]:
countEmailsWithCCGreaterThanKBySQL(sqlContext,tableName,50)

15

In [35]:
countEmailsWithCCGreaterThanKByDFOperations(dfEmais,50)

15

In [36]:
### BEGIN HIDDEN TESTS
assert countEmailsWithCCGreaterThanKBySQL(sqlContext,tableNameTest,50) == 2, "Wrong in function countEmailsWithCCGreaterThanKBySQL"
assert countEmailsWithCCGreaterThanKBySQL(sqlContext,tableNameTest,10) == 11, "Wrong in function countEmailsWithCCGreaterThanKBySQL"
### END HIDDEN TESTS

In [37]:
### BEGIN HIDDEN TESTS
assert countEmailsWithCCGreaterThanKByDFOperations(dfEmaisTest, 50) == 2, "Wrong in function countEmailsWithCCGreaterThanKByDFOperations"
assert countEmailsWithCCGreaterThanKByDFOperations(dfEmaisTest, 10) == 11, "Wrong in function countEmailsWithCCGreaterThanKByDFOperations"
### END HIDDEN TESTS

In [38]:
#1đ, 7.5
from datetime import date
def getDateRangeBySQL(sqlContext: SQLContext, tableName: str) -> tuple[date,date]:
    '''
    This function retrieves the start and end dates from the email dataset using SQL queries.

    Args:
    - sqlContext: An SQLContext object that provides the environment to run SQL queries on structured data.
    - tableName: The name of the table that contains the email data.

    Returns:
    - (startDate, endDate): A tuple containing two elements: the earliest (start) date and the latest (end) date.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"

    query = f"SELECT MIN(DateTime) as startDate, MAX(DateTime) as endDate FROM {tableName}"
    
    resultDF = sqlContext.sql(query)
    dateRange = resultDF.collect()[0]
    startDate, endDate = dateRange['startDate'], dateRange['endDate']
     ### END SOLUTION
    return startDate, endDate
def getDateRangeByDFOperations(dataFrame: DataFrame) -> tuple[date,date]:
    '''
    This function retrieves the start and end dates from the email dataset using DataFrame operations.

    Args:
    - dataFrame: A DataFrame containing the email data.

    Returns:
    - (startDate, endDate): A tuple containing two elements: the earliest (start) date and the latest (end) date.
    '''
     ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    startDate = dataFrame.agg({"DateTime": "min"}).collect()[0][0]
    endDate = dataFrame.agg({"DateTime": "max"}).collect()[0][0]
     ### END SOLUTION
    return startDate, endDate


In [39]:
getDateRangeBySQL(sqlContext,tableName)

(datetime.date(1, 8, 1), datetime.date(2012, 11, 28))

In [40]:
getDateRangeByDFOperations(dfEmais)

(datetime.date(1, 8, 1), datetime.date(2012, 11, 28))

In [41]:
### BEGIN HIDDEN TESTS
import datetime
assert getDateRangeBySQL(sqlContext,tableNameTest) == (datetime.date(1998, 12, 16), datetime.date(2002, 5, 22)), "Wrong in function getDateRangeBySQL"
### END HIDDEN TESTS

In [42]:
### BEGIN HIDDEN TESTS
import datetime
assert getDateRangeByDFOperations(dfEmaisTest) == (datetime.date(1998, 12, 16), datetime.date(2002, 5, 22)), "Wrong in function getDateRangeByDFOperations"
### END HIDDEN TESTS

In [43]:
# 1đ, 8.5
def countEmailsInWeekBySQL(sqlContext: SQLContext, tableName: str, week: int) -> int:
    '''
    This function calculates the number of emails sent in a given week across all years by performing a filter and group by operation on the DateTime column using SQL.

    Args:
    - sqlContext: An SQLContext object that provides the environment to run SQL queries on structured data.
    - tableName: The name of the table that contains the email data.
    - week: The specific week to filter the emails by.

    Returns:
    - email_count: An integer representing the number of emails sent in the given week across all years.
    '''
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(week, int) and 1 <= week <= 52, f"week must be an integer between 1 and 52, but got {week}"
    ### BEGIN SOLUTION  
    query = f"""
        SELECT COUNT(*) as email_count
        FROM {tableName}
        WHERE WEEKOFYEAR(DateTime) = {week}
    """
    
    resultDF = sqlContext.sql(query)
    emailCount = resultDF.collect()[0]['email_count']
    ### END SOLUTION
    return emailCount


def countEmailsInWeekByDFOperations(dataFrame: DataFrame, weekValue: int) -> int:
    '''
    This function calculates the number of emails sent in a given week across all years by performing a filter operation on the DateTime column using DataFrame operations.

    Args:
    - dataFrame: A DataFrame containing the email data.
    - weekValue: The specific week to filter the emails by.

    Returns:
    - email_count: An integer representing the number of emails sent in the given week across all years.
    '''
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    assert isinstance(weekValue, int) and 1 <= weekValue <= 52, f"weekValue must be an integer between 1 and 52, but got {weekValue}"
    ### BEGIN SOLUTION 
    result = dataFrame.filter(weekofyear(col("DateTime")) == weekValue).count()
    ### END SOLUTION
    return result


In [44]:
countEmailsInWeekBySQL(sqlContext,tableName,30)

209

In [45]:
countEmailsInWeekByDFOperations(dfEmais,30)

209

In [46]:
countEmailsInWeekBySQL(sqlContext,tableNameTest,40)

10

In [47]:
### BEGIN HIDDEN TESTS
assert countEmailsInWeekBySQL(sqlContext,tableNameTest,2) == 11, "Wrong in function countEmailsInWeekBySQL"
assert countEmailsInWeekBySQL(sqlContext,tableNameTest,40) == 10, "Wrong in function countEmailsInWeekBySQL"
### END HIDDEN TESTS

In [48]:
### BEGIN HIDDEN TESTS
assert countEmailsInWeekByDFOperations(dfEmaisTest,2) == 11, "Wrong in function countEmailsInYearByDFOperations"
assert countEmailsInWeekByDFOperations(dfEmaisTest,40) == 10, "Wrong in function countEmailsInYearByDFOperations"
### END HIDDEN TESTS

In [49]:
# 1đ
def countEmailsPerMonthBySQL(sqlContext: SQLContext, tableName: str) -> DataFrame:
    '''
    This function calculates the number of emails sent per Month by performing a group by operation on the DateTime column 
    and sorts the result in descending order by the number of emails.

    Args:
    - sqlContext: An SQLContext object that provides the environment to run SQL queries on structured data.
    - tableName: The name of the table that contains the email data.

    Returns:
    - resultDF: A DataFrame containing the number of emails sent per Month, sorted in descending order by email count.
                With schema:
                    root
                     |-- month: integer (nullable = true)
                     |-- count: long (nullable = false)
    '''
    ### BEGIN SOLUTION 
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"

    query = f"""
        SELECT MONTH(DateTime) as month, COUNT(*) as count
        FROM {tableName}
        GROUP BY MONTH(DateTime)
        ORDER BY count DESC
    """
    
    result = sqlContext.sql(query)
    ### END SOLUTION 
    return result

def countEmailsPerMonthByDFOperations(dataFrame: DataFrame) -> DataFrame:
    '''
    This function calculates the number of emails sent per Month by performing a group by operation on the DateTime column 
    and sorts the result in descending order by the number of emails.

    Args:
    - dataFrame: A DataFrame containing the email data.

    Returns:
    - resultDF: A DataFrame containing the number of emails sent per Month, sorted in descending order by email count.
                With schema:
                    root
                     |-- month: integer (nullable = true)
                     |-- count: long (nullable = false)
    '''
    ### BEGIN SOLUTION 
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"

    result = dataFrame.groupBy(month(col("DateTime")).alias("month")).count()
    result = result.orderBy(col("count").desc())
    ### END SOLUTION 
    return result


In [50]:
countEmailsPerMonthBySQL(sqlContext,tableName).show()

+-----+-----+
|month|count|
+-----+-----+
|   11| 2469|
|   10| 2365|
|    1| 2024|
|    5| 1826|
|    4| 1782|
|   12| 1753|
|    3| 1634|
|    2| 1534|
|    6| 1291|
|    9| 1218|
|    8| 1146|
|    7|  958|
+-----+-----+



In [51]:
countEmailsPerMonthByDFOperations(dfEmais).show()

+-----+-----+
|month|count|
+-----+-----+
|   11| 2469|
|   10| 2365|
|    1| 2024|
|    5| 1826|
|    4| 1782|
|   12| 1753|
|    3| 1634|
|    2| 1534|
|    6| 1291|
|    9| 1218|
|    8| 1146|
|    7|  958|
+-----+-----+



In [52]:
### BEGIN HIDDEN TESTS
def countEmailsPerMonthBySQLSolution(sqlContext: SQLContext, tableName: str) -> DataFrame:
    query = f"""
        SELECT MONTH(DateTime) as month, COUNT(*) as count
        FROM {tableName}
        GROUP BY MONTH(DateTime)
        ORDER BY count DESC
    """
    result = sqlContext.sql(query)
    return result
   
assert countEmailsPerMonthBySQL(sqlContext,tableNameTest).take(10)  == countEmailsPerMonthBySQLSolution(sqlContext,tableNameTest).take(10), "Wrong in function countEmailsPerMonthBySQL"
assert countEmailsPerMonthBySQL(sqlContext,tableNameTest).take(1)  == countEmailsPerMonthBySQLSolution(sqlContext,tableNameTest).take(1), "Wrong in function countEmailsPerMonthBySQL"
assert countEmailsPerMonthBySQL(sqlContext,tableNameTest).take(1000)  == countEmailsPerMonthBySQLSolution(sqlContext,tableNameTest).take(1000), "Wrong in function countEmailsPerMonthBySQL"
### END HIDDEN TESTS

In [53]:
### BEGIN HIDDEN TESTS
def countEmailsPerMonthByDFOperationsSolution(dataFrame: DataFrame) -> DataFrame:
    result = dataFrame.groupBy(month(col("DateTime")).alias("month")).count()
    result = result.orderBy(col("count").desc())
    return result

assert countEmailsPerMonthByDFOperations(dfEmaisTest).take(1) == countEmailsPerMonthByDFOperationsSolution(dfEmaisTest).take(1), "Wrong in function countEmailsPerMonthByDFOperations"
assert countEmailsPerMonthByDFOperations(dfEmaisTest).take(5) == countEmailsPerMonthByDFOperationsSolution(dfEmaisTest).take(5), "Wrong in function countEmailsPerMonthByDFOperations"
assert countEmailsPerMonthByDFOperations(dfEmaisTest).take(1000) == countEmailsPerMonthByDFOperationsSolution(dfEmaisTest).take(1000), "Wrong in function countEmailsPerMonthByDFOperations"
### END HIDDEN TESTS