### TRƯỜNG ĐẠI HỌC CÔNG NGHIỆP
### THÀNH PHỐ HỒ CHÍ MINH
 
### KHOA Công nghệ Thông tin   
## ĐỀ THI GIỮA KỲ
### Môn thi : Nhập môn dữ liệu lớn 
### Lớp/Lớp học phần:  DHKHDL17A
* Thời gian làm bài: 75 phút (Không kể thời gian phát đề)
* Thí sinh được sử dụng tài liệu và tra cứu tại các trang web
  - https://spark.apache.org/
  - https://stackoverflow.com/
  - https://learn.microsoft.com/en-us/sql/t-sql
* Thí sinh làm bài và lưu lại với định dạng mssv_hovaten_gk.ipynb. Ví dụ bạn có mã số sinh viên là: 12131411, họ và tên: Nguyễn Văn A, thì nộp bài với tên: **12131411_NguyenVanA_gk.ipynb**
* Thí sinh sử dụng dữ liệu *emails.csv*, điều chỉnh biến 'dataPath' ở cell đầu tiên lại cho đúng với đường dẫn đến file data.
* Hoàn thành tất cả các vị trí có chữ **# YOUR CODE HERE** để hoàn thành yêu cầu của mỗi hàm.
#### LƯU Ý: KHÔNG THAY ĐỔI NHỮNG CHỖ KHÁC

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.rdd import RDD
from pyspark.sql.types import Row

# Create (or reuse) the Spark session used throughout the notebook.
# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = SparkSession.builder \
    .appName("Email Data Processing") \
    .getOrCreate()

# Default location of the input CSV; adjust it to point at the local data file.
dataPath = "./data/email.csv"

In [2]:
dataPath = os.environ.get("DATA_MIDTERM") or dataPath

In [3]:
#0.5
def loadAndProcessCsv(filePath: str, spark: SparkSession) -> DataFrame:
    '''
    Read a CSV file into a Spark DataFrame and return it without null rows.

    The path is printed first. The file is read with a header row, multi-line
    fields, a double-quote escape character and schema inference. The loaded
    DataFrame is marked for caching, rows containing null values are dropped,
    and the schema of the resulting DataFrame is printed.

    Args:
        filePath (str): Path to the CSV file.
        spark (SparkSession): Active Spark session.

    Returns:
        DataFrame: The loaded DataFrame with null-containing rows removed.
    '''

    data = None
    ### BEGIN SOLUTION
    print(filePath)
    # Reader options gathered in one place; all values are passed as strings.
    readerOptions = {
        "header": "true",
        "multiLine": "true",
        "escape": "\"",
        "inferSchema": "true",
    }
    loadedFrame = spark.read.format("csv").options(**readerOptions).load(filePath)
    # cache() marks the loaded (pre-dropna) frame; the returned frame derives from it.
    loadedFrame.cache()
    data = loadedFrame.dropna()
    data.printSchema()
    ### END SOLUTION
    return data

In [4]:
data = loadAndProcessCsv(dataPath,sc)

/home/jovyan/work/Introduction-To-Big-Data/Midterm/source/midterm/data/email.csv


root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
data.columns

['file', 'message']

In [6]:
### BEGIN HIDDEN TESTS
dataPathTest = os.environ.get("DATA_MIDTERM_TEST")
def loadAndProcessCsvSolution(filePath: str, spark: SparkSession) -> DataFrame:
    data = spark.read.format("csv") \
        .option("header", "true") \
        .option("multiLine", "true") \
        .option("escape", "\"") \
        .option("inferSchema", "true") \
        .load(filePath)
    data.cache()
    data = data.dropna()
    data.printSchema()
    return data
dataTest = loadAndProcessCsvSolution(dataPathTest, sc)
result =  loadAndProcessCsv(dataPathTest, sc)
assert isinstance(result, DataFrame), "loadAndProcessCsv() don't return correct dataType"
assert result.columns == ['file', 'message'], "loadAndProcessCsv does not contain the correct columns"
### END HIDDEN TESTS

root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)

/home/jovyan/work/Introduction-To-Big-Data/Midterm/source/midterm/data/sample.csv


root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)



In [7]:
# 0.5đ
def createRDD(data: DataFrame)->RDD[Row]:
    '''
    Expose the rows of a Spark DataFrame as an RDD.

    Args:
        data (DataFrame): The DataFrame whose rows should be exposed.

    Returns:
        RDD[Row]: The value of the DataFrame's `rdd` attribute.
    '''
    outRDD = None

    ### BEGIN SOLUTION
    # The DataFrame already carries its row-level RDD view.
    outRDD = data.rdd
    ### END SOLUTION

    return outRDD

In [8]:
emailRDD = createRDD(data)
assert isinstance(emailRDD, RDD), "createRDD() does not return the correct data type (RDD)"
assert isinstance(emailRDD.first(),Row), "createRDD() contains elements that are not of type Row"

In [9]:
### BEGIN HIDDEN TESTS
def createRRDSolution(dataF):
    return  dataF.rdd
rddTest = createRRDSolution(dataTest)
result = createRDD(dataTest)
assert isinstance(result, RDD), "createRDD() don't return correct dataType"
assert isinstance(result.first(),Row), "createRDD() have element not correct datatype"
assert result.count() == 500, "createRDD() does not contain the correct number of elements"
### END HIDDEN TESTS

In [10]:
import email
from typing import List, Optional
from pyspark.sql.types import Row

def splitEmailAddresses(emailString: Optional[str]) -> List[str]:
    '''
    Split a comma-separated string of email addresses into a list of unique addresses.

    Each address is stripped of surrounding whitespace. Duplicates are removed
    while keeping the order of first appearance, so the result is deterministic
    (a set-based de-duplication would return the addresses in arbitrary order).

    Args:
        emailString: A string containing email addresses separated by commas.
            May be None or empty (for example, when a header is missing).

    Returns:
        A list of unique email addresses in order of first appearance;
        an empty list when emailString is None or empty.
    '''
    if not emailString:
        return []
    strippedAddresses = (address.strip() for address in emailString.split(','))
    # dict keys are unique and keep insertion order, giving a stable de-duplication.
    return list(dict.fromkeys(strippedAddresses))

def extractEmailDetailsFromRawText(rawEmail: str) -> Row:
    '''
    Parse a raw email message string into a structured Row.

    Args:
        rawEmail: The raw text of one email message (headers and body).

    Returns:
        A Row with the fields Date, From, To, Subject, CC and Content.
        From, To and CC are lists produced by splitEmailAddresses; Date and
        Subject are the header values as returned by the parsed message
        (None when the header is absent); Content is the concatenation of
        the payloads of all 'text/plain' parts.
    '''
    parsedMessage = email.message_from_string(rawEmail)

    # Join the payload of every text/plain part, in the order walk() yields them.
    bodyText = ''.join(
        section.get_payload()
        for section in parsedMessage.walk()
        if section.get_content_type() == 'text/plain'
    )

    def addressesFromHeader(headerName: str):
        # Look up one header and split it into a list of unique addresses.
        return splitEmailAddresses(parsedMessage.get(headerName))

    return Row(
        Date=parsedMessage.get("Date"),
        From=addressesFromHeader("From"),
        To=addressesFromHeader("To"),
        Subject=parsedMessage.get("Subject"),
        CC=addressesFromHeader("Cc"),
        Content=bodyText
    )

# Extract structured email details from the first email message
firstEmailData = data.first()
structuredEmail = extractEmailDetailsFromRawText(firstEmailData.message)
structuredEmail


Row(Date='Thu, 1 Feb 2001 08:00:00 -0800 (PST)', From=['tana.jones@enron.com'], To=['darren.vanek@enron.com', 'lynn.shivers@enron.com', 'samuel.schott@enron.com', 'anthony.campos@enron.com', 'veronica.espinoza@enron.com', 'janie.aguayo@enron.com', 'karen.lambert@enron.com', 'wendi.lebrocq@enron.com', 'brent.hendry@enron.com', 'mary.gosnell@enron.com', 'tanya.rohauer@enron.com', 'stacey.richardson@enron.com', 'francisco.leite@enron.com', 'paul.radous@enron.com', 'tracy.ngo@enron.com', 'leslie.reeves@enron.com', 'stephanie.panus@enron.com', 'jason.moore@enron.com', 'bill.hare@enron.com', 'adnan.patel@enron.com', 'samantha.boyd@enron.com', 'kim.theriot@enron.com', 'amber.ebow@enron.com', 'sara.shackleton@enron.com', 'dianne.seib@enron.com', 'lesli.campbell@enron.com', 'scott.tackett@enron.com', 'cheryl.nelson@enron.com', 'brant.reves@enron.com', 'tom.moran@enron.com', 'susan.bailey@enron.com', 'carrie.southard@enron.com', 'frank.davis@enron.com', 'nidia.mendoza@enron.com', 'robert.bruce@e

In [11]:
#0.5đ
def createStructuredEmailRDD(emailRDD: RDD[Row]) -> RDD[Row]:
    '''
    Convert an RDD of raw email rows into an RDD of structured email rows.

    Args:
        emailRDD: An RDD whose Rows expose the raw email text in a `message` field.

    Returns:
        A new RDD in which every element is the Row built by
        extractEmailDetailsFromRawText (Date, From, To, Subject, CC, Content).
    '''
    structuredEmailRDD = None
    ### BEGIN SOLUTION
    def toStructuredRow(rawRow: Row) -> Row:
        # Only the raw message text is needed to build the structured Row.
        return extractEmailDetailsFromRawText(rawRow.message)

    structuredEmailRDD = emailRDD.map(toStructuredRow)
    ### END SOLUTION
    return structuredEmailRDD

In [12]:
structuredEmailRDD = createStructuredEmailRDD(emailRDD)
assert isinstance(structuredEmailRDD, RDD), "createStructuredEmailRDD() doesn't return an RDD"
assert isinstance(structuredEmailRDD.first(), Row), "createStructuredEmailRDD() elements are not of type Row"

In [13]:
### BEGIN HIDDEN TESTS
def createStructuredEmailRDDSolution(emailRDD):
    return emailRDD.map(lambda row: extractEmailDetailsFromRawText(row.message))
result =  createStructuredEmailRDD(rddTest)
firstRow = result.first()
assert "Date" in firstRow, "'Date' field is missing in the structured Row"
assert "From" in firstRow, "'From' field is missing in the structured Row"
assert "To" in firstRow, "'To' field is missing in the structured Row"
assert "Subject" in firstRow, "'Subject' field is missing in the structured Row"
assert "CC" in firstRow, "'CC' field is missing in the structured Row"
assert "Content" in firstRow, "'Content' field is missing in the structured Row"

assert firstRow.Date == 'Thu, 1 Feb 2001 08:00:00 -0800 (PST)', "structuredEmailRDD() have error when get content from Row"
### END HIDDEN TESTS

In [14]:
#1.
def countNumberEmail(structuredEmailRDD: RDD[Row], k: int)->int:
    '''
    Count the emails whose CC field holds more than `k` addresses.

    Args:
    - structuredEmailRDD: RDD of Row objects, each containing an email's structured data.
    - k: The threshold for the number of addresses in the CC field (strictly greater than).

    Returns:
    - int: The number of emails with more than `k` addresses in the CC field.
    '''
    count = -1
    ### BEGIN SOLUTION
    def hasMoreThanKCc(emailRow: Row) -> bool:
        # CC is a list of addresses, so its length is the number of CC recipients.
        return len(emailRow.CC) > k

    matchingEmails = structuredEmailRDD.filter(hasMoreThanKCc)
    count = matchingEmails.count()
    ### END SOLUTION
    return count

In [15]:
countNumberEmail(structuredEmailRDD,40)

25

In [16]:
### BEGIN HIDDEN TESTS
structuredEmailRDDTest = createStructuredEmailRDDSolution(rddTest)
countTest = countNumberEmail(structuredEmailRDDTest,12)
assert countTest == 8, "Count return wrong number email"
### END HIDDEN TESTS

In [17]:
# 1đ
def countUniqueEmailDomains(structuredEmailRDD: RDD[Row], k: int) -> dict:
    '''
    Count emails per sender domain (from the "From" field) using `map` and
    `reduceByKey`, and return the `k` most frequent domains.

    The domain is the text after the first '@' of each address, taken verbatim
    (no further clean-up, so e.g. a trailing '>' is kept). Addresses that do
    not contain '@' are skipped instead of raising an IndexError.

    Args:
    - structuredEmailRDD: An RDD containing Row objects, each representing an email's structured data.
    - k: The number of most frequent domains to return.

    Returns:
    - dict: A dictionary mapping each of the top `k` domains to the number of
      sender addresses from that domain.
      Example:
        If k = 3, the result might look like:
        {
          'enron.com': 16452,
          'aol.com': 122,
          'hotmail.com': 101
        }
    '''

    results = {}
    ### BEGIN SOLUTION
    def senderDomains(row):
        # Addresses without '@' have no domain part; skipping them avoids an IndexError.
        return [address.split('@')[1] for address in row.From if '@' in address]

    domainCounts = structuredEmailRDD.flatMap(senderDomains) \
                                    .map(lambda domain: (domain, 1)) \
                                    .reduceByKey(lambda a, b: a + b)
    # Most frequent domains first, then keep only the first k pairs.
    sortedDomainCounts = domainCounts.sortBy(lambda x: x[1], ascending=False)

    topDomain = sortedDomainCounts.take(k)
    results = dict(topDomain)
    ### END SOLUTION
    return results
    

In [18]:
countUniqueEmailDomains(structuredEmailRDD, 10)

{'enron.com': 16452,
 'aol.com': 122,
 'hotmail.com': 101,
 'txu.com': 76,
 'enron.com>': 66,
 'mailman.enron.com': 54,
 'nymex.com': 53,
 'haas.berkeley.edu': 48,
 'yahoo.com': 44,
 'nyiso.com': 41}

In [19]:
countUniqueEmailDomains(structuredEmailRDDTest,3)

{'enron.com': 392, 'hotmail.com': 5, 'aol.com': 4}

In [20]:
### BEGIN HIDDEN TESTS
countDomains = countUniqueEmailDomains(structuredEmailRDDTest,3)
assert countDomains == {'enron.com': 392, 'hotmail.com': 5, 'aol.com': 4}, "countUniqueEmailDomains wrong answer"
### END HIDDEN TESTS

In [21]:
# 0.5đ
def countEmailsByRecipient(structuredEmailRDD: RDD[Row], recipient: str) -> int:
    '''
    Count the emails whose "To" list contains a specific recipient address.

    Args:
    - structuredEmailRDD: RDD of Row objects, each containing an email's structured data.
    - recipient: The email address of the recipient to look for (exact match
      against the entries of the To list).

    Returns:
    - numEmails: The number of emails sent to the recipient.
    '''
    numEmails =  -1
    ### BEGIN SOLUTION
    def isAddressedToRecipient(emailRow: Row) -> bool:
        # To is a list of addresses, so this is a list-membership test.
        return recipient in emailRow.To

    numEmails = structuredEmailRDD.filter(isAddressedToRecipient).count()
    ### END SOLUTION
    return numEmails

In [22]:
countEmailsByRecipient(structuredEmailRDD, "tana.jones@enron.com")

454

In [23]:
### BEGIN HIDDEN TESTS
# Store the result under its own name: rebinding `countEmailsByRecipient` to the
# integer result would shadow the function and break any later call or re-run.
numEmailsToRecipientTest = countEmailsByRecipient(structuredEmailRDDTest, "tana.jones@enron.com")
assert numEmailsToRecipientTest == 15, "countEmailsByRecipient wrong answer"
### END HIDDEN TESTS

In [24]:
# 1.5d
def getTopKFrequentWordsInContentBySubject(structuredEmailRDD: RDD[Row], keyword: str, k: int) -> dict:
    '''
    This function filters the dataset to include only emails with a specific keyword in the subject line,
    and then returns the top k most frequent words found in the content of those filtered emails.

    Words are obtained by lower-casing the content and splitting it on whitespace.
    Emails without a Subject header (Subject is None) are treated as having an empty subject.

    Args:
    - structuredEmailRDD: RDD of Row objects, where each Row represents an email with structured data, such as subject and content.
    - keyword: The keyword to search for in the subject line (case-insensitive substring match).
    - k: The number of most frequent words to return.

    Returns:
    - topKWordsDict: A dictionary containing the top k most frequent words found in the content of filtered emails.
                     The keys are the words, and the values are their frequencies, representing how often they appear in the content of the filtered emails.
                     example:
                    {'to': 12,
                     'the': 9,
                     'your': 8,
                     'a': 5,
                     'is': 4,
                     'survey': 4,
                     'and': 4,
                     'you': 4,
                     'of': 3,
                     'very': 3}
    '''
    topKWordsDict = {}
    ### BEGIN SOLUTION
    # Lower-case the keyword once instead of once per email.
    loweredKeyword = keyword.lower()

    def subjectContainsKeyword(emailRow):
        # Subject is None when the header is missing; fall back to an empty string.
        return loweredKeyword in (emailRow.Subject or "").lower()

    filteredEmailsRDD = structuredEmailRDD.filter(subjectContainsKeyword)

    wordCountRDD = filteredEmailsRDD.flatMap(lambda emailRow: emailRow.Content.lower().split()) \
                                    .map(lambda word: (word, 1)) \
                                    .reduceByKey(lambda a, b: a + b)
    # The counts are collected to the driver and sorted there (stable sort, highest count first).
    sortedWordCount = wordCountRDD.collect()
    sortedWordCount.sort(key=lambda x: x[1], reverse=True)
    topKWordsDict = dict(sortedWordCount[:k])
    ### END SOLUTION
    return topKWordsDict

In [25]:
getTopKFrequentWordsInContentBySubject(structuredEmailRDD, "image", 10)

{'to': 12,
 'the': 9,
 'your': 8,
 'a': 5,
 'is': 4,
 'survey': 4,
 'and': 4,
 'you': 4,
 'of': 3,
 'very': 3}

In [26]:
### BEGIN HIDDEN TESTS
topkWord = getTopKFrequentWordsInContentBySubject(structuredEmailRDDTest, "da", 4)
assert topkWord == {'the': 1155, 'to': 743, 'and': 509, 'of': 463}, "getTopKFrequentWordsInContentBySubject wrong answer"
### END HIDDEN TESTS

In [27]:
# SQL Query
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, size, to_date, year, month
sc.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
# Convert RDD to DataFrame
dfEmais = structuredEmailRDD.toDF()
dfEmais = dfEmais.withColumn('DateTime', to_date(col('Date'), "EEE, d MMM yyyy HH:mm:ss Z"))
dfEmais = dfEmais.withColumn('Num_To', size(col('To')))
dfEmais = dfEmais.withColumn('Num_CC', size(col('CC')))

In [28]:
# `sc` is a SparkSession, while SQLContext expects a SparkContext as its first
# argument; pass the session's SparkContext and the session itself explicitly.
sqlContext = SQLContext(sc.sparkContext, sc)
# Register the emails DataFrame as a temporary view so it can be queried by name.
tableName = "Emails"
dfEmais.createOrReplaceTempView(tableName)



In [29]:
# 0.5đ
def getTopKRowsBySQL(sqlContext: SQLContext, tableName: str, k: int) -> DataFrame:
    '''
    Select up to k rows from a table with a SQL `LIMIT` query.

    Args:
    - sqlContext: The SQLContext used to run the query.
    - tableName: The name of the table to read from.
    - k: The maximum number of rows to return. It must be a positive integer.

    Returns:
    - result: A DataFrame for `SELECT * FROM <tableName> LIMIT <k>`
              (no ORDER BY is applied).

    Raises:
    - AssertionError: If an argument has the wrong type or k is not positive.
    '''
    ### BEGIN SOLUTION
    # Validate the arguments before building the query text from them.
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(k, int) and k > 0, f"k must be a positive integer, but got {k}"

    result = sqlContext.sql(f"SELECT * FROM {tableName} LIMIT {k}")
    ### END SOLUTION
    return result


def getTopKRowsByDFOperations(dataFrame: DataFrame, k: int) -> DataFrame:
    '''
    This function returns the first k rows of a DataFrame using DataFrame operations.

    Args:
    - dataFrame: The DataFrame from which the rows will be selected.
    - k: The number of rows to return. It must be a positive integer.

    Returns:
    - result: A DataFrame limited to k rows of the input DataFrame.

    Raises:
    - AssertionError: If dataFrame is not a DataFrame or k is not a positive integer.
    '''
    ### BEGIN SOLUTION
    # The message names the argument actually being checked (it previously mentioned sqlContext).
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    assert isinstance(k, int) and k > 0, f"k must be a positive integer, but got {k}"
    result =  dataFrame.limit(k)
    ### END SOLUTION
    return result

In [30]:
getTopKRowsBySQL(sqlContext,tableName,1).show()

+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|                Date|                From|                  To|         Subject|                  CC|             Content|  DateTime|Num_To|Num_CC|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|Thu, 1 Feb 2001 0...|[tana.jones@enron...|[carol.clair@enro...|Deutsche Bank AG|[larry.gagliardi@...|We have received ...|2001-02-01|    64|     3|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+



In [31]:
getTopKRowsByDFOperations(dfEmais,1).show()

+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|                Date|                From|                  To|         Subject|                  CC|             Content|  DateTime|Num_To|Num_CC|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+
|Thu, 1 Feb 2001 0...|[tana.jones@enron...|[carol.clair@enro...|Deutsche Bank AG|[larry.gagliardi@...|We have received ...|2001-02-01|    64|     3|
+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+----------+------+------+



In [32]:
### BEGIN HIDDEN TESTS
dfEmaisTest = structuredEmailRDDTest.toDF()
dfEmaisTest = dfEmaisTest.withColumn('DateTime', to_date(col('Date'), "EEE, d MMM yyyy HH:mm:ss Z"))
dfEmaisTest = dfEmaisTest.withColumn('Num_To', size(col('To')))
dfEmaisTest = dfEmaisTest.withColumn('Num_CC', size(col('CC')))
tableNameTest = "EmailsTest"
dfEmaisTest.createOrReplaceTempView(tableNameTest)
def getTopKRowsBySQLSolution(sqlContext: SQLContext, tableName: str, k: int) -> DataFrame:
    query = f"SELECT * FROM {tableName} LIMIT {k}"
    result = sqlContext.sql(query)
    return result
assert getTopKRowsBySQL(sqlContext,tableNameTest,5).take(4) == getTopKRowsBySQLSolution(sqlContext,tableNameTest,5).take(4), "Wrong in function getTopKRowsBySQL"
assert getTopKRowsBySQL(sqlContext,tableNameTest,1).take(4) == getTopKRowsBySQLSolution(sqlContext,tableNameTest,1).take(4), "Wrong in function getTopKRowsBySQL"
### END HIDDEN TESTS

In [33]:
### BEGIN HIDDEN TESTS    
assert getTopKRowsByDFOperations(dfEmaisTest,5).take(4) == dfEmaisTest.limit(5).take(4), "Wrong in function getTopKRowsByDFOperations"
assert getTopKRowsByDFOperations(dfEmaisTest,1).take(4) == dfEmaisTest.limit(1).take(4), "Wrong in function getTopKRowsByDFOperations"
### END HIDDEN TESTS

In [34]:
# 1đ, 6.5đ
def countEmailsWithCCGreaterThanKBySQL(sqlContext: SQLContext, tableName: str, k: int) -> int:
    '''
    Count, with a SQL query, the emails whose CC field has more than k addresses.

    Args:
    - sqlContext: The SQLContext used to run the query.
    - tableName: The name of the table that contains the email data.
    - k: The threshold number of addresses in the CC field (non-negative integer).

    Returns:
    - count: The number of rows for which size(CC) is greater than k.

    Raises:
    - AssertionError: If an argument has the wrong type or k is negative.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(k, int) and k >= 0, f"k must be a non-negative integer, but got {k}"
    # The aggregate query yields a single row holding the count.
    countRow = sqlContext.sql(
        f"SELECT COUNT(*) as count FROM {tableName} WHERE size(CC) > {k}"
    ).first()
    count = countRow['count']
    ### END SOLUTION
    return count
def countEmailsWithCCGreaterThanKByDFOperations(dataFrame: DataFrame, k: int) -> int:
    '''
    Count, with DataFrame operations, the emails whose CC field has more than k addresses.

    Args:
    - dataFrame: A DataFrame containing the email data (with a CC array column).
    - k: The threshold number of addresses in the CC field (non-negative integer).

    Returns:
    - result: The number of rows for which size(CC) is greater than k.

    Raises:
    - AssertionError: If dataFrame is not a DataFrame or k is not a non-negative integer.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    assert isinstance(k, int) and k >= 0, f"k must be a non-negative integer, but got {k}"

    # Number of CC addresses per email, as a column expression.
    ccSize = size(col('CC'))
    emailsOverThreshold = dataFrame.filter(ccSize > k)
    result = emailsOverThreshold.count()
    ### END SOLUTION
    return result


In [35]:
countEmailsWithCCGreaterThanKBySQL(sqlContext,tableName,50)

15

In [36]:
countEmailsWithCCGreaterThanKByDFOperations(dfEmais,50)

15

In [37]:
### BEGIN HIDDEN TESTS
assert countEmailsWithCCGreaterThanKBySQL(sqlContext,tableNameTest,50) == 2, "Wrong in function countEmailsWithCCGreaterThanKBySQL"
assert countEmailsWithCCGreaterThanKBySQL(sqlContext,tableNameTest,10) == 11, "Wrong in function countEmailsWithCCGreaterThanKBySQL"
### END HIDDEN TESTS

In [38]:
### BEGIN HIDDEN TESTS
assert countEmailsWithCCGreaterThanKByDFOperations(dfEmaisTest, 50) == 2, "Wrong in function countEmailsWithCCGreaterThanKByDFOperations"
assert countEmailsWithCCGreaterThanKByDFOperations(dfEmaisTest, 10) == 11, "Wrong in function countEmailsWithCCGreaterThanKByDFOperations"
### END HIDDEN TESTS

In [39]:
#1đ, 7.5
def getDateRangeBySQL(sqlContext: SQLContext, tableName: str) -> tuple:
    '''
    Retrieve the earliest and latest DateTime values of the email table with a SQL query.

    Args:
    - sqlContext: The SQLContext used to run the query.
    - tableName: The name of the table that contains the email data.

    Returns:
    - (startDate, endDate): The minimum and the maximum of the DateTime column.

    Raises:
    - AssertionError: If an argument has the wrong type.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"

    # The aggregate query yields one row: (startDate, endDate).
    dateRange = sqlContext.sql(
        f"SELECT MIN(DateTime) as startDate, MAX(DateTime) as endDate FROM {tableName}"
    ).first()
    startDate = dateRange['startDate']
    endDate = dateRange['endDate']
    ### END SOLUTION
    return startDate, endDate
def getDateRangeByDFOperations(dataFrame: DataFrame) -> tuple:
    '''
    This function retrieves the start and end dates from the email dataset using DataFrame operations.

    Both bounds are computed in a single aggregation instead of two separate ones.

    Args:
    - dataFrame: A DataFrame containing the email data (with a DateTime column).

    Returns:
    - (startDate, endDate): A tuple containing two elements: the earliest (start) date and the latest (end) date.

    Raises:
    - AssertionError: If dataFrame is not a DataFrame.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    # One global aggregate row holding (min, max) of DateTime.
    dateBounds = dataFrame.selectExpr("min(DateTime)", "max(DateTime)").first()
    startDate, endDate = dateBounds[0], dateBounds[1]
    ### END SOLUTION
    return startDate, endDate


In [40]:
getDateRangeBySQL(sqlContext,tableName)

(datetime.date(1, 8, 1), datetime.date(2012, 11, 28))

In [41]:
getDateRangeByDFOperations(dfEmais)

(datetime.date(1, 8, 1), datetime.date(2012, 11, 28))

In [42]:
### BEGIN HIDDEN TESTS
import datetime
assert getDateRangeBySQL(sqlContext,tableNameTest) == (datetime.date(1998, 12, 16), datetime.date(2002, 5, 22)), "Wrong in function getDateRangeBySQL"
### END HIDDEN TESTS

In [43]:
### BEGIN HIDDEN TESTS
import datetime
assert getDateRangeByDFOperations(dfEmaisTest) == (datetime.date(1998, 12, 16), datetime.date(2002, 5, 22)), "Wrong in function getDateRangeByDFOperations"
### END HIDDEN TESTS

In [44]:
# 1đ, 8.5
def countEmailsInYearBySQL(sqlContext: SQLContext, tableName: str, year: int) -> int:
    '''
    Count, with a SQL query, the emails whose DateTime falls in a given year.

    Args:
    - sqlContext: The SQLContext used to run the query.
    - tableName: The name of the table that contains the email data.
    - year: The year to filter the emails by (positive integer). Inside this
      function the name hides the `year` function imported from pyspark.sql.functions.

    Returns:
    - emailCount: The number of rows for which YEAR(DateTime) equals the given year.

    Raises:
    - AssertionError: If an argument has the wrong type or year is not positive.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"
    assert isinstance(year, int) and year > 0, f"year must be a positive integer, but got {year}"

    query = f"""
        SELECT COUNT(*) as email_count
        FROM {tableName}
        WHERE YEAR(DateTime) = {year}
    """

    # The aggregate query yields a single row holding the count.
    countRow = sqlContext.sql(query).first()
    emailCount = countRow['email_count']
    ### END SOLUTION
    return emailCount

def countEmailsInYearByDFOperations(dataFrame: DataFrame, yearValue: int) -> int:
    '''
    Count, with DataFrame operations, the emails whose DateTime falls in a given year.

    Args:
    - dataFrame: A DataFrame containing the email data (with a DateTime column).
    - yearValue: The year to filter the emails by (positive integer).

    Returns:
    - result: The number of rows whose DateTime year equals yearValue.

    Raises:
    - AssertionError: If dataFrame is not a DataFrame or yearValue is not a positive integer.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"
    assert isinstance(yearValue, int) and yearValue > 0, f"yearValue must be a positive integer, but got {yearValue}"

    # Year component of each email's DateTime, as a column expression.
    emailYear = year(col("DateTime"))
    emailsInYear = dataFrame.filter(emailYear == yearValue)
    result = emailsInYear.count()
    ### END SOLUTION
    return result


In [45]:
countEmailsInYearBySQL(sqlContext,tableName,2000)

7462

In [46]:
countEmailsInYearByDFOperations(dfEmais,2000)

7462

In [47]:
### BEGIN HIDDEN TESTS
assert countEmailsInYearBySQL(sqlContext,tableNameTest,2001) == 250, "Wrong in function countEmailsInYearBySQL"
assert countEmailsInYearBySQL(sqlContext,tableNameTest,1) == 0, "Wrong in function countEmailsInYearBySQL"
### END HIDDEN TESTS

In [48]:
### BEGIN HIDDEN TESTS
assert countEmailsInYearByDFOperations(dfEmaisTest,2001) == 250, "Wrong in function countEmailsInYearByDFOperations"
assert countEmailsInYearByDFOperations(dfEmaisTest,1) == 0, "Wrong in function countEmailsInYearByDFOperations"
### END HIDDEN TESTS

In [49]:
# 1đ
def countEmailsPerDayBySQL(sqlContext: SQLContext, tableName: str) -> DataFrame:
    '''
    Count the emails per day with a SQL GROUP BY on the DateTime column,
    ordered by the number of emails in descending order.

    Args:
    - sqlContext: The SQLContext used to run the query.
    - tableName: The name of the table that contains the email data.

    Returns:
    - result: A DataFrame with one row per day, sorted by count descending.
              Columns:
                  Date  - to_date(DateTime), the day
                  count - number of emails on that day

    Raises:
    - AssertionError: If an argument has the wrong type.
    '''
    ### BEGIN SOLUTION
    assert isinstance(sqlContext, SQLContext), f"sqlContext is not the correct data type, expected SQLContext but got {type(sqlContext)}"
    assert isinstance(tableName, str), f"tableName is not the correct data type, expected string but got {type(tableName)}"

    # Group by calendar day and order the busiest days first.
    perDayQuery = f"""
        SELECT to_date(DateTime) as Date, COUNT(*) as count
        FROM {tableName}
        GROUP BY to_date(DateTime)
        ORDER BY count DESC
    """
    result = sqlContext.sql(perDayQuery)
    ### END SOLUTION
    return result

def countEmailsPerDayByDFOperations(dataFrame: DataFrame) -> DataFrame:
    '''
    Count the emails per day by grouping on the DateTime column with DataFrame
    operations, ordered by the number of emails in descending order.

    Args:
    - dataFrame: A DataFrame containing the email data (with a DateTime column).

    Returns:
    - result: A DataFrame with one row per day, sorted by count descending.
              Columns:
                  Date  - the DateTime value of the group
                  count - number of emails on that day

    Raises:
    - AssertionError: If dataFrame is not a DataFrame.
    '''
    ### BEGIN SOLUTION
    assert isinstance(dataFrame, DataFrame), f"dataFrame is not the correct data type, expected DataFrame but got {type(dataFrame)}"

    # Group on DateTime (exposed under the name "Date") and count rows per group.
    dayColumn = col("DateTime").alias("Date")
    emailsPerDay = dataFrame.groupBy(dayColumn).count()
    # Busiest days first.
    result = emailsPerDay.orderBy(col("count").desc())
    ### END SOLUTION
    return result


In [50]:
countEmailsPerDayBySQL(sqlContext,tableName).show()

+----------+-----+
|      Date|count|
+----------+-----+
|2000-12-13|  140|
|2000-12-12|  128|
|2001-10-24|  111|
|2001-10-26|  108|
|2001-11-27|  107|
|2001-10-25|  105|
|2001-11-26|   99|
|2002-01-30|   95|
|2000-12-11|   95|
|2001-10-23|   94|
|2001-11-19|   93|
|2000-12-08|   91|
|2000-11-28|   91|
|2001-10-22|   88|
|2001-11-20|   88|
|2001-11-21|   88|
|2001-10-18|   86|
|2000-12-04|   84|
|2001-05-01|   83|
|2001-04-19|   83|
+----------+-----+
only showing top 20 rows



In [51]:
countEmailsPerDayByDFOperations(dfEmais).show()

+----------+-----+
|      Date|count|
+----------+-----+
|2000-12-13|  140|
|2000-12-12|  128|
|2001-10-24|  111|
|2001-10-26|  108|
|2001-11-27|  107|
|2001-10-25|  105|
|2001-11-26|   99|
|2002-01-30|   95|
|2000-12-11|   95|
|2001-10-23|   94|
|2001-11-19|   93|
|2000-12-08|   91|
|2000-11-28|   91|
|2001-10-22|   88|
|2001-11-20|   88|
|2001-11-21|   88|
|2001-10-18|   86|
|2000-12-04|   84|
|2001-05-01|   83|
|2001-04-19|   83|
+----------+-----+
only showing top 20 rows



In [52]:
### BEGIN HIDDEN TESTS
def countEmailsPerDayBySQLSolution(sqlContext: SQLContext, tableName: str) -> DataFrame:
    query = f"""
        SELECT to_date(DateTime) as Date, COUNT(*) as count
        FROM {tableName}
        GROUP BY to_date(DateTime)
        ORDER BY count DESC
    """
    result = sqlContext.sql(query)
    return result
   
assert countEmailsPerDayBySQL(sqlContext,tableNameTest).take(10)  == countEmailsPerDayBySQLSolution(sqlContext,tableNameTest).take(10), "Wrong in function countEmailsPerDayBySQL"
assert countEmailsPerDayBySQL(sqlContext,tableNameTest).take(1)  == countEmailsPerDayBySQLSolution(sqlContext,tableNameTest).take(1), "Wrong in function countEmailsPerDayBySQL"
assert countEmailsPerDayBySQL(sqlContext,tableNameTest).take(1000)  == countEmailsPerDayBySQLSolution(sqlContext,tableNameTest).take(1000), "Wrong in function countEmailsPerDayBySQL"
### END HIDDEN TESTS

In [53]:
### BEGIN HIDDEN TESTS
def countEmailsPerDayByDFOperationsSolution(dataFrame: DataFrame) -> DataFrame:
    result = dataFrame.groupBy(col("DateTime").alias("Date")).count()
    result = result.orderBy(col("count").desc())
    return result

assert countEmailsPerDayByDFOperations(dfEmaisTest).take(1) == countEmailsPerDayByDFOperationsSolution(dfEmaisTest).take(1), "Wrong in function countEmailsPerDayByDFOperations"
assert countEmailsPerDayByDFOperations(dfEmaisTest).take(5) == countEmailsPerDayByDFOperationsSolution(dfEmaisTest).take(5), "Wrong in function countEmailsPerDayByDFOperations"
assert countEmailsPerDayByDFOperations(dfEmaisTest).take(1000) == countEmailsPerDayByDFOperationsSolution(dfEmaisTest).take(1000), "Wrong in function countEmailsPerDayByDFOperations"
### END HIDDEN TESTS