![Lancaster University](https://www.lancaster.ac.uk/media/lancaster-university/content-assets/images/fst/logos/SCC-Logo.svg)

# SCC.454: Large Scale Platforms for AI and Data Analysis
## Practice Quiz — Answer Key

**⚠️ FOR INSTRUCTOR USE ONLY**

---


---
# Section A: Python, NumPy, Pandas & Scikit-learn (30 marks)
---


## Question 1 — NumPy Array Operations [10 marks]


In [None]:
# Q1 — ANSWERS
import numpy as np

# (a) Build the 3x4 integer matrix M, then report its shape and dtype [2 marks]
M = np.array([4, 12, 7, 3, 8, 5, 14, 10, 6, 11, 2, 9]).reshape(3, 4)

print("Matrix M:", M, sep="\n")
print(f"Shape: {M.shape}")  # (3, 4)
print(f"Data type: {M.dtype}")  # NumPy's default integer type (typically int64)


In [None]:
# (b) Row index 1, column index 2, and the single entry at [1, 2] [2 marks]
print(
    f"Second row (index 1): {M[1]}",
    f"Third column (index 2): {M[:, 2]}",
    f"Element at [1,2]: {M[1, 2]}",
    sep="\n",
)


In [None]:
# (c) Per-row totals (reduce along axis 1) and per-column means (reduce along axis 0) [3 marks]
row_sums = M.sum(axis=1)
col_means = M.mean(axis=0)

print(
    f"Sum of each row: {row_sums}",
    f"Mean of each column: {col_means}",
    sep="\n",
)


In [None]:
# (d) List the entries above 7, then zero them out in a separate array [3 marks]
elements_gt_7 = M[M > 7]
print(f"Elements greater than 7: {elements_gt_7}")

# np.where builds a new array, so M itself is left unchanged
M_copy = np.where(M > 7, 0, M)
print("Matrix with elements > 7 replaced by 0:", M_copy, sep="\n")


## Question 2 — Pandas Data Manipulation [10 marks]


In [None]:
# Q2 — ANSWERS
import pandas as pd
import numpy as np

# (a) Create DataFrame and print info [2 marks]
# One list per column; the single np.nan in 'quantity' is the missing value
# that part (b) fills in.
data = {
    'order_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
    'product': ['Laptop', 'Mouse', 'Notebook', 'Keyboard', 'Pen Set', 'Monitor', 'Stapler', 'Headphones'],
    'category': ['Electronics', 'Electronics', 'Stationery', 'Electronics', 'Stationery', 'Electronics', 'Stationery', 'Electronics'],
    'price': [999.99, 29.99, 5.99, 79.99, 12.99, 349.99, 8.99, 149.99],
    'quantity': [1, 3, 10, 2, 5, 1, np.nan, 2],
    'date': ['2025-03-01', '2025-03-01', '2025-03-02', '2025-03-02', '2025-03-03', '2025-03-03', '2025-03-04', '2025-03-04']
}

df = pd.DataFrame(data)
print("DataFrame:")
print(df)
print("\nDataFrame Info:")
# DataFrame.info() writes its summary to stdout and returns None, so it must
# not be wrapped in print() (that would emit an extra "None" line).
df.info()


In [None]:
# (b) Replace the missing quantity with the column median [2 marks]
median_qty = df['quantity'].median()  # median() skips NaN by default
print(f"Median quantity: {median_qty}")

# Overwrite only the rows whose quantity is missing
df.loc[df['quantity'].isna(), 'quantity'] = median_qty
print("\nDataFrame after filling missing quantity:", df, sep="\n")


In [None]:
# (c) total = price x quantity, then keep only rows whose total exceeds 100 [3 marks]
df['total'] = df['price'].mul(df['quantity'])
print("DataFrame with total column:", df, sep="\n")

print("\nRows where total > 100:", df.loc[df['total'].gt(100)], sep="\n")


In [None]:
# (d) Per-category revenue and order count, highest revenue first [3 marks]
category_stats = (
    df.groupby('category')
    .agg(
        total_revenue=('total', 'sum'),
        num_orders=('order_id', 'count'),
    )
    .sort_values('total_revenue', ascending=False)
)

print("Revenue and order count by category:", category_stats, sep="\n")


## Question 3 — Scikit-learn Classification [10 marks]


In [None]:
# Q3 — ANSWERS
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

# Load the Iris features and class labels
iris = load_iris()
X, y = iris.data, iris.target

# (a) 80/20 split; stratify=y keeps the class proportions in both parts [2 marks]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


In [None]:
# (b) Learn scaling statistics from the training data only, then apply them to both sets [2 marks]
scaler = StandardScaler().fit(X_train)          # statistics come from training rows only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)        # test set reuses the training statistics

print(
    f"X_train_scaled mean (should be ~0): {X_train_scaled.mean(axis=0)}",
    f"X_train_scaled std (should be ~1): {X_train_scaled.std(axis=0)}",
    sep="\n",
)


In [None]:
# (c) Fit a 3-nearest-neighbours classifier and score it on the held-out set [3 marks]
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on the scaled test features and compare against the true labels
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"KNN Test Accuracy: {accuracy:.4f}")


In [None]:
# (d) Confusion matrix, then the classification report labelled by species name [3 marks]
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=iris.target_names),
    sep="\n",
)


---
# Section B: Apache Spark — RDDs, DataFrames & SQL (35 marks)
---


In [None]:
# === SETUP: Install PySpark and Java ===
# IPython shell magics ("!"): this cell only runs inside a notebook kernel,
# and the apt-get line presumably targets a Debian/Ubuntu host such as Colab.
# NOTE(review): the recorded output of the SparkSession cell in this notebook
# reports "Spark version: 4.0.2", which does not match the 3.5.0 pin below —
# confirm which PySpark version (and matching JDK) the course intends.
!pip install pyspark==3.5.0 -q
# All apt-get output (including errors) is discarded by the redirect.
!apt-get install openjdk-11-jdk-headless -qq > /dev/null 2>&1

import os
# Point the JVM launcher at the JDK installed above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Printed unconditionally: this message does not verify that either install succeeded.
print("PySpark and Java installed successfully!")


In [2]:
# === SETUP: Create SparkSession ===
from pyspark.sql import SparkSession

# Build (or reuse) the session used by every Spark question below
spark = (
    SparkSession.builder
    .appName("SCC454-Practice-Answers")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# The SparkContext is the entry point for the RDD API used in Question 4
sc = spark.sparkContext
print(f"Spark version: {spark.version}")


Spark version: 4.0.2


## Question 4 — RDD Transformations and Actions [12 marks]


In [None]:
# Q4 — ANSWERS

# Setup: five short sentences distributed over two partitions
sentences = [
    "Apache Spark is fast",
    "Spark is used for big data",
    "Big data processing is important",
    "Spark and Hadoop are popular",
    "Data science uses Spark",
]

sentences_rdd = sc.parallelize(sentences, numSlices=2)
print(f"RDD created with {sentences_rdd.count()} sentences")


In [None]:
# (a) Lowercase each sentence, split on whitespace, flatten into one RDD of words [3 marks]
words_rdd = sentences_rdd.flatMap(lambda sentence: sentence.lower().split())
all_words = words_rdd.collect()  # small dataset, so collecting to the driver is fine

print(
    f"Total number of words: {len(all_words)}",
    f"Words: {all_words}",
    sep="\n",
)


In [None]:
# (b) Classic word count: pair every word with 1, then sum the 1s per word [3 marks]
word_counts = (
    words_rdd
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

print("Word counts:")
for word, count in word_counts.collect():
    print(f"  {word}: {count}")


In [None]:
# (c) Top 5 most frequent words [3 marks]
# Sort by descending count and break ties alphabetically. Sorting on the count
# alone leaves the order of equally frequent words unspecified, so the words
# shown for tied counts could change from run to run.
top_5 = (
    word_counts
    .sortBy(lambda pair: (-pair[1], pair[0]))
    .take(5)
)

print("Top 5 most frequent words:")
for word, count in top_5:
    print(f"  {word}: {count}")


In [None]:
# (d) Keep words that contain the letter 'a', then de-duplicate before collecting [3 marks]
words_with_a = words_rdd.filter(lambda token: 'a' in token)
words_with_a_list = words_with_a.distinct().collect()

print(
    f"Number of unique words containing 'a': {len(words_with_a_list)}",
    f"Words: {words_with_a_list}",
    sep="\n",
)


## Question 5 — Spark DataFrame Operations [12 marks]


In [3]:
# Q5 — ANSWERS
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, when, avg, round as spark_round

# Setup: one row per (student, subject, semester) with an integer score
grades_data = [
    ("S001", "Alice", "Maths", 85, "Fall"),
    ("S001", "Alice", "Physics", 78, "Fall"),
    ("S002", "Bob", "Maths", 92, "Fall"),
    ("S002", "Bob", "Physics", 88, "Fall"),
    ("S003", "Carol", "Maths", 76, "Fall"),
    ("S003", "Carol", "Physics", 82, "Fall"),
    ("S001", "Alice", "Maths", 88, "Spring"),
    ("S001", "Alice", "Physics", 84, "Spring"),
    ("S002", "Bob", "Maths", 90, "Spring"),
    ("S002", "Bob", "Physics", 91, "Spring"),
]

# Explicit schema: every column is nullable; only 'score' is numeric
grades_schema = (
    StructType()
    .add("student_id", StringType(), True)
    .add("name", StringType(), True)
    .add("subject", StringType(), True)
    .add("score", IntegerType(), True)
    .add("semester", StringType(), True)
)

grades_df = spark.createDataFrame(grades_data, schema=grades_schema)
print("Grades DataFrame:")
grades_df.show()


Grades DataFrame:
+----------+-----+-------+-----+--------+
|student_id| name|subject|score|semester|
+----------+-----+-------+-----+--------+
|      S001|Alice|  Maths|   85|    Fall|
|      S001|Alice|Physics|   78|    Fall|
|      S002|  Bob|  Maths|   92|    Fall|
|      S002|  Bob|Physics|   88|    Fall|
|      S003|Carol|  Maths|   76|    Fall|
|      S003|Carol|Physics|   82|    Fall|
|      S001|Alice|  Maths|   88|  Spring|
|      S001|Alice|Physics|   84|  Spring|
|      S002|  Bob|  Maths|   90|  Spring|
|      S002|  Bob|Physics|   91|  Spring|
+----------+-----+-------+-----+--------+



In [None]:
# (a) Project name/subject/score and keep rows scoring at least 85 [3 marks]
result_a = (
    grades_df
    .select("name", "subject", "score")
    .where(col("score") >= 85)
)

print("Rows with score >= 85:")
result_a.show()


In [None]:
# (b) Append a letter grade: A for 90+, B for 80-89, C otherwise [3 marks]
# The when() branches are evaluated in order, so the >= 80 test only sees scores below 90.
grades_with_letter = grades_df.select(
    "*",
    when(col("score") >= 90, "A")
    .when(col("score") >= 80, "B")
    .otherwise("C")
    .alias("grade"),
)

print("DataFrame with grade column:")
grades_with_letter.show()


In [None]:
# (c) Mean score per student name, rounded to 2 d.p., best average first [3 marks]
avg_per_student = (
    grades_df
    .groupBy("name")
    .agg(spark_round(avg("score"), 2).alias("avg_score"))
    .orderBy("avg_score", ascending=False)
)

print("Average score per student:")
avg_per_student.show()


In [None]:
# (d) Mean score for each (semester, subject) pair, rounded to 2 d.p. [3 marks]
avg_per_subject_semester = (
    grades_df
    .groupBy("semester", "subject")
    .agg(spark_round(avg("score"), 2).alias("avg_score"))
    .sort("semester", "subject")
)

print("Average score per subject per semester:")
avg_per_subject_semester.show()


## Question 6 — Spark SQL [11 marks]


In [4]:
# Q6 — ANSWERS

# Register view
# Exposes grades_df to Spark SQL under the table name "grades"; the later
# Question 6 queries also read from this view.
grades_df.createOrReplaceTempView("grades")

# (a) Students scoring above 85 in Maths [3 marks]
# Strictly greater than 85, so a score of exactly 85 is excluded.
# Note: this rebinds result_a, which the Question 5(a) cell also assigns.
result_a = spark.sql("""
    SELECT name, score, semester
    FROM grades
    WHERE subject = 'Maths' AND score > 85
""")
print("Students with Maths score > 85:")
result_a.show()


Students with Maths score > 85:
+-----+-----+--------+
| name|score|semester|
+-----+-----+--------+
|  Bob|   92|    Fall|
|Alice|   88|  Spring|
|  Bob|   90|  Spring|
+-----+-----+--------+



In [None]:
# (b) Average score per subject [3 marks]
# ORDER BY makes the row order deterministic; GROUP BY alone does not
# guarantee any particular ordering of the result rows.
result_b = spark.sql("""
    SELECT subject, ROUND(AVG(score), 2) as avg_score
    FROM grades
    GROUP BY subject
    ORDER BY subject
""")
print("Average score per subject:")
result_b.show()


In [5]:
# (c) Highest score per student [5 marks]
# Groups on the name column (not student_id) and takes the maximum score
# across all subjects and semesters; rows are listed best score first.
result_c = spark.sql("""
    SELECT name, MAX(score) as max_score
    FROM grades
    GROUP BY name
    ORDER BY max_score DESC
""")
print("Highest score per student:")
result_c.show()


Highest score per student:
+-----+---------+
| name|max_score|
+-----+---------+
|  Bob|       92|
|Alice|       88|
|Carol|       82|
+-----+---------+



---
# Section C: Data Preprocessing & Similarity Search (35 marks)
---


## Question 7 — Text Preprocessing & Regular Expressions [12 marks]


In [None]:
# Q7 — ANSWERS
from pyspark.sql.functions import regexp_extract, regexp_replace, lower, col
from pyspark.sql.types import DoubleType

# Setup: each row holds an id and one free-text line with name, price and stock
product_data = [
    (1, "Product: LAPTOP-2025 | Price: $999.99 | Stock: 50"),
    (2, "Product: mouse-2024 | Price: $29.50 | Stock: 200"),
    (3, "Product: KEYBOARD-2025 | Price: $79.00 | Stock: 75"),
    (4, "Product: Monitor-2023 | Price: $349.99 | Stock: 30"),
    (5, "Product: HEADSET-2025 | Price: $149.00 | Stock: 100"),
]

products_df = spark.createDataFrame(product_data, schema=["id", "raw_text"])
print("Original DataFrame:")
products_df.show(truncate=False)  # truncate=False so the full raw_text is visible


In [None]:
# (a) Pull the product name out of raw_text [3 marks]
# Capture group 1 is the run of letters, digits and hyphens after "Product:"
df_with_name = products_df.select(
    "*",
    regexp_extract(col("raw_text"), r"Product:\s*([A-Za-z0-9\-]+)", 1).alias("product_name"),
)

print("With product_name extracted:")
df_with_name.select("id", "product_name").show()


In [None]:
# (b) Capture the digits and dots after "Price: $" and convert them to a double [3 marks]
df_with_price = df_with_name.select(
    "*",
    regexp_extract(col("raw_text"), r"Price:\s*\$([0-9.]+)", 1).cast("double").alias("price"),
)

print("With price extracted:")
df_with_price.select("id", "product_name", "price").show()


In [None]:
# (c) Lowercase product name and remove year [3 marks]
# The pattern is anchored with "$" so only a trailing "-YYYY" suffix is removed.
# Unanchored, "-\d{4}" would also strip a hyphen plus four digits from the
# middle of a name (e.g. "usb-12345-2025" would become "usb5").
df_cleaned = df_with_price \
    .withColumn("product_name_lower", lower(col("product_name"))) \
    .withColumn("product_name_clean", regexp_replace(col("product_name_lower"), r"-\d{4}$", ""))

print("With cleaned product name (lowercase, no year):")
df_cleaned.select("id", "product_name", "product_name_clean").show()


In [None]:
# (d) Filter products from 2025 [3 marks]
# rlike() does a "find anywhere" match, so the pattern is anchored to the
# "-2025" year suffix. A bare "2025" would also match names that merely
# contain those digits (e.g. "X2025-2023").
df_2025 = df_with_name.filter(col("product_name").rlike(r"-2025$"))

print("Products from 2025:")
df_2025.select("id", "product_name").show()


## Question 8 — Shingling & Jaccard Similarity [12 marks]


In [None]:
# Q8 — ANSWERS

# Three short documents: A and B differ only in the last word, C shares little with either
doc_a, doc_b, doc_c = (
    "the cat sat on the mat",
    "the cat sat on the hat",
    "the dog ran in the park",
)

# (a) Word shingles function, apply with n=2 [3 marks]
def word_shingles(text, n):
    """Return the set of word-level n-grams (shingles) found in *text*.

    The text is lowercased and split on whitespace; each shingle is *n*
    consecutive words joined by a single space. A text with fewer than
    *n* words yields an empty set.
    """
    tokens = text.lower().split()
    window_count = len(tokens) - n + 1
    return {' '.join(tokens[start:start + n]) for start in range(window_count)}

# Word bigrams (n=2) for each of the three documents
shingles_a, shingles_b, shingles_c = (
    word_shingles(document, 2) for document in (doc_a, doc_b, doc_c)
)

print(
    f"Doc A shingles: {shingles_a}",
    f"Doc B shingles: {shingles_b}",
    f"Doc C shingles: {shingles_c}",
    sep="\n",
)


In [None]:
# (b) Jaccard similarity function and compute for all pairs [3 marks]
def jaccard_similarity(set_a, set_b):
    """Return |A ∩ B| / |A ∪ B| for two sets.

    If either set is empty the similarity is defined here as 0.0.
    """
    if set_a and set_b:
        shared = set_a & set_b
        combined = set_a | set_b
        return len(shared) / len(combined)
    return 0.0

# Pairwise Jaccard similarity over the three shingle sets
sim_ab, sim_ac, sim_bc = (
    jaccard_similarity(shingles_a, shingles_b),
    jaccard_similarity(shingles_a, shingles_c),
    jaccard_similarity(shingles_b, shingles_c),
)

print(
    f"Jaccard(A, B): {sim_ab:.4f}",
    f"Jaccard(A, C): {sim_ac:.4f}",
    f"Jaccard(B, C): {sim_bc:.4f}",
    sep="\n",
)


In [None]:
# (c) State the most and least similar pairs using the similarities computed in (b) [2 marks]
print(
    f"Most similar pair: (A, B) with Jaccard = {sim_ab:.4f}",
    f"Least similar pair: (A, C) and (B, C) both with Jaccard = {sim_ac:.4f}",
    sep="\n",
)


In [None]:
# (d) Simple MinHash function and comparison [4 marks]
def simple_minhash(shingle_set, num_hashes=50):
    """Compute a MinHash signature using salted, deterministic hash functions.

    Hash function ``i`` maps a shingle to a 32-bit integer derived from a
    BLAKE2b digest of ``"{i}_{shingle}"``. The built-in ``hash()`` is not used
    because string hashing is randomised per Python process, which would make
    the signature (and any printed estimate) differ from run to run.

    Args:
        shingle_set: Iterable of string shingles.
        num_hashes: Number of hash functions, i.e. the signature length.

    Returns:
        A list of ``num_hashes`` minimum hash values, each in ``[0, 2**32)``.
        For an empty ``shingle_set`` every entry is ``float('inf')``.
    """
    import hashlib  # local import so the function is self-contained

    def _salted_hash(salt, shingle):
        # 4-byte digest -> integer in [0, 2**32)
        digest = hashlib.blake2b(
            f"{salt}_{shingle}".encode("utf-8"), digest_size=4
        ).digest()
        return int.from_bytes(digest, "big")

    return [
        min((_salted_hash(i, shingle) for shingle in shingle_set), default=float('inf'))
        for i in range(num_hashes)
    ]

def estimate_jaccard_from_signatures(sig_a, sig_b):
    """Estimate Jaccard similarity as the fraction of matching signature positions.

    Args:
        sig_a: MinHash signature of the first set.
        sig_b: MinHash signature of the second set; must be the same length.

    Returns:
        The proportion of positions at which the two signatures agree,
        or 0.0 when both signatures are empty.

    Raises:
        ValueError: If the signatures have different lengths (they would not
            come from the same family of hash functions, and zip() would
            otherwise silently ignore the extra positions).
    """
    if len(sig_a) != len(sig_b):
        raise ValueError(
            f"Signature lengths differ: {len(sig_a)} != {len(sig_b)}"
        )
    if len(sig_a) == 0:
        # Avoid dividing by zero: no evidence of similarity
        return 0.0
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return matches / len(sig_a)

# Signatures for documents A and B, both built with 50 hash functions
sig_a, sig_b = (
    simple_minhash(shingles, num_hashes=50) for shingles in (shingles_a, shingles_b)
)

# Exact Jaccard versus the MinHash approximation
true_sim = jaccard_similarity(shingles_a, shingles_b)
estimated_sim = estimate_jaccard_from_signatures(sig_a, sig_b)

print(
    f"True Jaccard(A, B): {true_sim:.4f}",
    f"Estimated Jaccard(A, B) from MinHash: {estimated_sim:.4f}",
    f"Estimation error: {abs(true_sim - estimated_sim):.4f}",
    sep="\n",
)


## Question 9 — LSH with Spark ML [11 marks]


In [None]:
# Q9 — ANSWERS
from pyspark.ml.feature import Tokenizer, CountVectorizer, MinHashLSH
from pyspark.sql.functions import col

# (a) Build the documents DataFrame, split text into words, encode as binary term vectors [3 marks]
docs_data = [
    ("A", "the cat sat on the mat"),
    ("B", "the cat sat on the hat"),
    ("C", "the dog ran in the park"),
]

docs_df = spark.createDataFrame(docs_data, schema=["id", "text"])

# Step 1: text -> array of words
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized_df = tokenizer.transform(docs_df)

# Step 2: words -> term vector; binary=True records presence rather than counts
cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
cv_model = cv.fit(tokenized_df)
vectorized_df = cv_model.transform(tokenized_df)

print("Schema:")
vectorized_df.printSchema()
print("\nVectorized DataFrame:")
vectorized_df.select("id", "text", "features").show(truncate=False)


In [None]:
# (b) Fit a MinHash LSH model with 3 hash tables and attach the hash values [3 marks]
minhash_lsh = MinHashLSH(numHashTables=3, inputCol="features", outputCol="hashes")

lsh_model = minhash_lsh.fit(vectorized_df)
hashed_df = lsh_model.transform(vectorized_df)

print("DataFrame with hash values:")
hashed_df.select("id", "hashes").show(truncate=False)


In [None]:
# (c) Self-join the documents, keeping pairs whose distance is below 0.6 [3 marks]
similar_pairs = lsh_model.approxSimilarityJoin(
    vectorized_df,
    vectorized_df,
    threshold=0.6,
    distCol="distance",
)

# id_a < id_b drops (X, X) self-matches and keeps one of each (X, Y)/(Y, X) pair
similar_pairs_filtered = (
    similar_pairs
    .where(col("datasetA.id") < col("datasetB.id"))
    .select(
        col("datasetA.id").alias("id_a"),
        col("datasetB.id").alias("id_b"),
        col("distance"),
    )
)

print("Similar document pairs (distance < 0.6):")
similar_pairs_filtered.show()


In [None]:
# (d) approxNearestNeighbors for document A [2 marks]
# Get feature vector for document A
doc_a_features = vectorized_df.filter(col("id") == "A").select("features").first()[0]

# Find the 2 nearest neighbours of A. The search runs over the dataset that
# contains A itself, so request 3 and then drop A (its distance to itself is 0);
# without the filter, A would be listed as its own nearest neighbour.
neighbors = lsh_model.approxNearestNeighbors(
    vectorized_df,
    doc_a_features,
    numNearestNeighbors=3
).filter(col("id") != "A")

print("Nearest neighbors of document A:")
neighbors.select("id", "text", "distCol").show()


---
## Cleanup


In [None]:
# Stop Spark session
# After this call, the spark and sc handles created in the setup cell can no
# longer be used; re-run the SparkSession cell before re-running Spark questions.
spark.stop()
print("Spark session stopped.")


---
### End of Answer Key
---
*SCC.454: Large Scale Platforms for AI and Data Analysis — Lancaster University*
