In [3]:
from random import randint, choice

# Map Reduce

In the part of the assignment you are requested to use Map Reduce paradigm to solve the following exercises.

**NOTE THAT**: **A solution that does not use map reduce is not valid!**

# Exercise 1

You have a list of dictionaries, each representing a student with the following properties: a name and an array of test scores. Your task is to use map, filter, and reduce to calculate the average test score for each student, and then return a list of dictionaries containing only the students whose average score is above 90.

In [4]:
students = [
    {"name": "Alice", "scores": [95, 92, 88, 100]},
    {"name": "Bob", "scores": [78, 81, 85, 80]},
    {"name": "Charlie", "scores": [99, 91, 94, 96]},
    {"name": "Diana", "scores": [85, 87, 89, 83]}
]

In [5]:
from functools import reduce
# Step 1: Calculate average test scores using map
students_with_averages = list(map(lambda student: {
    "name": student["name"],
    "average_score": reduce(lambda x, y: x + y, student["scores"]) / len(student["scores"])
}, students))

# Step 2: Filter students with average score above 90
top_students = list(filter(lambda student: student["average_score"] > 90, students_with_averages))

# Output
print("[")
for student in top_students:
    print(f'{{"name":"{student["name"]}", "average_score":{student["average_score"]}}},')
print("]")

[
{"name":"Alice", "average_score":93.75},
{"name":"Charlie", "average_score":95.0},
]


Use `map`, `reduce` and `filter` that produce an output like:

In [6]:
[
    {"name": "Alice", "average_score":
      93.75},
    {"name": "Charlie", "average_score": 95.0}
]

[{'name': 'Alice', 'average_score': 93.75},
 {'name': 'Charlie', 'average_score': 95.0}]

### Test
Test your solution using the dataset generated by the following function.

In [7]:
def generate_random_student_dataset(num_students=50):
    names = [f"Student {i}" for i in range(1, num_students + 1)]
    dataset = [
        {
            "name": name,
            "scores": [randint(50, 100) for _ in range(randint(3, 6))]  # Random scores between 50 and 100
        }
        for name in names
    ]
    return dataset

random_student_dataset = generate_random_student_dataset(50)
random_student_dataset[:3]

[{'name': 'Student 1', 'scores': [96, 80, 73, 60]},
 {'name': 'Student 2', 'scores': [99, 75, 73, 67, 63]},
 {'name': 'Student 3', 'scores': [89, 53, 68, 59, 51]}]

In [29]:
# your code goes here
from functools import reduce
# Step 1: Calculate average test scores using map
students_with_averages = list(map(lambda student: {
    "name": student["name"],
    "average_score": reduce(lambda x, y: x + y, student["scores"]) / len(student["scores"])
}, random_student_dataset))

# Step 2: Filter students with average score above 90
top_students = list(filter(lambda student: student["average_score"] > 90, students_with_averages))

# Output
print("[")
for student in top_students[:3]:
    print(f'{{"name":"{student["name"]}", "average_score":{student["average_score"]}}},')
print("]")
random_student_dataset[:3]
print(top_students)

[
]
[]


## Exercise 2

You have a list of dictionaries, each representing a product with the following properties: name, price, and category. Using the functions `map`, `filter`, and `reduce`, calculate the average price of the products in each category and return a list of dictionaries containing only the categories where the average price exceeds 50.

Example input:

In [12]:
products = [
    {"name": "Product A", "price": 60, "category": "Electronics"},
    {"name": "Product B", "price": 40, "category": "Electronics"},
    {"name": "Product C", "price": 70, "category": "Home"},
    {"name": "Product D", "price": 30, "category": "Home"},
    {"name": "Product E", "price": 90, "category": "Sports"}
]

Use `map`, `reduce` and `filter` that produce an output like:

In [13]:
[
    {"category": "Electronics", "average_price": 50.0},
    {"category": "Sports", "average_price": 90.0}
]
# your code goes here
from functools import reduce
# Step 1: Calculate average price using map
categories={}
for product in products:
    if product["category"] in categories:
        categories[product["category"]].append(product["price"])
    else:
        categories[product["category"]]=[product["price"]]
prods_averages = list(map(lambda category: {
    "category": category,
    "average_price": reduce(lambda x, y: x + y, categories[category]) / len(categories[category])
}, categories))

# Step 2: Filter prices with average  above 50
filtered_products = list(filter(lambda product: product["average_price"] > 50, prods_averages))

# Output
print("[")
for product in filtered_products:
    print(f'{{"category":"{product["category"]}", "average_price":{product["average_price"]}}},')
print("]")


[
{"category":"Sports", "average_price":90.0},
]


### Test
Test your solution using the dataset generated by the following function.

In [39]:
def generate_random_product_dataset(num_products=100):
    categories = ["Electronics", "Home", "Sports", "Books", "Clothing", "Toys"]
    dataset = [
        {
            "name": f"Product {i}",
            "price": randint(10, 200),  # Random price between 10 and 200
            "category": choice(categories),  # Randomly choose a category
        }
        for i in range(1, num_products + 1)
    ]
    return dataset

# Example of using the function
random_dataset = generate_random_product_dataset(100)
random_dataset[:5]  # Display the first 5 entries to check the dataset structure


[{'name': 'Product 1', 'price': 78, 'category': 'Toys'},
 {'name': 'Product 2', 'price': 187, 'category': 'Books'},
 {'name': 'Product 3', 'price': 166, 'category': 'Clothing'},
 {'name': 'Product 4', 'price': 111, 'category': 'Books'},
 {'name': 'Product 5', 'price': 10, 'category': 'Books'}]

In [25]:
# your code goes here
# hints: 1) Group products by category (you don't need to use map reduce for this part), then 2) use map reduce paradigm to
# calculate the average price for each category and filter categories with an average price > 50
from functools import reduce
# Step 1: Calculate average price using map
categories={}
for product in random_dataset:
    if product["category"] in categories:
        categories[product["category"]].append(product["price"])
    else:
        categories[product["category"]]=[product["price"]]
prods_averages = list(map(lambda category: {
    "category": category,
    "average_price": reduce(lambda x, y: x + y, categories[category]) / len(categories[category])
}, categories))

# Step 2: Filter prices with average  above 50
filtered_products = list(filter(lambda product: product["average_price"] > 50, prods_averages))

# Output
print("[")
for product in filtered_products[:5]:
    print(f'{{"category":"{product["category"]}", "average_price":{product["average_price"]}}},')
print("]")



[
{"category":"Sports", "average_price":111.82608695652173},
{"category":"Toys", "average_price":101.42857142857143},
{"category":"Home", "average_price":120.8695652173913},
{"category":"Electronics", "average_price":96.38888888888889},
{"category":"Books", "average_price":77.6},
]


# Exercise 3

You have a list of dictionaries, each representing an employee with the following properties: name, salary, and department. Your task is to use `map`, `filter`, and `reduce` to calculate the average salary for each department and return a list of dictionaries containing only the departments where the average salary is above 65,000.

**Example Input**

In [16]:
employees = [
    {"name": "John", "salary": 70000, "department": "Engineering"},
    {"name": "Jane", "salary": 75000, "department": "Engineering"},
    {"name": "Alice", "salary": 60000, "department": "HR"},
    {"name": "Bob", "salary": 68000, "department": "HR"},
    {"name": "Charlie", "salary": 90000, "department": "Marketing"},
    {"name": "Diana", "salary": 50000, "department": "Marketing"}
]

Use `map`, `reduce` and `filter` that produce an output like:

In [17]:
[
    {"department": "Engineering", "average_salary": 72500.0},
    {"department": "Marketing", "average_salary": 70000.0}
]

[{'department': 'Engineering', 'average_salary': 72500.0},
 {'department': 'Marketing', 'average_salary': 70000.0}]

In [18]:
from functools import reduce
# Step 1: Calculate average price using map
departments={}
for department in employees:
    if department ["department"] in departments:
        departments[department["department"]].append(department["salary"])
    else:
        departments[department["department"]]=[department["salary"]]
salary_averages = list(map(lambda department: {
    "department": department,
    "average_salary": reduce(lambda x, y: x + y, departments[department]) / len(departments[department])
}, departments))

# Step 2: Filter prices with average  above 50
filtered_salary = list(filter(lambda employe: employe["average_salary"] > 65000, salary_averages))

# Output
print("[")
for employe in filtered_salary:
    print(f'{{"department":"{employe["department"]}", "average_salary":{employe["average_salary"]}}},')
print("]")

[
{"department":"Engineering", "average_salary":72500.0},
{"department":"Marketing", "average_salary":70000.0},
]


### Test

Test your solution using the dataset generated by the following function.

In [19]:
def generate_random_employee_dataset(num_employees=50):
    departments = ["Engineering", "HR", "Marketing", "Sales", "Finance", "IT"]
    dataset = [
        {
            "name": f"Employee {i}",
            "salary": randint(40000, 120000),  # Random salary between 40,000 and 120,000
            "department": choice(departments)  # Randomly choose a department
        }
        for i in range(1, num_employees + 1)
    ]
    return dataset

random_employee_dataset = generate_random_employee_dataset(50)

random_employee_dataset[:3]  # Display the first 3 entries of each dataset for checking


[{'name': 'Employee 1', 'salary': 72431, 'department': 'Sales'},
 {'name': 'Employee 2', 'salary': 77878, 'department': 'Engineering'},
 {'name': 'Employee 3', 'salary': 69207, 'department': 'Sales'}]

In [24]:
# your code goes here
# hints: 1) Group employees' salaries by department (you don't need to use map reduce for this part), then 2) use map reduce paradigm to
# calculate the average salary for each department and filter departments with an average salary > threshold
from functools import reduce
# Step 1: Calculate average price using map
departments={}
for department in random_employee_dataset:
    if department ["department"] in departments:
        departments[department["department"]].append(department["salary"])
    else:
        departments[department["department"]]=[department["salary"]]
salary_averages = list(map(lambda department: {
    "department": department,
    "average_salary": reduce(lambda x, y: x + y, departments[department]) / len(departments[department])
}, departments))

# Step 2: Filter prices with average  above 50
filtered_salary = list(filter(lambda employe: employe["average_salary"] > 65000, salary_averages))

# Output
print("[")
for employe in filtered_salary[:3]:
    print(f'{{"department":"{employe["department"]}", "average_salary":{employe["average_salary"]}}},')
print("]")


[
{"department":"Sales", "average_salary":85545.0},
{"department":"Engineering", "average_salary":89312.0},
{"department":"Marketing", "average_salary":75185.55555555556},
]


# Biopython

Write the following five functions to analyze global alignments between two sequences using Biopython's `pairwise2` module:

1. **countMatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment (pairwise2.globalxx) of the same length. It returns the number of positions where the elements of both sequences match.

2. **countMismatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of positions where the elements of the two sequences are different (i.e., they are not gaps, and the characters do not match).

3. **countGapOpens(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap openings in the alignment (a gap is opened when a '-' appears in the sequence).

4. **countGapExtensions(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap extensions (where '-' continues in the alignment after an initial gap is opened).

5. **getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment and returns the alignment score based on the provided scoring scheme: `matchScore` for matches, `mismatchPenalty` for mismatches, `gapOpenPenalty` for opening a gap, and `gapExtensionPenalty` for extending a gap.

In [30]:
# Add your functions here
from Bio import pairwise2



In [31]:
#for test
from Bio import SeqIO
def read_fasta(filename):
    sequences=[]
    for record in SeqIO.parse(filename,"fasta"):
        sequences.append(str(record.seq))
    return sequences

In [32]:
def countMatches(s1,s2):
    alignments=pairwise2.align.globalxx(s1,s2, one_alignment_only=True)
    aligned_s1, aligned_s2,_,_, _=alignments[0]
    matches=sum(1 for a, b in zip(aligned_s1, aligned_s2)if a==b)
    return matches

In [33]:
def countMismatches(s1,s2):
    alignments=pairwise2.align.globalxx(s1,s2, one_alignment_only=True)
    aligned_s1, aligned_s2,_,_, _=alignments[0]
    mismatches=sum(1 for a, b in zip(aligned_s1, aligned_s2)if a!=b and a!='-'and b!='-')
    return mismatches

In [34]:
def countGapOpens(s1,s2):
    alignments=pairwise2.align.globalxx(s1,s2, one_alignment_only=True)
    aligned_s1, aligned_s2,_,_, _=alignments[0]
    gap_opens=0
    gap_o_s1=False
    gap_o_s2=False
    for a,b in zip(aligned_s1, aligned_s2):#check for gaps in s1 and s2
        if a =='-' and not gap_o_s1:
            gap_opens+=1
            gap_o_s1=True#a gap is found
        elif a !='-':
            gap_o_s1=False#reset gap traxker because no gap is found
        if b =='-' and not gap_o_s2:
            gap_opens+=1
            gap_o_s2=True#a gap is found
        elif b !='-':
            gap_o_s2=False#reset gap traxker because no gap is found
    return gap_opens


In [35]:
def countGapExtensions(s1,s2):
    alignments=pairwise2.align.globalxx(s1,s2, one_alignment_only=True)
    aligned_s1, aligned_s2,_,_, _=alignments[0]
    gap_extension=0
    gap_o_s1=False
    gap_o_s2=False
    for a,b in zip(aligned_s1, aligned_s2):#check for gaps in s1 and s2
        if a =='-':#so if gap exists
            if gap_o_s1:#the gap is open, so it's an extension 
                gap_extension+=1
            gap_o_s1=True#keep gap open
        else:
            gap_o_s1=False#reset when no gap
        if b =='-':#so if gap exists
            if gap_o_s2:#the gap is open, so it's an extension 
                gap_extension+=1
            gap_o_s2=True#keep gap open
        else:
            gap_o_s2=False#reset when no gap
    return gap_extension
    

In [36]:
def getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty):
    alignments=pairwise2.align.globalms(
        s1,s2,
        matchScore,
        mismatchPenalty,
        gapOpenPenalty, 
        gapExtensionPenalty,
        one_alignment_only=True 
    )
    alignment_score=alignments[0].score
    return alignment_score

### Test
Align the sequences of the [Interleukin-12](https://en.wikipedia.org/wiki/Interleukin_12) chain A (denoted as `s1`) from the file [`IL12A.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12A.fasta) and the Interleukin-12 chain B (denoted as `s2`) from the file [`IL12B.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12B.fasta) and check the score as computed from pairwise2 and from your functions.

In [37]:
s1=read_fasta("IL12A (1).fasta")[0]
s2=read_fasta("IL12B.fasta")[0]

In [38]:
alignments=pairwise2.align.globalms(s1,s2,1,-1,-0.5,-0.2, one_alignment_only=True)
aligned_s1, aligned_s2, score, start, end=alignments[0]
print("alignment score:", score)
custom_score=getScore(s1,s2,1,-1,-0.5,-0.2)
print("custom alignment score:", custom_score)
matches=countMatches(s1,s2)
mismatches=countMismatches(s1,s2)
gap_opens=countGapOpens(s1,s2)
gap_extensions=countGapExtensions(s1,s2)
print("matches:",matches)
print("mismatches:", mismatches)
print("gap opens:", gap_opens)
print("gap extensions:", gap_extensions)

alignment score: 3.3999999999999844
custom alignment score: 3.3999999999999844
matches: 103
mismatches: 0
gap opens: 117
gap extensions: 224


In [None]:
# add the output of the test here
'''alignment score from pairwise2: -236.0
custom alignment score: -236.0
matches: 103
mismatches: 0
gap opens: 117
gap extensions: 224'''


SyntaxError: invalid syntax (1618931976.py, line 2)