In [6]:
# ==================== PART A: MACHINE LEARNING ====================
# Feature Selection on Breast Cancer Wisconsin Dataset
# NURUL ATHIRAH SYAFIQAH BINTI MOHD RAZALI

# 1. Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2, f_classif

print("=" * 70)
print("PART A: FEATURE SELECTION - BREAST CANCER WISCONSIN DATASET")
print("=" * 70)

# 2. Load dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Create DataFrame for better visualization
df = pd.DataFrame(X, columns=cancer.feature_names)
df['target'] = y

# 3. Print dimensions
print("\n1. DATASET DIMENSIONS:")
print("-" * 40)
print(f"dimension: {X.shape}")
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

# 4. Print all features
print("\n2. FEATURES LIST:")
print("-" * 40)
print(f"{len(cancer.feature_names)} Features:")
for i, feature in enumerate(cancer.feature_names):
    print(f"  {i:2d}. {feature}")

# 5. Print malignant and benign samples
print("\n3. SAMPLE DISTRIBUTION:")
print("-" * 40)

# Look the class codes up from the dataset's own target_names instead of
# hard-coding 0/1, so the labels cannot silently get swapped.
class_names = [str(name) for name in cancer.target_names]
malignant_label = class_names.index("malignant")
benign_label = class_names.index("benign")

malignant_indices = np.flatnonzero(y == malignant_label)
benign_indices = np.flatnonzero(y == benign_label)

print(f"Malignant samples: {len(malignant_indices)}")
print(f"Benign samples: {len(benign_indices)}")

# Print IDs (optional, as in example).
# .tolist() converts to plain Python ints; list(ndarray) keeps NumPy scalars,
# which NumPy 2 prints as "np.int64(19)" instead of "19".
print(f"\nID for malignant with size {len(malignant_indices)} :")
print(malignant_indices[:20].tolist(), "...")  # Show first 20

print(f"\nID for benign with size {len(benign_indices)} :")
print(benign_indices[:20].tolist(), "...")  # Show first 20

# 6. Print first three samples
print("\n4. FIRST THREE SAMPLES WITH FEATURE VALUES:")
print("-" * 40)
print("the first three samples with their features values:")
for i in range(3):
    print(f"\nSample {i}:")
    for j, (value, feature) in enumerate(zip(X[i], cancer.feature_names)):
        # Break the line after every fifth "name: value" pair.
        if j % 5 == 0 and j > 0:
            print()
        print(f"{feature:25}: {value:.4e}", end="  ")
    print()

# 7./8. Shared helpers for the chi-squared and F-test selections
def _rank_top_features(score_func, features, labels, feature_names, k=5):
    """Fit SelectKBest with ``score_func`` and tabulate the selected features.

    Parameters:
        score_func: scoring function passed to SelectKBest (e.g. chi2, f_classif).
        features: feature matrix passed to ``fit``.
        labels: target vector passed to ``fit``.
        feature_names: sequence of names indexed by feature column.
        k: number of features to keep.

    Returns:
        (selector, selected_idx, results) where ``selector`` is the fitted
        SelectKBest, ``selected_idx`` the indices of the kept columns, and
        ``results`` a DataFrame with columns Feature_Index / Feature_Name /
        P_Value sorted by ascending p-value.
    """
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(features, labels)
    selected_idx = selector.get_support(indices=True)
    results = pd.DataFrame({
        'Feature_Index': selected_idx,
        # str() turns NumPy string scalars into plain Python strings.
        'Feature_Name': [str(feature_names[i]) for i in selected_idx],
        'P_Value': selector.pvalues_[selected_idx],
    })
    # Stable sort: rows with equal p-values keep their feature-index order.
    results = results.sort_values('P_Value', kind='stable')
    return selector, selected_idx, results


def _print_feature_table(results):
    """Print one 'index name p-value' line per row of a results table."""
    # itertuples keeps each column's own type; int() guards the ':3d' format.
    for row in results.itertuples(index=False):
        print(f"{int(row.Feature_Index):3d} {row.Feature_Name:25} {row.P_Value:.6e}")


# 7. Chi-squared test feature selection
print("\n5. CHI-SQUARED TEST FEATURE SELECTION:")
print("-" * 40)

chi2_selector, chi2_features_idx, chi2_results = _rank_top_features(
    chi2, X, y, cancer.feature_names, k=5
)
chi2_pvalues = chi2_selector.pvalues_

# NOTE(review): the "p<0.05" label below is printed as-is; the selection is
# the k best features and no explicit p-value threshold is applied here.
print("the top 5 features from p-values of chi-squared test:")
print("features p<0.05")
print("-" * 40)
_print_feature_table(chi2_results)

# 8. F-test feature selection
print("\n6. F-TEST FEATURE SELECTION:")
print("-" * 40)

f_selector, f_features_idx, f_results = _rank_top_features(
    f_classif, X, y, cancer.feature_names, k=5
)
f_pvalues = f_selector.pvalues_

print("the top 5 features from p-values of F-test:")
print("features p<0.05")
print("-" * 40)
_print_feature_table(f_results)

# 9. Find common features
print("\n7. COMMON FEATURES IN BOTH TESTS:")
print("-" * 40)

chi2_feature_names = [str(cancer.feature_names[i]) for i in chi2_features_idx]
f_feature_names = [str(cancer.feature_names[i]) for i in f_features_idx]

common_features = set(chi2_feature_names) & set(f_feature_names)

print("the same features from p-values of chi-squared test and F-test:")
if common_features:
    # Set iteration order changes between runs; list the names in
    # feature-index order so the printed output is reproducible.
    print([name for name in chi2_feature_names if name in common_features])
else:
    print("No common features found")


PART A: FEATURE SELECTION - BREAST CANCER WISCONSIN DATASET

1. DATASET DIMENSIONS:
----------------------------------------
dimension: (569, 30)
Samples: 569, Features: 30

2. FEATURES LIST:
----------------------------------------
30 Features:
   0. mean radius
   1. mean texture
   2. mean perimeter
   3. mean area
   4. mean smoothness
   5. mean compactness
   6. mean concavity
   7. mean concave points
   8. mean symmetry
   9. mean fractal dimension
  10. radius error
  11. texture error
  12. perimeter error
  13. area error
  14. smoothness error
  15. compactness error
  16. concavity error
  17. concave points error
  18. symmetry error
  19. fractal dimension error
  20. worst radius
  21. worst texture
  22. worst perimeter
  23. worst area
  24. worst smoothness
  25. worst compactness
  26. worst concavity
  27. worst concave points
  28. worst symmetry
  29. worst fractal dimension

3. SAMPLE DISTRIBUTION:
----------------------------------------
Malignant samples: 212


In [11]:
!pip install tensorflow




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Users\fyqaz\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [12]:
# ==================== PART B: DEEP LEARNING ====================
# Convolutional Neural Network (CNN) Architecture
# NURUL ATHIRAH SYAFIQAH BINTI MOHD RAZALI

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D

print("=" * 70)
print("PART B: CONVOLUTIONAL NEURAL NETWORK (CNN)")
print("=" * 70)

# Architecture settings, kept together so they are easy to find and tune.
input_shape = (300, 300, 2)            # (height, width, channels)
conv_filters = (32, 64, 96, 128, 224)  # filters of conv layers 1..5
num_classes = 5                        # units of the final softmax layer

# 1. Create sequential model
model = Sequential()

# Declare the input with an explicit Input object. Passing input_shape= to the
# first layer is the legacy form and triggers a warning in Keras 3.
model.add(keras.Input(shape=input_shape))

# 2.-6. Five convolutional layers, each followed by 2x2 max pooling
for filters in conv_filters:
    model.add(Conv2D(filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# 7. Flatten layer
model.add(Flatten())

# 8. Dropout layer
model.add(Dropout(0.5))

# 9. Dense layer with SoftMax activation
model.add(Dense(num_classes, activation='softmax'))

# 10. Show model summary
print("\nCNN MODEL SUMMARY:")
print("-" * 70)
model.summary()


PART B: CONVOLUTIONAL NEURAL NETWORK (CNN)

CNN MODEL SUMMARY:
----------------------------------------------------------------------


In [14]:
# ==================== PART C: TANDEM REPEATS ====================
# DNA Sequence Analysis - Nucleotide Counting and Microsatellite Comparison
# NURUL ATHIRAH SYAFIQAH BINTI MOHD RAZALI

print("=" * 70)
print("PART C: IDENTIFICATION OF TANDEM REPEATS")
print("=" * 70)

# DNA sequences from the problem, upper-cased on definition so that every
# later step works on a consistent alphabet.
part1, part2, part3 = (
    fragment.upper() for fragment in ("tttaga", "ttcgtg", "ttgtga")
)

# ========== TASK 1: COUNT NUCLEOTIDES ==========
def count_nucleotides(dna_sequence):
    """
    Count the number of A, C, G, T in a DNA sequence.

    The sequence is compared case-insensitively (it is upper-cased first, the
    same way count_microsatellite_repeats treats its input). Characters other
    than A, C, G and T are ignored.

    Returns a dictionary with keys 'A', 'C', 'G', 'T' (in that order) mapping
    to their counts; an empty sequence gives all zeros.
    """
    normalized = dna_sequence.upper()
    return {base: normalized.count(base) for base in "ACGT"}

print("\n1. NUCLEOTIDE COUNTS:")
print("-" * 40)

# Tally the bases of each fragment in one pass over the three parts.
counts1, counts2, counts3 = map(count_nucleotides, (part1, part2, part3))

# One report line per fragment, numbered from 1.
print("\n".join(
    f"Part {number} Counts: {tally}"
    for number, tally in enumerate((counts1, counts2, counts3), start=1)
))

# ========== TASK 2: COMPARE MICROSATELLITE PROFILES ==========
def count_microsatellite_repeats(dna_sequence, motif):
    """
    Count the longest run of back-to-back copies of a microsatellite motif.

    Both arguments are upper-cased first, so matching is case-insensitive.
    Every position of the sequence is tried as a possible start of a run, so
    a run is found even when an earlier, shorter match overlaps its start
    (e.g. "ATATAATA" contains two consecutive "ATA" starting at index 2).

    Parameters:
        dna_sequence: the sequence to search.
        motif: the repeat unit; must be non-empty.

    Returns the maximum number of consecutive repeats (0 if the motif does
    not occur).

    Raises ValueError if motif is empty, since a repeat count is undefined
    for an empty unit.
    """
    motif = motif.upper()
    dna_sequence = dna_sequence.upper()
    step = len(motif)
    if step == 0:
        raise ValueError("motif must be a non-empty string")

    max_repeats = 0
    # Positions past len - step cannot hold a full copy of the motif.
    for start in range(len(dna_sequence) - step + 1):
        current_repeats = 0
        position = start
        # Walk forward one motif length at a time while copies keep matching.
        while dna_sequence.startswith(motif, position):
            current_repeats += 1
            position += step
        if current_repeats > max_repeats:
            max_repeats = current_repeats

    return max_repeats

print("\n2. MICROSATELLITE PROFILE COMPARISON:")
print("-" * 40)

# Microsatellite motif
motif = "TT"

# Count repeats for each part
repeats1 = count_microsatellite_repeats(part1, motif)
repeats2 = count_microsatellite_repeats(part2, motif)
repeats3 = count_microsatellite_repeats(part3, motif)

print(f"Part 1 ('{part1}'): {repeats1} repeats of '{motif}'")
print(f"Part 2 ('{part2}'): {repeats2} repeats of '{motif}'")
print(f"Part 3 ('{part3}'): {repeats3} repeats of '{motif}'")

# Determine result based on conditions
print("\nMicrosatellite Profile Comparison Result:")

max_repeats = max(repeats1, repeats2, repeats3)

if max_repeats == 0:
    print(f"No profile has any repeats of {motif}.")
else:
    # Collect every profile that reaches the maximum; the message wording
    # depends only on whether there is one winner or a tie, not on how large
    # the maximum is.
    profiles_with_max = [
        label
        for label, repeats in (
            ("Part 1", repeats1),
            ("Part 2", repeats2),
            ("Part 3", repeats3),
        )
        if repeats == max_repeats
    ]

    if len(profiles_with_max) == 1:
        print(f"The profile {profiles_with_max[0]} has the most repeats of {motif}.")
    else:
        profiles_str = ", ".join(profiles_with_max)
        print(f"The profiles {profiles_str} have the same most repeats of {motif}.")


PART C: IDENTIFICATION OF TANDEM REPEATS

1. NUCLEOTIDE COUNTS:
----------------------------------------
Part 1 Counts: {'A': 2, 'C': 0, 'G': 1, 'T': 3}
Part 2 Counts: {'A': 0, 'C': 1, 'G': 2, 'T': 3}
Part 3 Counts: {'A': 1, 'C': 0, 'G': 2, 'T': 3}

2. MICROSATELLITE PROFILE COMPARISON:
----------------------------------------
Part 1 ('TTTAGA'): 1 repeats of 'TT'
Part 2 ('TTCGTG'): 1 repeats of 'TT'
Part 3 ('TTGTGA'): 1 repeats of 'TT'

Microsatellite Profile Comparison Result:
The profiles Part 1, Part 2, Part 3 have the same most repeats of TT.
