## Due date: 5/1/2025
### Nandini Doddi & Jeslyn Miranda
For your group project, you are requested to do a complete process of machine learning, which includes
Feature extraction, classification, and analysis.
You can access the data that we are working with on Canvas -> Modules =-> Subcellular Data

In [5]:
#loads raw CSVs → keep ID / Fold / Sequence → save cleaned CSVs.
import pandas as pd  # table helper

def clean_csv(src, dst, keep_cols):
    df = pd.read_csv(src, header=None)      # 1. read the CSV with no header row
    df = df[keep_cols]                      # 2. keep only the columns we list
    df.columns = ["ID", "Fold", "Sequence"] # 3. give those columns clear names
    df["ID"] = df["ID"].str.lstrip(">")     # 4. remove the '>' at the start of each ID
    df.dropna(inplace=True)                 # 5. delete rows missing any value
    df.to_csv(dst, index=False)             # 6. write the tidy table to a new file
    print(f"✓ saved {dst}   shape={df.shape}")  # 7. show where it went and its size

# clean Gram-negative (has duplicate columns, so we keep 2, 1, 5)
clean_csv("data/n-data.csv", "data/clean_gramneg.csv", [2, 1, 5])

# clean Gram-positive (no duplicates, so we keep 2, 1, 3)
clean_csv("data/g_data.csv", "data/clean_grampos.csv", [2, 1, 3])

✓ saved data/clean_gramneg.csv   shape=(1456, 3)
✓ saved data/clean_grampos.csv   shape=(523, 3)


In [6]:
def analyze_protein_data(file_path, dataset_name):
    data = pd.read_csv(file_path, header=None)

    if data.shape[1] == 6:
        data.columns = ["Fold", "Label", "ID", "SeqCol1", "Blank", "Sequence"]
    elif data.shape[1] == 4:
        data.columns = ["ID", "Label", "Extra", "Sequence"]
    else:
        raise ValueError("Unexpected columns")

    data = data.dropna(subset=["Sequence", "Label"])

    print(f"Analysis for {dataset_name} dataset:")

    # Number of proteins
    print(f"1. Number of proteins: {len(data)}")

    # Number of unique labels
    print(f"2. Number of Labels: {data['Label'].nunique()}")

    # Proteins in each class
    print("\n3. Proteins in each class:")
    print(data['Label'].value_counts())

    # Protein lengths
    data["Length"] = data["Sequence"].str.len()

    # Average length
    
    print("\n4. Average length per class:")
    print(data.groupby("Label")["Length"].mean().round(2))

    # Max and Min lengths
    print("\n5. Max and Min protein length per class:")
    print(data.groupby("Label")["Length"].agg(['max', 'min']))

# Run for both datasets
analyze_protein_data("data/n-data.csv", "Gram-negative")
print("\n")
analyze_protein_data("data/g_data.csv", "Gram-positive")

Analysis for Gram-negative dataset:
1. Number of proteins: 1456
2. Number of Labels: 8

3. Proteins in each class:
Label
Fold1    557
Fold3    410
Fold8    180
Fold4    133
Fold2    124
Fold5     32
Fold6     12
Fold7      8
Name: count, dtype: int64

4. Average length per class:
Label
Fold1    361.75
Fold2    488.10
Fold3    373.40
Fold4    472.39
Fold5    203.25
Fold6    480.33
Fold7    432.50
Fold8    360.48
Name: Length, dtype: float64

5. Max and Min protein length per class:
        max  min
Label           
Fold1  2832   50
Fold2  2249   55
Fold3  1486   67
Fold4  1849   58
Fold5   357   70
Fold6   685  249
Fold7  1486  167
Fold8  1015   82


Analysis for Gram-positive dataset:
1. Number of proteins: 523
2. Number of Labels: 4

3. Proteins in each class:
Label
Fold3    208
Fold1    174
Fold4    123
Fold2     18
Name: count, dtype: int64

4. Average length per class:
Label
Fold1    434.18
Fold2    759.22
Fold3    393.80
Fold4    410.19
Name: Length, dtype: float64

5. Max and Min

In [7]:
# Reads the cleaned CSVs, creates 20 amino-acid fraction features + Length,
# and saves the new feature tables in the data/ folder.
import pandas as pd
from collections import Counter   # for counting letters

AA = list("ACDEFGHIKLMNPQRSTVWY")  # fixed order of the 20 amino acids

# Convert ONE sequence into:
#   [Frac_A, Frac_C, …, Frac_Y, Length]
def seq_to_row(seq):
    seq   = seq.upper()
    length = len(seq)
    counts = Counter(seq)
    return [counts.get(a, 0) / length for a in AA] + [length]

# Build a full feature table from a cleaned CSV and write it out
def make_features(src, dst):
    df = pd.read_csv(src)                             # load cleaned file
    feats = df["Sequence"].apply(seq_to_row)          # features for every row
    cols = [f"Frac_{a}" for a in AA] + ["Length"]     # new column names
    out = pd.concat([df[["ID", "Fold"]],              # keep ID + label
                     pd.DataFrame(feats.tolist(), columns=cols)],
                    axis=1)
    out.to_csv(dst, index=False)                      # save result
    print(f"✓ saved {dst}  (rows={len(out)})")

# Generate features for both datasets
make_features("data/clean_gramneg.csv", "data/features_gramneg.csv")
make_features("data/clean_grampos.csv", "data/features_grampos.csv")

✓ saved data/features_gramneg.csv  (rows=1456)
✓ saved data/features_grampos.csv  (rows=523)


In [9]:
# Purpose: 1) Train a Random-Forest on the Gram-positive feature table.
#          2) Check how good it is with two tests:
#               • 5-fold cross-validation on 90 % of the data
#               • a final 10 % hold-out test the model never saw
#          3) Print the usual classification stats.

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline        import make_pipeline
from sklearn.preprocessing    import StandardScaler
from sklearn.ensemble         import RandomForestClassifier
from sklearn.metrics          import accuracy_score, balanced_accuracy_score, classification_report

# 1) LOAD the numeric features which were made using features.py

df = pd.read_csv("data/features_grampos.csv")               # 523 rows × 23 cols
X  = df[[c for c in df.columns if c.startswith("Frac_")] +  # 20 fraction columns
        ["Length"]]                                         # + the Length column
y  = df["Fold"]                                             # 4 class labels

# 2) RESERVE 10 % as a final “never-seen” test set
#    The remaining 90 % is used for cross-validation + training
X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.10, stratify=y, random_state=42)

# 3) BUILD the model pipeline
#    • StandardScaler: puts every feature on roughly the same scale
#    • RandomForest: 200 decision trees, class_weight="balanced"
model = make_pipeline(
    StandardScaler(),                      # harmless but keeps numbers nice
    RandomForestClassifier(
        n_estimators=200,                  # number of trees
        class_weight="balanced",           # helps the tiny classes
        random_state=42)
)

# 4) 5-FOLD CROSS-VALIDATION on the 90 % training portion
#    We train on 4 slices, validate on the 5th — repeat 5 times.
print("\n=== 5-fold CV ===")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for i, (tr_idx, va_idx) in enumerate(cv.split(X_tr, y_tr), start=1):
    model.fit(X_tr.iloc[tr_idx], y_tr.iloc[tr_idx])          # fit on 4/5
    preds = model.predict(X_tr.iloc[va_idx])                 # test on 1/5
    acc   = accuracy_score(y_tr.iloc[va_idx], preds)
    cv_scores.append(acc)
    print(f"Fold {i}: {acc:.3f}")

print("Average CV accuracy:", round(sum(cv_scores)/len(cv_scores), 3))

# 5) FINAL TEST — train on ALL 90 % then predict the held-out 10 %
model.fit(X_tr, y_tr)
y_hat = model.predict(X_te)

print("\n=== 10 % hold-out test ===")
print(classification_report(y_te, y_hat, digits=3))
print("Balanced accuracy:", round(balanced_accuracy_score(y_te, y_hat), 3))


=== 5-fold CV ===
Fold 1: 0.723
Fold 2: 0.713
Fold 3: 0.745
Fold 4: 0.713
Fold 5: 0.691
Average CV accuracy: 0.717

=== 10 % hold-out test ===
              precision    recall  f1-score   support

       Fold1      0.818     0.500     0.621        18
       Fold2      0.000     0.000     0.000         2
       Fold3      0.667     0.952     0.784        21
       Fold4      0.750     0.750     0.750        12

    accuracy                          0.717        53
   macro avg      0.559     0.551     0.539        53
weighted avg      0.712     0.717     0.691        53

Balanced accuracy: 0.551


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
##KNN 
from sklearn.preprocessing    import StandardScaler
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.metrics          import accuracy_score, balanced_accuracy_score, classification_report

# 1) LOAD the numeric features which were made using features.py
df = pd.read_csv("data/features_grampos.csv")
X  = df[[c for c in df.columns if c.startswith("Frac_")] + ["Length"]]  # 21 features
y  = df["Fold"]                                                         # 4 class labels

# 2) KEEP 10 % aside for a final, independent test
X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.10, stratify=y, random_state=42)

# 3) BUILD the model pipeline
#    • StandardScaler: puts every feature on the same scale
#    • KNeighborsClassifier: looks at the 3 nearest neighbours
model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3)  #change here to experiment with different values of k
)

# 4) 5-FOLD CROSS-VALIDATION on the 90 % training part
print("\n=== 5-fold CV ===")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for i, (tr_idx, va_idx) in enumerate(cv.split(X_tr, y_tr), start=1):
    model.fit(X_tr.iloc[tr_idx], y_tr.iloc[tr_idx])          # train on 4 folds
    preds = model.predict(X_tr.iloc[va_idx])                 # validate on 1 fold
    acc   = accuracy_score(y_tr.iloc[va_idx], preds)
    cv_scores.append(acc)
    print(f"Fold {i}: {acc:.3f}")

print("Average CV accuracy:", round(sum(cv_scores)/len(cv_scores), 3))

# 5) FINAL TEST — train on all 90 % and predict the held-out 10 %
model.fit(X_tr, y_tr)
y_hat = model.predict(X_te)

print("\n=== 10 % hold-out test ===")
print(classification_report(y_te, y_hat, digits=3))
print("Balanced accuracy:", round(balanced_accuracy_score(y_te, y_hat), 3))



=== 5-fold CV ===
Fold 1: 0.681
Fold 2: 0.660
Fold 3: 0.649
Fold 4: 0.596
Fold 5: 0.670
Average CV accuracy: 0.651

=== 10 % hold-out test ===
              precision    recall  f1-score   support

       Fold1      0.526     0.556     0.541        18
       Fold2      0.000     0.000     0.000         2
       Fold3      0.667     0.667     0.667        21
       Fold4      0.615     0.667     0.640        12

    accuracy                          0.604        53
   macro avg      0.452     0.472     0.462        53
weighted avg      0.582     0.604     0.593        53

Balanced accuracy: 0.472


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# For AdaBoost:
adaboost_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50)

# For Bagging:
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=50)


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'