In [3]:
import pandas as pd
import os
import numpy as np
import csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, matthews_corrcoef

# Load and preprocess negative data
df = pd.read_csv('neg_mit_data.csv')

# Export sequences to a FASTA file
with open('Neg_mit_data.fasta', 'w') as fasta_file:
    for index, row in df.iterrows():
        fasta_file.write(f'>seq{index}\n{row["Sequence"]}\n')

# Run CD-HIT for negative sequences
os.system('cd-hit -i Neg_mit_data.fasta -o Neg_mit_cd-hit_data.txt -c 0.7')

# Load and preprocess positive data
data = pd.read_csv('Mitochondria.csv')
data.columns = data.columns.str.strip()

# Export sequences to a FASTA file
with open('Mitochondria_data.fasta', 'w') as fasta_file:
    for index, row in data.iterrows():
        fasta_file.write(f'>seq{index}\n{row["Sequence"]}\n')

# Run CD-HIT for positive sequences
os.system('cd-hit -i Mitochondria_data.fasta -o Mitochondria_cd-hit_data.txt -c 0.7')

# Process CD-HIT output for positive sequences
input_file = 'Mitochondria_cd-hit_data.txt'
output_file = 'pos_mit_data.csv'
sequences = []

with open(input_file, 'r') as file:
    for line in file:
        line = line.strip()
        if not line.startswith('>'):
            sequences.append([line])

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Sequence'])
    writer.writerows(sequences)

# Process CD-HIT output for negative sequences
input_file = 'Neg_mit_cd-hit_data.txt'
output_file = 'Neg_mit_data.csv'
sequences = []

with open(input_file, 'r') as file:
    for line in file:
        line = line.strip()
        if not line.startswith('>'):
            sequences.append([line])

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Sequence'])
    writer.writerows(sequences)

# Add label column
neg_df = pd.read_csv('Neg_mit_data.csv')
neg_df['Label'] = 1
neg_df.to_csv('Neg_mit_data.csv', index=False)

pos_df = pd.read_csv('pos_mit_data.csv')
pos_df['Label'] = 0
pos_df.to_csv('pos_mit_data.csv', index=False)

# Merge datasets
data1 = pd.read_csv('Neg_mit_data.csv')
data2 = pd.read_csv('pos_mit_data.csv')
merged_df = pd.concat([data1, data2], ignore_index=True)
merged_df.to_csv('merged_mit_data.csv', index=False)

# Shuffle the data
data_df = merged_df.sample(frac=1)
X = data_df['Sequence']
y = data_df['Label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the unique amino acids
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Function to calculate amino acid composition
def calculate_aa_composition(sequence):
    composition = np.zeros(len(amino_acids))
    for aa in sequence:
        if aa in amino_acids:
            index = amino_acids.index(aa)
            composition[index] += 1
    return composition / len(sequence)

# Create features based on amino acid composition
X_train_aa_composition = np.array([calculate_aa_composition(seq) for seq in X_train])
X_test_aa_composition = np.array([calculate_aa_composition(seq) for seq in X_test])

# Train the SVM model using the features based on amino acid composition
svm_model_aa = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True)
svm_model_aa.fit(X_train_aa_composition, y_train)




Program: CD-HIT, V4.8.1 (+OpenMP), Apr 07 2021, 10:57:21
Command: cd-hit -i Neg_mit_data.fasta -o
         Neg_mit_cd-hit_data.txt -c 0.7

Started: Sun Jun 23 13:58:37 2024
                            Output                              
----------------------------------------------------------------
total seq: 4097
longest and shortest : 4149 and 66
Total letters: 2378472
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 1 X 11M = 11M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 79M

Table limit with the given memory limit:
Max number of representatives: 828666
Max number of word counting entries: 90029358

comparing sequences from          0  to       4097
....
     4097  finished        678  clusters

Approximated maximum memory consumption: 83M
writing new database
writing clustering information
program completed !

Total CPU time 0.62
Program: CD-HIT, V4.8.1 (+OpenMP), Apr 07 2021, 10:57:21
Comman

In [4]:
# Evaluate the model
y_pred_aa = svm_model_aa.predict(X_test_aa_composition)
accuracy_aa = accuracy_score(y_test, y_pred_aa)
precision_aa = precision_score(y_test, y_pred_aa)
f1_aa = f1_score(y_test, y_pred_aa)
mcc_aa = matthews_corrcoef(y_test, y_pred_aa)

# Calculate confusion matrix and specificity
conf_matrix = confusion_matrix(y_test, y_pred_aa)
tn, fp, fn, tp = conf_matrix.ravel()
specificity_aa = tn / (tn + fp)

# Print the metrics
print("Accuracy based on amino acid composition:", accuracy_aa)
print("Precision based on amino acid composition:", precision_aa)
print("F1 Score based on amino acid composition:", f1_aa)
print("Matthews Correlation Coefficient (MCC) based on amino acid composition:", mcc_aa)
print("Specificity based on amino acid composition:", specificity_aa)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy based on amino acid composition: 0.8598726114649682
Precision based on amino acid composition: 0.8598726114649682
F1 Score based on amino acid composition: 0.9246575342465754
Matthews Correlation Coefficient (MCC) based on amino acid composition: 0.0
Specificity based on amino acid composition: 0.0
Confusion Matrix:
[[  0  22]
 [  0 135]]
