# BIOL 5340 – Bioinformatics Final Project - PSTR
By Alessandra Rodriguez, Ruthvik Gowravaram, Anushka Pawar, Sindhura Rama, Gouri Eslavathu

In [2]:
# libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import warnings

# Suppress every warning category for the remainder of the notebook session.
# NOTE(review): this presumably also hides scikit-learn diagnostics (e.g.
# convergence or collinearity warnings) raised by the models below — confirm
# that silencing them is intended.
warnings.filterwarnings("ignore")

# Data Preparation

In [6]:
# Input files: gene-count matrix and the sample metadata table.
# NOTE(review): these are absolute, user-specific paths, so the cell only runs
# on a machine where the files exist at exactly these locations.
file_name = '/Users/alessandrarodriguez/Downloads/normalized_gene_counts_twenty_subset.csv'
meta_data = '/Users/alessandrarodriguez/Downloads/metadata.csv'

# open data into a pandas df
df = pd.read_csv(file_name)
# make row names from the first column
df = df.set_index(df.columns[0])
# transpose data, rows = samples, columns = genes
df = df.transpose()

# open meta data into a pandas df
host_data = pd.read_csv(meta_data)
# make row names from the first column
host_data = host_data.set_index(host_data.columns[0])
# keep only the relevant columns
host_data = host_data.drop(columns=host_data.columns.difference(['Host']))
# PSTR is changed to 1 and all other hosts are changed to 0
host_data['Host'] = (host_data['Host'] == 'PSTR').astype(int)

# make host a column in the data frame; pandas aligns the values on the
# row labels (sample names), so a sample missing from the metadata gets NaN
df['Host'] = host_data['Host']

# fail early with a clear message instead of passing NaN labels to the models
missing_hosts = df.index[df['Host'].isna()]
if len(missing_hosts) > 0:
    raise ValueError(
        f"No 'Host' metadata found for samples: {list(missing_hosts)}"
    )

print(df.head(2))

# identify the features as X and the target as y
X = df.drop('Host', axis=1)
y = df['Host']

# split the data into training and testing sets (80% train / 20% test)
# NOTE(review): the split is not stratified; the recorded reports show a single
# PSTR sample in the test set, so consider stratify=y for a more reliable
# evaluation (this would change the recorded results).
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

Unnamed: 0    P58905    Q9S752    O05496    Q0D5B9    Q4UK32    Q9UTR6  \
M_c1_C      1.620841  2.630026  2.091298  2.283285  2.146254  3.021643   
M_c3_C      1.620841  3.015490  1.869137  2.049852  2.397577  3.141323   

Unnamed: 0    Q80W93    Q0WR59    Q27960    Q9DCM0  ...    Q63120    C0H419  \
M_c1_C      2.491613  1.620841  1.620841  1.620841  ...  2.793084  2.283285   
M_c3_C      2.915706  2.732385  1.620841  1.620841  ...  3.299849  2.397577   

Unnamed: 0    P22944    P43317    Q11208    Q9M907    K9NVA6    Q9H0C1  \
M_c1_C      4.962174  3.795321  1.620841  2.932480  1.856850  5.624944   
M_c3_C      4.246878  4.476104  1.620841  2.996271  2.115619  6.046560   

Unnamed: 0    Q02440  Host  
M_c1_C      2.241169     0  
M_c3_C      2.434535     0  

[2 rows x 21 columns]


# Logistic Regression

### Explain each algorithm used and expected outcomes
Logistic regression is used for binary classification. The model assigns each instance a probability, and based on this probability the instance is assigned a class. In this case, it uses the genes to predict whether or not the host is PSTR.

### Explain if your algorithm is unsupervised or supervised
This is a supervised algorithm because it requires labeled training data which is the host column in this case. 

### How did you get gene numbers or latent factor information for table above?
ANSWER HERE

### How did you generate the performance errors (describe for each algorithm)?
ANSWER HERE

In [7]:
# Build a default logistic regression classifier and fit it on the whole
# training split (fit() returns the fitted estimator itself)
lr_model = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out test samples
y_pred = lr_model.predict(X_test)

# Test-set metrics; mse is computed here but not printed by this cell
mse = mean_squared_error(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the evaluation on the test set
print("\nAccuracy (Test Set):", accuracy)
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")


Accuracy (Test Set): 0.8888888888888888

Confusion Matrix:
[[8 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.00      0.00      0.00         1

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9



# Linear Discriminant Analysis

### Explain each algorithm used and expected outcomes
Linear discriminant analysis is a classification model that separates classes of data using linear combinations of features. The outcome is the set of discriminant functions that best separate the different classes.

### Explain if your algorithm is unsupervised or supervised
This is a supervised algorithm because it requires labeled training data which is the host column in this case. 

### How did you get gene numbers or latent factor information for table above?
ANSWER HERE

### How did you generate the performance errors (describe for each algorithm)?
ANSWER HERE

In [8]:
# Build a default Linear Discriminant Analysis model and fit it on the
# training split (fit() returns the fitted estimator itself)
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Keep the fitted scalings_ attribute of the model for later inspection
lda_scalings = lda.scalings_

# Class predictions for the held-out test samples
y_pred = lda.predict(X_test)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

Accuracy: 0.8889
Mean Squared Error: 0.1111

Confusion Matrix:
[[8 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.00      0.00      0.00         1

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9



# Quadratic Discriminant Analysis

### Explain each algorithm used and expected outcomes
Quadratic discriminant analysis is a classification model, similar to LDA, that separates classes of data using linear and non-linear (quadratic) combinations of features. The outcomes include assigning each observation a probability of belonging to a certain class based on the decision boundaries.

### Explain if your algorithm is unsupervised or supervised
This is a supervised algorithm because it requires labeled training data which is the host column in this case. 

### How did you get gene numbers or latent factor information for table above?
ANSWER HERE

### How did you generate the performance errors (describe for each algorithm)?
ANSWER HERE

In [9]:
# Build a Quadratic Discriminant Analysis model (store_covariance=True asks the
# estimator to keep its class covariance matrices) and fit it on the training
# split; fit() returns the fitted estimator itself
qda = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)

# Class predictions for the held-out test samples
y_pred = qda.predict(X_test)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

Accuracy: 0.8889
Mean Squared Error: 0.1111

Confusion Matrix:
[[8 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.00      0.00      0.00         1

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9



# Support Vector Machine

### Explain each algorithm used and expected outcomes
Support Vector Machine finds the best hyperplanes to differentiate classes of data. Thus the outcomes are hyperplanes that act as decision boundaries that maximize the margin between different classes.

### Explain if your algorithm is unsupervised or supervised
This is a supervised algorithm because it requires labeled training data which is the host column in this case. 

### How did you get gene numbers or latent factor information for table above?
ANSWER HERE

### How did you generate the performance errors (describe for each algorithm)?
ANSWER HERE

In [10]:
# Build a Support Vector Classification model with probability estimates
# enabled and fit it on the training split (fit() returns the fitted estimator)
svm_model = SVC(probability=True).fit(X_train, y_train)

# Class predictions for the held-out test samples
y_pred = svm_model.predict(X_test)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

Accuracy: 1.0000
Mean Squared Error: 0.0000

Confusion Matrix:
[[8 0]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         1

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



# Partial Least Squares

### Explain each algorithm used and expected outcomes
Partial least squares seeks to find the latent variables, which are complex/less direct patterns in the data, in order to classify observations. Thus the outcome is a set of components/latent variables that can be used to predict the class/host.

### Explain if your algorithm is unsupervised or supervised
This is a supervised algorithm because it requires labeled training data which is the host column in this case. 

### How did you get gene numbers or latent factor information for table above?
ANSWER HERE

### How did you generate the performance errors (describe for each algorithm)?
ANSWER HERE

In [11]:
# Build a two-component PLS regression model and fit it on the training split
# (fit() returns the fitted estimator itself)
pls_model = PLSRegression(n_components=2).fit(X_train, y_train)

# Predicted values for the held-out test samples; these are regression
# outputs, not thresholded class labels
y_pred = pls_model.predict(X_test)

# Regression-style test-set metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report the results
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")


Mean Squared Error: 0.0307
R-squared: 0.6894
