<a href="https://colab.research.google.com/github/odhiambk/CIS-678-Machine-Learning/blob/main/FP2a3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Loading and Preprocessing**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# Load the ADT table. The CSV's unnamed first column holds the ADT feature
# names (they become the column labels below), so give it an explicit name.
# A chained rename is used instead of inplace=True so the cell has no hidden
# mutation step.
adt_data = pd.read_csv(
    'https://raw.githubusercontent.com/odhiambk/CIS-678-Machine-Learning/main/training_set_adt.csv'
).rename(columns={'Unnamed: 0': 'feature_name'})

# Load the RNA table.
rna_data = pd.read_csv('https://raw.githubusercontent.com/odhiambk/CIS-678-Machine-Learning/main/training_set_rna.csv')

# Both files are laid out features-by-samples. Index each one by its feature
# column and transpose so that every row is one sample and every column is
# one feature.
adt_data_transposed = adt_data.set_index('feature_name').T
# In the RNA file 'Unnamed: 0' holds the RNA feature identifiers (not sample
# IDs): after the transpose they are the 639 columns.
rna_data_transposed = rna_data.set_index('Unnamed: 0').T

# The modeling cells pass these two tables to train_test_split together, which
# pairs rows purely by position. Make sure both tables list the samples in the
# same order; if they do not, reorder the RNA rows to follow the ADT rows.
# .loc raises KeyError if an ADT sample is absent from the RNA table.
if not rna_data_transposed.index.equals(adt_data_transposed.index):
    rna_data_transposed = rna_data_transposed.loc[adt_data_transposed.index]
assert rna_data_transposed.index.equals(adt_data_transposed.index), \
    "RNA and ADT sample IDs are not aligned row-for-row"

# Check the shapes of the datasets
print(f"Transposed ADT Data Shape: {adt_data_transposed.shape}")  # Should be (4000, 25)
print(f"Transposed RNA Data Shape: {rna_data_transposed.shape}")  # Should be (4000, 639)

Transposed ADT Data Shape: (4000, 25)
Transposed RNA Data Shape: (4000, 639)


**Exploratory Data Analysis (EDA)**

In [2]:
# Preview the first few rows of each transposed table. Jupyter only renders
# the LAST bare expression of a cell, and here the cell ends with print()
# calls, so bare `.head()` expressions would be silently discarded; display()
# renders them explicitly.
display(adt_data_transposed.head())
display(rna_data_transposed.head())

# Check for missing values: isnull().sum() counts per column, the second
# .sum() totals those counts over the whole table.
print(f"Missing values in ADT Data: {adt_data_transposed.isnull().sum().sum()}")
print(f"Missing values in RNA Data: {rna_data_transposed.isnull().sum().sum()}")

Missing values in ADT Data: 0
Missing values in RNA Data: 0


**Modeling Approaches**

Linear Regression

In [3]:
# Held-out Pearson correlation of the linear model, keyed by ADT feature name
pearson_results_linear = {}

# The RNA feature matrix and the split parameters are the same for every
# target, so split once instead of once per ADT feature. Splitting the RNA
# table together with the whole ADT table applies one row partition to both;
# each per-feature target is then just a column of the ADT halves.
X = rna_data_transposed  # RNA features
X_train, X_test, adt_train, adt_test = train_test_split(
    X, adt_data_transposed, test_size=0.2, random_state=42
)

# Fit and score one independent linear regression per ADT feature
for adt_feature in adt_data_transposed.columns:
    # Full target column; retained so the notebook-level name `y` is still
    # defined after this cell, as it was before.
    y = adt_data_transposed[adt_feature]
    y_train = adt_train[adt_feature]
    y_test = adt_test[adt_feature]

    # Fit a linear regression model on the training rows
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred_linear = linear_model.predict(X_test)

    # pearsonr returns (correlation, p-value); only the correlation is kept
    pearson_corr_linear, _ = pearsonr(y_test, y_pred_linear)
    pearson_results_linear[adt_feature] = pearson_corr_linear

# One row per ADT feature, in the order the features were processed
pearson_df_linear = pd.DataFrame(
    list(pearson_results_linear.items()),
    columns=['ADT Feature', 'Pearson Correlation (Linear)'],
)
display(pearson_df_linear)

Unnamed: 0,ADT Feature,Pearson Correlation (Linear)
0,CD11a,0.887073
1,CD11c,0.93117
2,CD123,0.720521
3,CD127-IL7Ra,0.806528
4,CD14,0.940187
5,CD16,0.797635
6,CD161,0.740121
7,CD19,0.896246
8,CD197-CCR7,0.356538
9,CD25,0.355248


Ridge Regression

In [4]:
# Held-out Pearson correlation of the ridge model, keyed by ADT feature name
pearson_results_ridge = {}

# The RNA feature matrix and the split parameters are the same for every
# target, so split once instead of once per ADT feature. Splitting the RNA
# table together with the whole ADT table applies one row partition to both;
# each per-feature target is then just a column of the ADT halves.
X = rna_data_transposed  # RNA features
X_train, X_test, adt_train, adt_test = train_test_split(
    X, adt_data_transposed, test_size=0.2, random_state=42
)

# Fit and score one independent ridge regression per ADT feature
for adt_feature in adt_data_transposed.columns:
    # Full target column; retained so the notebook-level name `y` is still
    # defined after this cell, as it was before.
    y = adt_data_transposed[adt_feature]
    y_train = adt_train[adt_feature]
    y_test = adt_test[adt_feature]

    # Fit a ridge regression model on the training rows
    ridge_model = Ridge(alpha=1.0)  # Adjust alpha for regularization
    ridge_model.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred_ridge = ridge_model.predict(X_test)

    # pearsonr returns (correlation, p-value); only the correlation is kept
    pearson_corr_ridge, _ = pearsonr(y_test, y_pred_ridge)
    pearson_results_ridge[adt_feature] = pearson_corr_ridge

# One row per ADT feature, in the order the features were processed
pearson_df_ridge = pd.DataFrame(
    list(pearson_results_ridge.items()),
    columns=['ADT Feature', 'Pearson Correlation (Ridge)'],
)
display(pearson_df_ridge)

Unnamed: 0,ADT Feature,Pearson Correlation (Ridge)
0,CD11a,0.887122
1,CD11c,0.931197
2,CD123,0.72062
3,CD127-IL7Ra,0.806629
4,CD14,0.940224
5,CD16,0.797748
6,CD161,0.740204
7,CD19,0.896288
8,CD197-CCR7,0.356717
9,CD25,0.35539


**Evaluation Using Pearson Correlation Coefficient**

In [5]:
# Put the linear and ridge scores side by side: inner-join the two result
# tables on the ADT feature name so each feature gets a single row.
combined_pearson_df = pearson_df_linear.merge(
    pearson_df_ridge,
    how='inner',
    on='ADT Feature',
)
display(combined_pearson_df)

Unnamed: 0,ADT Feature,Pearson Correlation (Linear),Pearson Correlation (Ridge)
0,CD11a,0.887073,0.887122
1,CD11c,0.93117,0.931197
2,CD123,0.720521,0.72062
3,CD127-IL7Ra,0.806528,0.806629
4,CD14,0.940187,0.940224
5,CD16,0.797635,0.797748
6,CD161,0.740121,0.740204
7,CD19,0.896246,0.896288
8,CD197-CCR7,0.356538,0.356717
9,CD25,0.355248,0.35539


**Test Data Predictions**

Transposed ADT Data Columns (Proteins): Index(['CD11a', 'CD11c', 'CD123', 'CD127-IL7Ra', 'CD14', 'CD16', 'CD161',
       'CD19', 'CD197-CCR7', 'CD25', 'CD27', 'CD278-ICOS', 'CD28', 'CD3',
       'CD34', 'CD38', 'CD4', 'CD45RA', 'CD45RO', 'CD56', 'CD57', 'CD69',
       'CD79b', 'CD8a', 'HLA.DR'],
      dtype='object')
Number of common samples between RNA and ADT data: 4000
Predictions for all ADT features have been successfully saved to 'final_predictions.csv'.
