In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict


In [None]:
# Load the train and test data from the CSV file
train_data = pd.read_csv('/content/drive/MyDrive/Protein_Engineering_Tournament/dataset-02/train.csv')

test_data = pd.read_csv('/content/drive/MyDrive/Protein_Engineering_Tournament/dataset-02/test.csv')

test_data.head(4)

Unnamed: 0,FIOP(fold improvement over positive control),mutated_sequence,DNA
0,,MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...,ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...
1,,MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...,ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...
2,,MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...,ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...
3,,MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...,ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...


In [None]:
# Extract the features (mutated protein sequence and DNA sequence) and target variable (FIOP)
X = train_data[['mutated_sequence', 'DNA']]
X_test = test_data[['mutated_sequence', 'DNA']]
y = train_data['FIOP(fold improvement over positive control)']

X.head(4), y.head(4)

(                                    mutated_sequence  \
 0  MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...   
 1  MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVPINYYSNKQDPN...   
 2  MYTDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...   
 3  MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPN...   
 
                                                  DNA  
 0  ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...  
 1  ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...  
 2  ATGTATACGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...  
 3  ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTG...  ,
 0    12.148852
 1     0.001672
 2     5.422011
 3     0.000000
 Name: FIOP(fold improvement over positive control), dtype: float64)

In [None]:
type(y), type(X_test)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [None]:
data = {'train': X, 'test': X_test, 'train_target': y}

In [None]:
def Handle_missing_values(data):

  imputed_data = defaultdict(lambda: 0)

  for val in ['train', 'test', 'train_target']:
    # Handle missing values in the features on train data

    if val == 'train_target':
      y_imputer = SimpleImputer(strategy='mean')
      y_imputed = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()
      imputed_data[val] = y_imputed

    elif val == 'train':
      imputer = SimpleImputer(strategy='most_frequent').fit(data[val])
      X_imputed = imputer.transform(data[val])    ######imputation may be done only with data in the train; for test imputation, fit with train and transform with test - Ashok
      imputed_data[val] = X_imputed
    else:
      X_imputed = imputer.transform(data[val])    ######imputation may be done only with data in the train; for test imputation, fit with train and transform with test - Ashok
      imputed_data[val] = X_imputed


  return imputed_data


In [None]:
imputed_data = Handle_missing_values(data)

In [None]:
imputed_data['train']

array([['MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPNLVKEAVIKAGGEAVVVQGDVTKEEDVKNIVQTAIKEFGTLDIMINNAGVENPVPSHEMPLKDWDKVIATNLTGAFLGSREAIKYFVENDIKGNVINMSSVIEVIPWPLRVHYAASKGGMKNMTKTLAWEYAPKGIRVNNIGPGAINTTNNAERWADPKQKADVESMIPMGYIGEGEEIAAVAAWLASKEASYVTGITLFADGGMTLSPSIQAGRG',
        'ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTGCTACTGGGCTCGGAAAAGCGATGGCCATTCGCTTCGGCAAGGAGCAGGCAAAAGTGGTTATCAACTATTATAGTAATAAACAAGATCCGAACCTGGTAAAAGAAGCGGTCATCAAGGCGGGCGGTGAAGCTGTTGTCGTCCAAGGAGATGTCACGAAAGAGGAAGATGTAAAAAATATCGTGCAAACGGCAATTAAAGAGTTCGGCACACTCGATATTATGATTAATAATGCCGGTGTCGAAAATCCTGTGCCATCTCACGAAATGCCGCTCAAGGATTGGGATAAAGTCATCGCTACGAACTTAACGGGTGCCTTTTTAGGAAGCCGTGAAGCGATTAAATATTTCGTAGAAAACGATATCAAGGGAAATGTCATTAACATGTCCAGTGTGATCGAAGTGATTCCTTGGCCGTTACGCGTCCACTATGCCGCAAGTAAAGGCGGGATGAAGAATATGACAAAGACATTAGCGTGGGAATACGCGCCGAAGGGCATTCGCGTCAATAATATTGGGCCAGGTGCGATCAACACGACGAATAATGCTGAGCGCTGGGCTGACCCTAAACAGAAAGCTGATGTAGAAAGCATGATTCCAATGGGATATATCGGCGAAGGCGAGGAGATCGCCGCAGTAGCAGCCTGGCTTGCTTCGAAGGAAGCCAGCTACGTCA

In [None]:
def encode_data(data):

  encoded_data = defaultdict(lambda: 0)
  for val in ['train', 'test']:
  # Perform label encoding on the categorical features (mutated protein sequence and DNA sequence)
    label_encoder = LabelEncoder()
    print(data[val])
    X_encoded = data[val].copy()
    for i in range(X_encoded.shape[1]):
        if isinstance(X_encoded[:, i][0], str):
            # Remove any downstream amino acids after "*"
            X_encoded[:, i] = [seq.split('*')[0] for seq in X_encoded[:, i]]
            X_encoded[:, i] = label_encoder.fit_transform(X_encoded[:, i])
    encoded_data[val] = X_encoded

  return encoded_data

In [None]:
encoded_data = encode_data(imputed_data)

[['MYPDLKGKVVAITGAATGLGKAMAIRFGKEQAKVVINYYSNKQDPNLVKEAVIKAGGEAVVVQGDVTKEEDVKNIVQTAIKEFGTLDIMINNAGVENPVPSHEMPLKDWDKVIATNLTGAFLGSREAIKYFVENDIKGNVINMSSVIEVIPWPLRVHYAASKGGMKNMTKTLAWEYAPKGIRVNNIGPGAINTTNNAERWADPKQKADVESMIPMGYIGEGEEIAAVAAWLASKEASYVTGITLFADGGMTLSPSIQAGRG'
  'ATGTATCCGGATTTAAAAGGAAAAGTCGTCGCTATTACAGGAGCTGCTACTGGGCTCGGAAAAGCGATGGCCATTCGCTTCGGCAAGGAGCAGGCAAAAGTGGTTATCAACTATTATAGTAATAAACAAGATCCGAACCTGGTAAAAGAAGCGGTCATCAAGGCGGGCGGTGAAGCTGTTGTCGTCCAAGGAGATGTCACGAAAGAGGAAGATGTAAAAAATATCGTGCAAACGGCAATTAAAGAGTTCGGCACACTCGATATTATGATTAATAATGCCGGTGTCGAAAATCCTGTGCCATCTCACGAAATGCCGCTCAAGGATTGGGATAAAGTCATCGCTACGAACTTAACGGGTGCCTTTTTAGGAAGCCGTGAAGCGATTAAATATTTCGTAGAAAACGATATCAAGGGAAATGTCATTAACATGTCCAGTGTGATCGAAGTGATTCCTTGGCCGTTACGCGTCCACTATGCCGCAAGTAAAGGCGGGATGAAGAATATGACAAAGACATTAGCGTGGGAATACGCGCCGAAGGGCATTCGCGTCAATAATATTGGGCCAGGTGCGATCAACACGACGAATAATGCTGAGCGCTGGGCTGACCCTAAACAGAAAGCTGATGTAGAAAGCATGATTCCAATGGGATATATCGGCGAAGGCGAGGAGATCGCCGCAGTAGCAGCCTGGCTTGCTTCGAAGGAAGCCAGCTACGTCACAGGCATCACGTT

In [None]:
encoded_data['train']

array([[919, 1111],
       [301, 908],
       [2314, 31],
       ...,
       [1321, 1296],
       [811, 1113],
       [267, 104]], dtype=object)

In [None]:
X_train_encoded, X_test_encoded = encoded_data['train'], encoded_data['test']

In [None]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_encoded, imputed_data['train_target'], test_size=0.1, random_state=42) ###test split is too small, at least 0.1 necessary to estimate performance -- Ashok

In [None]:
# Initialize the base models
base_models = [
    ('svr', SVR()),
    ('random_forest', RandomForestRegressor(random_state=42))
]

# Define the parameter grid for SVR
svr_param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.0001, 0.001, 0.002],
    'gamma': ['scale', 'auto']

}

# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2',0.5,'None']
}


In [None]:
# Initialize the VotingRegressor with the base models
ensemble_model = VotingRegressor(estimators=base_models)

# Perform hyperparameter tuning
params = {'svr__' + k: v for k, v in svr_param_grid.items()}
params.update({'random_forest__' + k: v for k, v in rf_param_grid.items()})

random_search = RandomizedSearchCV(ensemble_model, param_distributions=params, n_iter=75, scoring='neg_mean_squared_error', cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the validation set
val_predictions = best_model.predict(X_val)
mse = mean_squared_error(y_val, val_predictions)
r2 = r2_score(y_val, val_predictions)

print("Best Hyperparameters:", best_params)
print("Validation Set:")
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
65 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fi

Best Hyperparameters: {'svr__gamma': 'auto', 'svr__epsilon': 0.001, 'svr__C': 10, 'random_forest__n_estimators': 100, 'random_forest__min_samples_split': 2, 'random_forest__min_samples_leaf': 2, 'random_forest__max_features': 'auto', 'random_forest__max_depth': None}
Validation Set:
Mean Squared Error: 3.5477639401683407
R-squared Score: 0.8720532230590594


In [None]:
type(X_test_encoded)

numpy.ndarray

In [None]:
# Make predictions on the unseen data using the best_model
y_pred = best_model.predict(X_test_encoded)

In [None]:
y_pred

array([1.13632951, 1.15556879, 1.15317321, 1.13515929, 1.96468014,
       1.91619861, 1.13293721, 1.09055831, 1.10324695, 1.36449488,
       1.09055831, 2.78602019, 1.08793487, 1.35392794, 1.13210263,
       1.13061763, 1.11770761, 2.522786  , 1.85510243, 1.08793487,
       2.26456801, 1.71110563, 1.34450445, 1.77655187, 1.16064351,
       2.550576  , 3.16134173, 1.15879071, 1.67679949, 2.60055568,
       2.86837219, 1.34317478, 1.37466717, 1.1739267 , 1.16378888,
       2.38627998, 1.17599494, 1.64939384, 2.09253832, 2.93157946,
       3.02250739, 1.34611109, 2.11242733, 1.13061763, 1.38083706,
       1.51730339, 1.48176194, 1.12113447, 1.31053211, 1.43277201,
       1.35360445, 1.15488279, 1.14517679, 1.16671933, 2.50005523,
       1.09059131, 2.62383425, 2.88746558, 1.89076909, 1.78010284,
       2.06271442, 1.52173172, 1.16058788, 1.1483841 , 2.56357363,
       1.80571945, 1.1139452 , 3.2968816 , 2.89772204, 1.10743021,
       1.13299363, 1.38260196, 1.17348144, 1.11673836, 1.93614

In [None]:
# Create a DataFrame with the predicted labels
df_predictions = pd.DataFrame({'Predicted Labels': y_pred})

# Save the DataFrame as a CSV file
df_predictions.to_csv('/content/drive/MyDrive/Protein_Engineering_Tournament/dataset-02/predictions.csv', index=False)