In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,Species,Organism,UV,"Exposure intensity (in J, KJ, W, KW)",Exposure time (in seconds),Organelle,Metabolites names,Proteins names,Genes names,Studied tissue,Exposure severity,Biological responses
0,Arabidopsis thaliana,Human,UV-A,2.92 W,232,cytoplasm,Nicotinamide Adenine Dinucleotide,Cytochrome C,TP53,cell,hermetic,signaling
1,Arabidopsis thaliana,Microorganism,UV-B,78.31 KW,2166,golgi,Glucose,Myosin,BRAF,seed,hermetic,epigenetic
2,Escherichia coli,Plant,UV-A,59.17 J,236,nucleus,Glucose,Albumin,MTOR,seed,harmful,DNA repair
3,Homo sapiens,Animal,UV-A,18.04 W,111,mitochondrion,Glucose,Myosin,EGFR,skin,hermetic,signaling
4,Mus musculus,Plant,UV-B,17.33 J,2275,nucleus,Citric Acid,Collagen,BRCA1,whole body,harmful,DNA repair


In [5]:

# Separate features and target columns
X = data.drop(columns=['Exposure severity', 'Biological responses'])
y_severity = data['Exposure severity']
y_response = data['Biological responses']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

from sklearn.preprocessing import OneHotEncoder

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create pipelines for the models
model_severity = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model_response = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_severity_train, y_severity_test, y_response_train, y_response_test = train_test_split(
    X, y_severity, y_response, test_size=0.2, random_state=42)

# Train the models
model_severity.fit(X_train, y_severity_train)
model_response.fit(X_train, y_response_train)

# Make predictions
y_severity_pred = model_severity.predict(X_test)
y_response_pred = model_response.predict(X_test)

# Evaluate the models
severity_report = classification_report(y_severity_test, y_severity_pred)
response_report = classification_report(y_response_test, y_response_pred)

severity_report, response_report



('              precision    recall  f1-score   support\n\n     harmful       0.44      0.69      0.54        49\n    hermetic       0.59      0.34      0.43        65\n\n    accuracy                           0.49       114\n   macro avg       0.52      0.52      0.49       114\nweighted avg       0.53      0.49      0.48       114\n',
 '              precision    recall  f1-score   support\n\n  DNA damage       0.25      0.30      0.27        23\n  DNA repair       0.30      0.17      0.22        35\n  epigenetic       0.08      0.12      0.10        16\n    pathways       0.25      0.25      0.25        20\n   signaling       0.10      0.10      0.10        20\n\n    accuracy                           0.19       114\n   macro avg       0.20      0.19      0.19       114\nweighted avg       0.21      0.19      0.20       114\n')

In [6]:
import pickle

# Save the severity prediction model
with open('severity.pkl', 'wb') as file:
    pickle.dump(model_severity, file)

# Save the biological response prediction model
with open('response.pkl', 'wb') as file:
    pickle.dump(model_response, file)

# Check if the files were saved correctly
import os
severity_model_exists = os.path.exists('severity.pkl')
response_model_exists = os.path.exists('response.pkl')

severity_model_exists, response_model_exists


(True, True)