In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

In [27]:
def experiment(test_percent):
    # Read the dataset
    file_path = 'Data_Cortex_Nuclear.csv'
    df = pd.read_csv(file_path)

    # Shuffling
    df = df.sample(frac=1, random_state=42)

    # Drop the non used columns
    df = df.drop('MouseID', axis=1)
    df = df.drop('Treatment', axis=1)
    df = df.drop('Behavior', axis=1)
    df = df.drop('class', axis=1)

    # Split the dataset into features (X) and target variable (y)
    # X = df.drop('Genotype', axis=1)  # Features
    X = df[["APP_N", "ITSN1_N", "DYRK1A_N"]]
    y = df['Genotype']  # Target variable

    # Use SimpleImputer to fill NaN values with the mean
    imputer = SimpleImputer(strategy='mean')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, random_state=42)

    # Initialize the RandomForestClassifier
    classifier = RandomForestClassifier(random_state=42)

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, digits=6)
    
    # Feature importances
    feature_importances = classifier.feature_importances_
    
    # Sort features based on importances
    feature_importance_dict = dict(zip(X.columns, feature_importances))
    sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

    # Printing results
    print(f"Percentual for test: {test_percent}")
    print(f'Accuracy: {accuracy}')
    print('Classification Report:\n', classification_rep)
    print('Feature Importances (sorted):')
    for feature, importance in sorted_features:
        print(f'{feature}: {importance}')

In [28]:
experiment(0.1)
experiment(0.2)
experiment(0.3)
experiment(0.5)

Percentual for test: 0.1
Accuracy: 0.8611111111111112
Classification Report:
               precision    recall  f1-score   support

     Control   0.836066  0.910714  0.871795        56
      Ts65Dn   0.893617  0.807692  0.848485        52

    accuracy                       0.861111       108
   macro avg   0.864841  0.859203  0.860140       108
weighted avg   0.863776  0.861111  0.860572       108

Feature Importances (sorted):
APP_N: 0.41871628999782023
ITSN1_N: 0.30179784068028154
DYRK1A_N: 0.27948586932189834
Percentual for test: 0.2
Accuracy: 0.7824074074074074
Classification Report:
               precision    recall  f1-score   support

     Control   0.739837  0.858491  0.794760       106
      Ts65Dn   0.838710  0.709091  0.768473       110

    accuracy                       0.782407       216
   macro avg   0.789274  0.783791  0.781616       216
weighted avg   0.790189  0.782407  0.781373       216

Feature Importances (sorted):
APP_N: 0.4190762580396989
ITSN1_N: 0.2969262