In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import joblib
import warnings

In [2]:
df = pd.read_csv('DigitalEye.csv')

In [3]:
pd.options.display.max_columns=None 

In [4]:
#The sum of missing values for each column in the DataFrame
df.isnull().sum()

Name                                        0
Age                                         0
Sex                                         0
wearables                                   0
Duration                                  110
onlineplatforms                             5
Nature                                     19
screenillumination                          2
workingyears                                6
hoursspentdailycurricular                   5
hoursspentdailynoncurricular                2
Gadgetsused                                 0
levelofgadjetwithrespecttoeyes              0
Distancekeptbetweeneyesandgadjet            1
Avgnighttimeusageperday                     0
Blinkingduringscreenusage                   0
Difficultyinfocusingafterusingscreens       0
freqquencyofcomplaints                      0
Severityofcomplaints                        0
RVIS                                        0
Ocularsymptomsobservedlately                2
Symptomsobservingatleasthalfofthet

In [5]:
# Handle missing values
numerical_features = df.select_dtypes(include=['float64']).columns
imputer = SimpleImputer(strategy='median')
df[numerical_features] = imputer.fit_transform(df[numerical_features])

categorical_feature = ['onlineplatforms']
imputer = SimpleImputer(strategy='most_frequent')
df[categorical_feature] = imputer.fit_transform(df[categorical_feature])

In [6]:
import numpy as np
from scipy.stats.mstats import winsorize
# Apply winsorization to variables with outliers
variables_to_winsorize = ['Duration', 'onlineplatforms', 'Nature', 'workingyears', 
                          'hoursspentdailycurricular', 'Gadgetsused', 'frequencyofdryeyes', 
                          'Schimers2righteye']
for variable in variables_to_winsorize:
    df[variable] = winsorize(df[variable], limits=[0.05, 0.05])
# Apply log transformation to variables with skewed distributions
variables_to_log_transform = ['hoursspentdailycurricular', 'Nature']
for variable in variables_to_log_transform:
    df[variable] = np.log1p(df[variable])  

In [7]:
df=pd.get_dummies(df, columns=['Name'])

In [8]:
df['Interaction_term_1'] = df['freqquencyofcomplaints'] * df['frequencyofdryeyes']

In [9]:
bins = [0, 30, 60, float('inf')]
labels = ['Young', 'Adult', 'Senior']
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels)

In [10]:
# Normalizing or standardizing variables
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['screenillumination', 'Avgnighttimeusageperday']])
df[['screenillumination', 'c']] = scaled_features

In [11]:
df = pd.get_dummies(df, columns=['Sex', 'Age_Category'])
# Dimensionality Reduction
# Using PCA to reduce dimensionality
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df[['screenillumination', 'screenillumination']])
df['PCA_Component_1'] = principal_components[:, 0]
df['PCA_Component_2'] = principal_components[:, 1]

In [12]:
# Define target variables
target_variables = ['Schimers1Lefteye', 'Schimers1righteye', 'Schimers2Lefteye', 'Schimers2righteye']
# Define X and y
X = df.drop(columns=target_variables)
y = df[target_variables]

In [13]:
# Resampling and train-test split
X_resampled = {}
y_resampled = {}
X_train_dict = {}
X_test_dict = {}
y_train_dict = {}
y_test_dict = {}

In [14]:
undersampler = RandomUnderSampler(random_state=42)
oversampler = RandomOverSampler(random_state=42)
for target_var in target_variables:
    y_target = y[target_var]
    X_under, y_under = undersampler.fit_resample(X, y_target)
    X_over, y_over = oversampler.fit_resample(X, y_target)
    X_resampled[target_var + '_undersampled'] = X_under
    y_resampled[target_var + '_undersampled'] = y_under
    X_resampled[target_var + '_oversampled'] = X_over
    y_resampled[target_var + '_oversampled'] = y_over
    X_train, X_test, y_train, y_test = train_test_split(X_resampled[target_var + '_oversampled'], 
                                                        y_resampled[target_var + '_oversampled'], 
                                                        test_size=0.2, random_state=42)
    X_train_dict[target_var] = X_train
    X_test_dict[target_var] = X_test
    y_train_dict[target_var] = y_train
    y_test_dict[target_var] = y_test

In [15]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)

In [16]:
# Model training and evaluation for all classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "Decision Tree": DecisionTreeClassifier(),
    "Support Vector Classifier": SVC()
}

for target_var in target_variables:
    print(f"Evaluating models for {target_var}:")
    X_train_scaled = scaler.fit_transform(X_train_dict[target_var])
    X_test_scaled = scaler.transform(X_test_dict[target_var])
    
    for model_name, model in models.items():
        print(f"\n{model_name}:")
        evaluate_model(model, X_train_scaled, y_train_dict[target_var], X_test_scaled, y_test_dict[target_var])
        print("="*50)


Evaluating models for Schimers1Lefteye:

Logistic Regression:
Accuracy: 0.7083333333333334
Precision: 0.7610863095238094
Recall: 0.7083333333333334
F1-Score: 0.6789993185826518

Random Forest:
Accuracy: 0.7152777777777778
Precision: 0.7055169753086421
Recall: 0.7152777777777778
F1-Score: 0.6916595171803505

Gradient Boosting:
Accuracy: 0.7083333333333334
Precision: 0.8774154589371981
Recall: 0.7083333333333334
F1-Score: 0.7357673460798462

Decision Tree:
Accuracy: 0.7152777777777778
Precision: 0.7398671737213404
Recall: 0.7152777777777778
F1-Score: 0.7045774593746086

Support Vector Classifier:
Accuracy: 0.6875
Precision: 0.8172022807439475
Recall: 0.6875
F1-Score: 0.693651410318077
Evaluating models for Schimers1righteye:

Logistic Regression:
Accuracy: 0.875
Precision: 0.8886073747680889
Recall: 0.875
F1-Score: 0.8693900881811015

Random Forest:
Accuracy: 0.8839285714285714
Precision: 0.907374929138322
Recall: 0.8839285714285714
F1-Score: 0.8869151373283948

Gradient Boosting:
Accura

In [17]:
# Define the Gradient Boosting Classifier with the best hyperparameters
best_gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

In [18]:
# Fit the model on the training data
best_gb_classifier.fit(X_train_scaled, y_train)

In [19]:
# Save the model
joblib.dump(best_gb_classifier, 'best_gradient_boosting_model.pkl')
print("Best Gradient Boosting Classifier model saved successfully.")

Best Gradient Boosting Classifier model saved successfully.


In [20]:
# Load the saved Gradient Boosting model
saved_gb_classifier = joblib.load('best_gradient_boosting_model.pkl')