In [1]:
import os
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import mstats
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the survey export; the path is relative to the notebook's working directory.
csv_path = "digital-eye.csv"
df = pd.read_csv(csv_path)
# Preview the first five rows.
# NOTE(review): this preview still contains the 'Name' column, so respondent
# names end up in the saved cell output — consider clearing it before sharing.
df.head(5)

Unnamed: 0,Name,Age,Sex,wearables,Duration,onlineplatforms,Nature,screenillumination,workingyears,hoursspentdailycurricular,...,Severityofcomplaints,RVIS,Ocularsymptomsobservedlately,Symptomsobservingatleasthalfofthetimes,Complaintsfrequency,frequencyofdryeyes,Schimers1Lefteye,Schimers1righteye,Schimers2Lefteye,Schimers2righteye
0,Syeda Afiya,22.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,...,1,1.0,13569.0,1345.0,3.0,2.0,12.0,13.0,16.0,14.0
1,Hemanth,24.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,...,0,0.0,6.0,3.0,1.0,1.0,25.0,28.0,30.0,32.0
2,Kranthi,23.0,1.0,5.0,,2.0,2.0,3.0,3.0,4.0,...,2,1.0,135679.0,12345.0,2.0,1.0,8.0,6.0,9.0,8.0
3,Sk.M.A.Khadeer,23.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,1,1.0,135.0,23.0,3.0,1.0,13.0,14.0,15.0,14.0
4,B.Shravan kumar,27.0,1.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,...,0,0.0,7.0,23.0,2.0,3.0,34.0,32.0,35.0,35.0


In [3]:
# Remove the respondent-name column before any numeric processing.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError because
# 'Name' is already gone.
df = df.drop(columns=['Name'], errors='ignore')
df.head()

Unnamed: 0,Age,Sex,wearables,Duration,onlineplatforms,Nature,screenillumination,workingyears,hoursspentdailycurricular,hoursspentdailynoncurricular,...,Severityofcomplaints,RVIS,Ocularsymptomsobservedlately,Symptomsobservingatleasthalfofthetimes,Complaintsfrequency,frequencyofdryeyes,Schimers1Lefteye,Schimers1righteye,Schimers2Lefteye,Schimers2righteye
0,22.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,...,1,1.0,13569.0,1345.0,3.0,2.0,12.0,13.0,16.0,14.0
1,24.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,...,0,0.0,6.0,3.0,1.0,1.0,25.0,28.0,30.0,32.0
2,23.0,1.0,5.0,,2.0,2.0,3.0,3.0,4.0,4.0,...,2,1.0,135679.0,12345.0,2.0,1.0,8.0,6.0,9.0,8.0
3,23.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,...,1,1.0,135.0,23.0,3.0,1.0,13.0,14.0,15.0,14.0
4,27.0,1.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0,0.0,7.0,23.0,2.0,3.0,34.0,32.0,35.0,35.0


In [4]:
# Number of missing values in each column, before any imputation.
df.isna().sum()

Age                                         0
Sex                                         0
wearables                                   0
Duration                                  110
onlineplatforms                             5
Nature                                     19
screenillumination                          2
workingyears                                6
hoursspentdailycurricular                   5
hoursspentdailynoncurricular                2
Gadgetsused                                 0
levelofgadjetwithrespecttoeyes              0
Distancekeptbetweeneyesandgadjet            1
Avgnighttimeusageperday                     0
Blinkingduringscreenusage                   0
Difficultyinfocusingafterusingscreens       0
freqquencyofcomplaints                      0
Severityofcomplaints                        0
RVIS                                        0
Ocularsymptomsobservedlately                2
Symptomsobservingatleasthalfofthetimes      7
Complaintsfrequency               

In [5]:
# Median-impute every column of df into a separate frame, data_imputed.
# The row index of df is carried over so the two frames stay aligned.
imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Fill the float columns of df itself with the same median-imputed values, so
# cells that keep reading df see no NaNs there. Reusing data_imputed avoids
# refitting the shared imputer on a column subset.
numerical_features = df.select_dtypes(include=['float64']).columns
df[numerical_features] = data_imputed[numerical_features]

In [6]:
# Re-check the per-column missing-value counts now that imputation has run.
df.isna().sum()

Age                                       0
Sex                                       0
wearables                                 0
Duration                                  0
onlineplatforms                           0
Nature                                    0
screenillumination                        0
workingyears                              0
hoursspentdailycurricular                 0
hoursspentdailynoncurricular              0
Gadgetsused                               0
levelofgadjetwithrespecttoeyes            0
Distancekeptbetweeneyesandgadjet          0
Avgnighttimeusageperday                   0
Blinkingduringscreenusage                 0
Difficultyinfocusingafterusingscreens     0
freqquencyofcomplaints                    0
Severityofcomplaints                      0
RVIS                                      0
Ocularsymptomsobservedlately              0
Symptomsobservingatleasthalfofthetimes    0
Complaintsfrequency                       0
frequencyofdryeyes              

In [7]:
def _winsorize_column(column):
    """Winsorize one column, limiting the lowest 1% and highest 1% of its values."""
    return mstats.winsorize(column, limits=[0.01, 0.01])


# Applied column by column to the whole frame.
# NOTE(review): this runs before the feature/target split, so whichever columns
# are later used as targets are clipped as well — confirm that is intended.
data_imputed = data_imputed.apply(_winsorize_column)

In [8]:
# Separate predictors from targets: the last four columns of data_imputed are
# the targets, everything before them is a feature.
target_columns = data_imputed.columns[-4:]
y = data_imputed[target_columns]               # Target variables
X = data_imputed.drop(columns=target_columns)  # Features

In [9]:
# Columns whose distinct values are expanded into indicator columns.
columns_to_encode = ['Ocularsymptomsobservedlately', 'Symptomsobservingatleasthalfofthetimes']

# Footprint of the frame as it currently stands.
original_memory_usage = df.memory_usage().sum()
original_num_columns = df.shape[1]

# Footprint after one-hot encoding; the encoded copy is kept separate, df is not modified here.
encoded_df = pd.get_dummies(df, columns=columns_to_encode)
encoded_memory_usage = encoded_df.memory_usage().sum()
encoded_num_columns = encoded_df.shape[1]

# Print the estimated memory and column-count impact of the encoding.
size_report = [
    ("Original DataFrame memory usage:", original_memory_usage),
    ("Original DataFrame number of columns:", original_num_columns),
    ("Memory usage after one-hot encoding:", encoded_memory_usage),
    ("Number of columns after one-hot encoding:", encoded_num_columns),
]
for report_label, report_value in size_report:
    print(report_label, report_value)

Original DataFrame memory usage: 64932
Original DataFrame number of columns: 27
Memory usage after one-hot encoding: 102432
Number of columns after one-hot encoding: 166


In [10]:
# One-hot encode the two symptom columns of df.
# Only columns that are still present are encoded, so re-running this cell
# after df has already been encoded is a no-op instead of a KeyError.
# NOTE(review): the modelling cells read data_imputed, not df — confirm whether
# this encoded frame is meant to feed the models.
_columns_to_encode = [
    column
    for column in ('Ocularsymptomsobservedlately', 'Symptomsobservingatleasthalfofthetimes')
    if column in df.columns
]
if _columns_to_encode:
    df = pd.get_dummies(df, columns=_columns_to_encode)

In [11]:
# Preview the first five rows of df after one-hot encoding.
df.head(5)

Unnamed: 0,Age,Sex,wearables,Duration,onlineplatforms,Nature,screenillumination,workingyears,hoursspentdailycurricular,hoursspentdailynoncurricular,...,Symptomsobservingatleasthalfofthetimes_234.0,Symptomsobservingatleasthalfofthetimes_235.0,Symptomsobservingatleasthalfofthetimes_245.0,Symptomsobservingatleasthalfofthetimes_345.0,Symptomsobservingatleasthalfofthetimes_354.0,Symptomsobservingatleasthalfofthetimes_1234.0,Symptomsobservingatleasthalfofthetimes_1235.0,Symptomsobservingatleasthalfofthetimes_1245.0,Symptomsobservingatleasthalfofthetimes_1345.0,Symptomsobservingatleasthalfofthetimes_12345.0
0,22.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,...,False,False,False,False,False,False,False,False,True,False
1,24.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
2,23.0,1.0,5.0,1.0,2.0,2.0,3.0,3.0,4.0,4.0,...,False,False,False,False,False,False,False,False,False,True
3,23.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
4,27.0,1.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# List the column labels of df to inspect the indicator columns created by the encoding.
df.columns

Index(['Age', 'Sex', 'wearables', 'Duration', 'onlineplatforms', 'Nature',
       'screenillumination', 'workingyears', 'hoursspentdailycurricular',
       'hoursspentdailynoncurricular',
       ...
       'Symptomsobservingatleasthalfofthetimes_234.0',
       'Symptomsobservingatleasthalfofthetimes_235.0',
       'Symptomsobservingatleasthalfofthetimes_245.0',
       'Symptomsobservingatleasthalfofthetimes_345.0',
       'Symptomsobservingatleasthalfofthetimes_354.0',
       'Symptomsobservingatleasthalfofthetimes_1234.0',
       'Symptomsobservingatleasthalfofthetimes_1235.0',
       'Symptomsobservingatleasthalfofthetimes_1245.0',
       'Symptomsobservingatleasthalfofthetimes_1345.0',
       'Symptomsobservingatleasthalfofthetimes_12345.0'],
      dtype='object', length=166)

In [13]:
# Features are every column of data_imputed except the last four; the last
# four columns are the targets.
X, y = data_imputed.iloc[:, :-4], data_imputed.iloc[:, -4:]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the training rows only, then
# apply that one fitted scaler to both splits so the test rows never
# influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Preprocessing pipeline (standardise -> PCA) followed by a random-forest fit.
#
# The pipeline contains its own StandardScaler, so it is fitted on the raw
# (unscaled) training features. Fitting it on X_train_scaled would standardise
# the data a second time and leave a pipeline whose scaler has learned
# "mean 0 / std 1", i.e. one that cannot be applied directly to raw rows.
n_components = 10  # number of principal components kept
pca = PCA(n_components=n_components)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', pca)
])

# Fit on the training split only; the test split is transformed with the
# statistics and components learned from training data.
X_train_pca = pipeline.fit_transform(X_train)
X_test_pca = pipeline.transform(X_test)

# Train the multi-output regressor on the PCA-projected features.
model_pca = RandomForestRegressor(n_estimators=100, random_state=42)
model_pca.fit(X_train_pca, y_train)

In [16]:
# Score the PCA + random-forest model on the held-out rows.
y_pred_pca = model_pca.predict(X_test_pca)

mse_pca = mean_squared_error(y_test, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)
r2_pca = r2_score(y_test, y_pred_pca)

forest_report = (
    ("Random Forest Regressor-Mean Squared Error:", mse_pca),
    ("Random Forest Regressor-R-squared Score:", r2_pca),
    ("Random Forest Regressor-Root Mean Squared Error:", rmse_pca),
)
for metric_label, metric_value in forest_report:
    print(metric_label, metric_value)

Random Forest Regressor-Mean Squared Error: 38.759545833333334
Random Forest Regressor-R-squared Score: 0.27722812792770707
Random Forest Regressor-Root Mean Squared Error: 6.2257164915641106


In [17]:
# Baseline: linear regression fitted directly on the unscaled training
# features (no PCA), predicting all target columns at once.
linear_regressor = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_regressor.predict(X_test)

# Held-out error metrics; RMSE is derived from the MSE.
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print("Linear Regression - Mean Squared Error:", mse_linear)
print("Linear Regression - R-squared Score:", r2_linear)
print("Linear Regression - Root Mean Squared Error:", rmse_linear)

Linear Regression - Mean Squared Error: 40.41237641050852
Linear Regression - R-squared Score: 0.2479248014474929
Linear Regression - Root Mean Squared Error: 6.357072943620241


In [18]:
# Baseline: a single decision tree fitted on the unscaled training features.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split; without a seed, ties between equally good splits can
# be broken differently on each run and the metrics below would not be
# reproducible. 42 matches the seed used for the split and the random forest.
decision_tree_regressor = DecisionTreeRegressor(random_state=42)
decision_tree_regressor.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_decision_tree = decision_tree_regressor.predict(X_test)

# Held-out error metrics; RMSE is derived from the MSE.
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)
rmse_decision_tree = np.sqrt(mse_decision_tree)
print("Decision Tree Regression - Mean Squared Error:", mse_decision_tree)
print("Decision Tree Regression - R-squared Score:", r2_decision_tree)
print("Decision Tree Regression - Root Mean Squared Error:", rmse_decision_tree)

Decision Tree Regression - Mean Squared Error: 50.82083333333334
Decision Tree Regression - R-squared Score: 0.05981721673825041
Decision Tree Regression - Root Mean Squared Error: 7.128873216247666


In [19]:
# One hand-entered respondent, expressed as a single record whose keys are the
# feature column names. Wrapping the record in a list yields a one-row frame.
new_respondent = {
    'Age': 24,
    'Sex': 1,
    'wearables': 1,
    'Duration': 2,
    'onlineplatforms': 2,
    'Nature': 2,
    'screenillumination': 1,
    'workingyears': 2,
    'hoursspentdailycurricular': 1,
    'hoursspentdailynoncurricular': 1,
    'Gadgetsused': 1,
    'levelofgadjetwithrespecttoeyes': 2,
    'Distancekeptbetweeneyesandgadjet': 2,
    'Avgnighttimeusageperday': 1,
    'Blinkingduringscreenusage': 2,
    'Difficultyinfocusingafterusingscreens': 1,
    'freqquencyofcomplaints': 0,
    'Severityofcomplaints': 0,
    'RVIS': 0,
    'Ocularsymptomsobservedlately': 6,
    'Symptomsobservingatleasthalfofthetimes': 3,
    'Complaintsfrequency': 1,
    'frequencyofdryeyes': 1,
}
X_new = pd.DataFrame([new_respondent])

In [20]:
# (Re)fit the preprocessing pipeline on the raw training features so that its
# scaler and PCA expect unscaled input, which is what X_new contains.
# NOTE(review): model_pca was trained on the output of an earlier fit of this
# pipeline; this refit has to reproduce the same projection for the
# predictions below to be meaningful — confirm.
pipeline.fit(X_train)

# Put the new row's columns into the training column order before
# transforming: the transformers work positionally, and recent scikit-learn
# versions reject frames whose feature names do not match those seen in fit.
X_new_processed = pipeline.transform(X_new[X_train.columns])
# Predict the target columns for the new row.
y_new_pred = model_pca.predict(X_new_processed)


In [21]:
# Show the four predicted values for the first (only) row of y_new_pred,
# in the same order as the target columns.
target_labels = (
    "Schirmer1 left eye:",
    "Schirmer1 right eye:",
    "Schirmer2 left eye:",
    "Schirmer2 right eye:",
)
print("Predicted Schirmer's test results for the left and right eyes:")
new_row_prediction = y_new_pred[0]
for position, target_label in enumerate(target_labels):
    print(target_label, new_row_prediction[position])

Predicted Schirmer's test results for the left and right eyes:
Schirmer1 left eye: 26.17
Schirmer1 right eye: 28.11
Schirmer2 left eye: 30.29
Schirmer2 right eye: 31.4


In [22]:
# Second hand-entered respondent: one keyword per feature column, wrapped in a
# list so the result is a one-row frame.
X_new = pd.DataFrame([
    dict(
        Age=22,
        Sex=2,
        wearables=1,
        Duration=2,
        onlineplatforms=1,
        Nature=2,
        screenillumination=2,
        workingyears=1,
        hoursspentdailycurricular=1,
        hoursspentdailynoncurricular=2,
        Gadgetsused=1,
        levelofgadjetwithrespecttoeyes=4,
        Distancekeptbetweeneyesandgadjet=2,
        Avgnighttimeusageperday=2,
        Blinkingduringscreenusage=1,
        Difficultyinfocusingafterusingscreens=1,
        freqquencyofcomplaints=1,
        Severityofcomplaints=1,
        RVIS=1,
        Ocularsymptomsobservedlately=13569,
        Symptomsobservingatleasthalfofthetimes=1345,
        Complaintsfrequency=3,
        frequencyofdryeyes=2,
    )
])

In [23]:
# Project the new row with the already-fitted pipeline. The columns are put
# into the training column order first: the transformers work positionally,
# and recent scikit-learn versions reject frames whose feature names do not
# match those seen in fit.
X_new_processed = pipeline.transform(X_new[X_train.columns])
# Predict the target columns for the new row.
y_new_pred = model_pca.predict(X_new_processed)

In [24]:
# Show the predicted values for the first (only) row of y_new_pred, one target per line.
print("Predicted Schirmer's test results for the left and right eyes:")
predicted_row = y_new_pred[0]
print("Schirmer1 left eye:", predicted_row[0])
print("Schirmer1 right eye:", predicted_row[1])
print("Schirmer2 left eye:", predicted_row[2])
print("Schirmer2 right eye:", predicted_row[3])

Predicted Schirmer's test results for the left and right eyes:
Schirmer1 left eye: 12.6
Schirmer1 right eye: 14.09
Schirmer2 left eye: 16.7
Schirmer2 right eye: 17.65


In [25]:
# Persist the trained random-forest model.
# The context manager closes the file handle even if pickling fails; the
# previous bare open(...) call never closed it.
# NOTE(review): the model expects PCA-projected input, so it is only usable
# together with the fitted preprocessing pipeline.
filename = 'schirmers-test-results-demo.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model_pca, model_file)

In [26]:
# Confirm the pickle really exists on disk. Testing the filename string itself
# is always true for a non-empty name and says nothing about the file.
if os.path.isfile(filename):
    print("Model is saved")
else:
    print("Model file not found:", filename)

Model is saved


In [27]:
import pickle

# Persist the fitted preprocessing objects, one pickle file per object.
# NOTE(review): the pipeline already contains its own scaler step; confirm the
# separately saved scaler is not applied on top of it at inference time.
artifacts = {
    'pipeline1.pkl': pipeline,
    'scaler1.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)