In [1]:
import seaborn as sns
import scipy
import skimage
sns.set_style("ticks")
from pathlib import Path

# Get the data 

In [2]:
# in this file is the url to the spreadsheet with the data
# PIGLEG_SPREADSHEET_URL=https://docs.google.com/spreadsheets/d/1G55kXxcJ0...piLaK0snddRU-jzU/edit?usp=sharing
env_paths = list(Path(".").glob("../**/.env.dev"))
print(env_paths)
# find line with PIGLEG_SPREADSHEET_URL and give me the value after the "="
PIGLEG_SPREADSHEET_URL = [line.split("=", 1)[1].strip() for line in open(env_paths[0], "r") if line.startswith("PIGLEG_SPREADSHEET_URL")][0]
PIGLEG_SPREADSHEET_URL = PIGLEG_SPREADSHEET_URL.replace("\"", "")
PIGLEG_SPREADSHEET_URL = PIGLEG_SPREADSHEET_URL.replace("edit?usp=sharing", "export?format=csv")
# PIGLEG_SPREADSHEET_URL

[PosixPath('../.env.dev')]


In [3]:
import pandas as pd
url = PIGLEG_SPREADSHEET_URL
df = pd.read_csv(url)

In [4]:
df.tail()

Unnamed: 0,filename,report_url,email,uploaded_at,finished_at,fps,Needle holder length [m],Scissors length [m],Forceps length [m],Needle holder visibility [s],...,Unnamed: 791,Unnamed: 792,Unnamed: 793,Unnamed: 794,Unnamed: 795,Unnamed: 796,Unnamed: 797,Unnamed: 798,Unnamed: 799,Unnamed: 800
859,test_end_3.mp4,http://suran.kky.zcu.cz:8000//uploader/web_rep...,mjirik@gapps.zcu.cz,2024-01-10 13:08,2024-01-10 12:18,30.0,3045975787.0,0.0,6079601432.0,6133333333.0,...,,,,,,,,,,
860,Glove_2.mp4,http://suran.kky.zcu.cz:8000//uploader/web_rep...,carina.bachmann@med.uni-jena.de,2023-06-13 07:40,2024-01-10 12:21,30.0,5302629762.0,3148750647.0,,1618666667.0,...,,,,,,,,,,
861,,,,,,,,,,,...,,,,,,,,,,
862,,,,,,,,,,,...,,,,,,,,,,
863,,,,,,,,,,,...,,,,,,,,2024-01-14 16:31:50,721336025.0,9958372819.0


In [5]:
# remove all records with filename NA or NaN
dfs = df[df["filename"].notna()]

# take all records with filename containing "Good", "Medium", "Bad", if the filename is the same, take the last one

dfs = dfs[dfs["filename"].str.contains("Good|Medium|Bad")].drop_duplicates(subset=["filename"], keep="last")

In [6]:
# add column "stars_auto" with 0 for "Good", 1 for "Medium", 2 for "Bad"
dfs["stars_auto"] = dfs["filename"].str.contains("Good").astype(int) * 0
dfs["stars_auto"] = dfs["filename"].str.contains("Medium").astype(int) * 1
dfs["stars_auto"] = dfs["filename"].str.contains("Bad").astype(int) * 2
# dfs["filename"].str.contains("Good|Medium|Bad")


In [7]:
len(dfs)

22

In [8]:
dfs.columns

Index(['filename', 'report_url', 'email', 'uploaded_at', 'finished_at', 'fps',
       'Needle holder length [m]', 'Scissors length [m]', 'Forceps length [m]',
       'Needle holder visibility [s]',
       ...
       'Unnamed: 792', 'Unnamed: 793', 'Unnamed: 794', 'Unnamed: 795',
       'Unnamed: 796', 'Unnamed: 797', 'Unnamed: 798', 'Unnamed: 799',
       'Unnamed: 800', 'stars_auto'],
      dtype='object', length=802)

### Replace "," with "."

In [9]:
# find the columns where its values contain single "," and digits
dfs.columns[dfs.apply(lambda x: (x.dtype is str) and (x.str.contains(",\d").any()))]
cols = []
for col in dfs.columns:
    try:
        if (dfs[col].str.contains(",\d").any()):
            # convert , to . and change the column type to float
            dfs[col] = dfs[col].str.replace(",", ".").astype(float)
            cols.append(col)
    except:
        pass
# cols

In [10]:
# dfs

In [11]:
# print name of columns containing needle holder
# dfs.columns[dfs.columns.str.contains("Needle holder")]

In [12]:
# dfs[["Needle holder stitch 1 length [m]", "Needle holder length [m]"]]

### If there is just one stitch, put ths information into stitch 1 measurements

In [13]:
# If column containing " stitch 1" in its name has empty values, fill them with  the values from the column with the same name but withuou " stitch 1"
for col in dfs.columns[dfs.columns.str.contains(" stitch 1")]:
    dfs[col] = dfs[col].fillna(dfs[col.replace(" stitch 1", "")])

In [14]:
# dfs[["Needle holder stitch 1 length [m]", "Needle holder length [m]"]]


# Find the correlation

In [15]:
# dfs.corr()

In [16]:
# find corelation between stars_auto and other numerical columns and sort them
dfs.corr()["stars_auto"].sort_values()



Forceps bbox visibility [%]   -0.479464
Stitches parallelism score    -0.330578
Scissors visibility [%]       -0.325639
n_stitches_by_user            -0.325079
duration_s_tracking           -0.269202
                                 ...   
Unnamed: 793                        NaN
Unnamed: 794                        NaN
Unnamed: 795                        NaN
Unnamed: 796                        NaN
Unnamed: 797                        NaN
Name: stars_auto, Length: 624, dtype: float64

In [17]:
# try to find the best columns to predict stars_auto
corr = dfs.corr()["stars_auto"].sort_values(ascending=False).head(10)
corr


stars_auto                              1.000000
Forceps stitch 1 length [m]             0.397985
qr_data_scene_width_m                   0.336219
qr_data_pix_size                        0.336219
Needle holder area presence [%]         0.319609
Forceps stitch 1 area presence [%]      0.318441
Forceps area presence [%]               0.304575
orig frame_height                       0.292419
Forceps length [m]                      0.277674
Needle holder bbox area presence [%]    0.277623
Name: stars_auto, dtype: float64

# Train predictor

In [18]:
import sklearn
# train a model to predict stars_auto
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# import SVM
from sklearn.svm import SVR
#train a model to predict stars_auto

columns = list(corr.index)
columns.append("filename")

dfsx = dfs[columns]
print(len(dfsx))
dfsx = dfsx.dropna()
print(len(dfsx))
X = dfsx[corr.index].drop("stars_auto", axis=1)
y = dfsx["stars_auto"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVR()
model.fit(X_train, y_train)

# 
# X

22
20


## Score

In [20]:
model.score(X_test, y_test)

-0.7824458388772826

In [27]:
model.score(X_train, y_train)

-0.4338763452314467

## Check the předictions

In [21]:
y_test

816    2
842    0
840    0
822    2
Name: stars_auto, dtype: int64

In [22]:
y_pred = model.predict(X_test)
y_pred

array([0.12636363, 0.11648296, 0.03380476, 0.10143237])

In [23]:
model.predict(X_train)


array([0.10069563, 0.11084839, 0.09947057, 0.10949899, 0.10615264,
       0.09285266, 0.12814825, 0.11112511, 0.12869557, 0.09870658,
       0.10005963, 0.10915682, 0.10023484, 0.10139677, 0.10023488,
       0.08063209])

In [24]:
y_train

829    0
826    2
833    0
824    2
843    0
841    0
835    2
823    2
830    0
857    2
825    0
834    2
828    0
832    0
839    0
827    0
Name: stars_auto, dtype: int64

In [25]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5

In [26]:
clf.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [65]:
y_test

816    2
842    0
840    0
822    2
Name: stars_auto, dtype: int32