In [22]:
import pandas as pd
from sklearn import datasets, svm, metrics
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [23]:
avila =  pd.read_csv('./avila-bible-datamad1019/training_dataset.csv')

In [24]:
avila.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,scribe
0,0,-0.091897,0.2976,0.079145,0.196496,0.261718,1.26996,0.446679,-0.751707,0.001721,0.998901,Philippus
1,1,-0.091897,0.226939,0.267634,0.024091,0.261718,-0.806282,0.597681,-0.601277,0.126447,-0.909619,Paithonius
2,2,0.167323,0.313302,0.168055,-0.383198,0.261718,0.190314,0.824183,0.55825,-0.247731,-0.148073,Marcus
3,3,-0.017834,-0.22843,0.37077,1.293671,0.17234,0.896237,0.182426,0.416867,1.373706,0.868284,Noaelius
4,4,0.043885,0.407516,-0.120014,0.281743,0.261718,-0.183409,0.106925,0.142896,0.531806,-0.101311,Marcus


In [25]:
X = avila[["F1","F2","F3","F4","F8","F10"]]


In [26]:
y = avila.scribe

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.01)

ValueError: Found input variables with inconsistent numbers of samples: [8012, 12017]

In [54]:
X_test.head()

Unnamed: 0,F1,F2,F3,F4,F8,F10
11332,-0.00549,0.760819,-0.095119,-0.016936,0.086398,-0.731975
204,0.117948,0.124874,0.128935,0.3837,-0.044928,-1.038175
10567,0.105604,-0.087108,0.367214,1.522618,0.856141,0.369237
4648,0.031541,-2.426761,0.37077,0.77977,0.590014,1.189673
8324,-0.116585,0.305451,0.036468,-0.41038,0.438205,-0.289769


In [55]:
y_test.head()

0     Philippus
1        Marcus
2        Marcus
3        Marcus
4    Paithonius
Name: scribe, dtype: object

In [56]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [57]:
models = {
    "svm": LinearSVC(),
    "logistic": LogisticRegression(solver='lbfgs', max_iter=2000),
    "forest": RandomForestClassifier()
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

Training model: svm




Training model: logistic
Training model: forest




In [58]:
d = {modelName:model.predict(X_test) for modelName, model in models.items()}

df = pd.DataFrame(d)
y_test.reset_index(inplace=True, drop=True)
df["gt"] = y_test
df

Unnamed: 0,svm,logistic,forest,gt
0,Marcus,Marcus,Philippus,Philippus
1,Marcus,Marcus,Marcus,Marcus
2,Marcus,Marcus,Marcus,Marcus
3,Marcus,Marcus,Marcus,Marcus
4,Marcus,Marcus,Paithonius,Paithonius
...,...,...,...,...
1197,Marcus,Marcus,Philippus,Philippus
1198,Marcus,Marcus,Franciscus,Franciscus
1199,Marcus,Marcus,Philippus,Philippus
1200,Marcus,Marcus,Marcus,Marcus


In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from IPython.core.display import display, HTML

switchColor = lambda value: f"<b style=\"color:orange\">{round(value,2)}</b>" if value < 0.8 else f"<b style=\"color:green\">{round(value,2)}</b>"
printBonito = lambda label,val: display(HTML(f"<span style=\"padding-left:20px\">The {label} is: {switchColor(val)}</span>"))

for modelName, model in models.items():    
    print(f"Evaluating model [{modelName}]:")
    printBonito("Accuracy", accuracy_score(df["gt"],df[modelName]))
    printBonito("Precision", precision_score(df["gt"],df[modelName],average='weighted'))
    printBonito("Recall", recall_score(df["gt"],df[modelName],average='weighted'))

Evaluating model [svm]:


  'precision', 'predicted', average, warn_for)


Evaluating model [logistic]:


Evaluating model [forest]:


In [60]:
avila_test =  pd.read_csv('./avila-bible-datamad1019/test_dataset.csv')

In [61]:
avila_test.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
0,0,-0.042522,0.007106,0.264078,-0.162546,0.17234,-1.055431,0.106925,0.680858,0.781258,-0.901193
1,1,0.31545,0.540986,0.029355,-0.395163,0.261718,-1.055431,0.182426,-0.643408,0.095265,-0.925038
2,2,-0.351118,-0.244132,0.594823,1.592678,0.797987,0.397939,-1.025587,1.157371,1.591976,1.278013
3,3,0.327793,0.336855,-0.020434,1.086893,0.261718,-0.307984,0.220177,0.449873,-0.528364,-0.276773
4,4,0.019197,-0.087108,0.384996,0.794958,0.261718,0.314889,-0.044076,-1.067421,-0.715453,0.440809


In [62]:
X = avila_test[["F1","F2","F3","F4","F8","F10"]]

In [63]:
d = {modelName:model.predict(X) for modelName, model in models.items()}
print(d)

df = pd.DataFrame(d)
df.drop(columns=['svm', 'logistic'], inplace=True)
df.rename(columns={"forest": "scribe"}, inplace=True)
df

{'svm': array(['Marcus', 'Marcus', 'Marcus', ..., 'Marcus', 'Marcus', 'Marcus'],
      dtype=object), 'logistic': array(['Marcus', 'Marcus', 'Marcus', ..., 'Marcus', 'Marcus', 'Marcus'],
      dtype=object), 'forest': array(['Franciscus', 'Ubuntius', 'Noaelius', ..., 'Marcus', 'Philippus',
       'Franciscus'], dtype=object)}


Unnamed: 0,scribe
0,Franciscus
1,Ubuntius
2,Noaelius
3,Marcus
4,Marcus
...,...
8007,Paithonius
8008,Marcus
8009,Marcus
8010,Philippus


In [64]:
df.to_csv('./avila-bible-datamad1019/submission.csv')