In [2]:
import numpy as np
import pandas as pd

In [7]:
# Load bible versions and add version column
asv = pd.read_csv('t_asv.csv')
asv['Version'] = 'asv'
bbe = pd.read_csv('t_bbe.csv')
bbe['Version'] = 'bbe'
dby = pd.read_csv('t_dby.csv', encoding="ISO-8859-1")
dby['Version'] = 'dby'
kjv = pd.read_csv('t_kjv.csv')
kjv['Version'] = 'kjv'
wbt = pd.read_csv('t_wbt.csv')
wbt['Version'] = 'wbt'
web = pd.read_csv('t_web.csv')
web['Version'] = 'web'
ylt = pd.read_csv('t_ylt.csv')
ylt['Version'] = 'ylt'
versions = [asv, bbe, dby, kjv, wbt, web, ylt]

In [8]:
allVersions = pd.concat(versions, ignore_index=True)

In [9]:
# See number of verses initially in allVersions
len(allVersions.index)

217719

In [17]:
allVersionsNoDuplicates = allVersions.drop_duplicates(subset=['t'], keep=False)

In [18]:
# See number of verses without duplicates
len(allVersionsNoDuplicates.index)

186759

In [24]:
# See how many non duplicate verses there were as a percentage 
len(allVersionsNoDuplicates.index) / len(allVersions.index)

0.8577983547600347

In [22]:
# Run on all versions without duplicates at once NB
# Using language processing
from sklearn.model_selection import train_test_split
X = allVersionsNoDuplicates['t']
y = allVersionsNoDuplicates['Version']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Na√Øve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

text_clf_nb.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_nb.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

[[3753  188 2416  184  158  805 1310]
 [  10 9832   31    1    3   42  108]
 [1579  304 4664   71  158 1290 1574]
 [2568  105 1571  826  288  499  787]
 [1437  193 1851  169 1411 1039  785]
 [ 266  474  748   14   85 7513  388]
 [ 352  219  789   26   48  398 8301]]
              precision    recall  f1-score   support

         asv       0.38      0.43      0.40      8814
         bbe       0.87      0.98      0.92     10027
         dby       0.39      0.48      0.43      9640
         kjv       0.64      0.12      0.21      6644
         wbt       0.66      0.20      0.31      6885
         web       0.65      0.79      0.71      9488
         ylt       0.63      0.82      0.71     10133

    accuracy                           0.59     61631
   macro avg       0.60      0.55      0.53     61631
weighted avg       0.60      0.59      0.56     61631



In [21]:
# Run on all versions without duplicates at once LSVC
# Using language processing
from sklearn.model_selection import train_test_split
X = allVersionsNoDuplicates['t']
y = allVersionsNoDuplicates['Version']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

[[3287  137 1592 1448  778  611  889]
 [  23 9643   54    1   21  126  106]
 [1558  274 4054  647  908  995 1218]
 [1395   63  589 2909 1005  269  411]
 [ 638  143  764  689 3711  655  521]
 [ 339  392  546  116  410 7554  230]
 [ 357  176  560  231  248  247 8093]]
              precision    recall  f1-score   support

         asv       0.43      0.38      0.40      8742
         bbe       0.89      0.97      0.93      9974
         dby       0.50      0.42      0.46      9654
         kjv       0.48      0.44      0.46      6641
         wbt       0.52      0.52      0.52      7121
         web       0.72      0.79      0.75      9587
         ylt       0.71      0.82      0.76      9912

    accuracy                           0.64     61631
   macro avg       0.61      0.62      0.61     61631
weighted avg       0.62      0.64      0.63     61631



In [23]:
# Run on all versions without duplicates at once LR
# Using language processing
from sklearn.model_selection import train_test_split
X = allVersionsNoDuplicates['t']
y = allVersionsNoDuplicates['Version']

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Logistic Regression:
text_clf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression()),
])

text_clf_lr.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lr.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[3449  152 1802 1105  792  617  897]
 [  27 9521   84    8   34  182  171]
 [1396  277 4339  526  889 1025 1188]
 [1469  109  714 2636  901  339  476]
 [ 613  212  888  428 3602  675  467]
 [ 255  450  583   69  365 7490  276]
 [ 391  163  629  160  280  351 8159]]
              precision    recall  f1-score   support

         asv       0.45      0.39      0.42      8814
         bbe       0.87      0.95      0.91     10027
         dby       0.48      0.45      0.46      9640
         kjv       0.53      0.40      0.46      6644
         wbt       0.52      0.52      0.52      6885
         web       0.70      0.79      0.74      9488
         ylt       0.70      0.81      0.75     10133

    accuracy                           0.64     61631
   macro avg       0.61      0.62      0.61     61631
weighted avg       0.62      0.64      0.63     61631



In [27]:
# See what percentage of duplicates each version had
versionStrings = ['asv', 'bbe', 'dby', 'kjv', 'wbt', 'web', 'ylt']
for v in versionStrings:
    numUnique = len(allVersionsNoDuplicates.loc[allVersionsNoDuplicates['Version'] == v]) / len(allVersions.loc[allVersions['Version'] == v])
    print("Percent of unique verses for version " + v + ": " + str(numUnique))

Percent of unique verses for version asv: 0.8515898787898274
Percent of unique verses for version bbe: 0.9733144712728675
Percent of unique verses for version dby: 0.9383903019389691
Percent of unique verses for version kjv: 0.6485548017876089
Percent of unique verses for version wbt: 0.6781328361087893
Percent of unique verses for version web: 0.9313227445180374
Percent of unique verses for version ylt: 0.9833135067356846


In [31]:
# Remove kjv and wbt because they have the most duplicates
allVersionsNoDuplicatesNoKJV = allVersionsNoDuplicates[allVersionsNoDuplicates.Version != 'kjv']
allVersionsNoDuplicatesNoKJVWBT = allVersionsNoDuplicatesNoKJV[allVersionsNoDuplicatesNoKJV.Version != 'wbt']

In [33]:
# Run on all versions without duplicates at once LSVC No KJV
# Using language processing
from sklearn.model_selection import train_test_split
X = allVersionsNoDuplicatesNoKJV['t']
y = allVersionsNoDuplicatesNoKJV['Version']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

[[4507  127 1641  995  621  836]
 [  26 9746   54   26  116  109]
 [2025  231 4231 1069  962 1167]
 [ 806  149  725 4070  573  533]
 [ 460  387  566  408 7561  251]
 [ 515  158  634  310  252 8127]]
              precision    recall  f1-score   support

         asv       0.54      0.52      0.53      8727
         bbe       0.90      0.97      0.93     10077
         dby       0.54      0.44      0.48      9685
         wbt       0.59      0.59      0.59      6856
         web       0.75      0.78      0.77      9633
         ylt       0.74      0.81      0.77      9996

    accuracy                           0.70     54974
   macro avg       0.68      0.69      0.68     54974
weighted avg       0.69      0.70      0.69     54974



In [32]:
# Run on all versions without duplicates at once LSVC No KJV WBT
# Using language processing
from sklearn.model_selection import train_test_split
X = allVersionsNoDuplicatesNoKJVWBT['t']
y = allVersionsNoDuplicatesNoKJVWBT['Version']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

[[4974  135 1880  753  936]
 [  37 9583   48  134  114]
 [2282  259 4796 1112 1307]
 [ 557  356  674 7705  279]
 [ 566  179  711  291 8345]]
              precision    recall  f1-score   support

         asv       0.59      0.57      0.58      8678
         bbe       0.91      0.97      0.94      9916
         dby       0.59      0.49      0.54      9756
         web       0.77      0.81      0.79      9571
         ylt       0.76      0.83      0.79     10092

    accuracy                           0.74     48013
   macro avg       0.72      0.73      0.73     48013
weighted avg       0.73      0.74      0.73     48013

