# Initial Setups

This section sets up the notebook for table classification: it enables module autoreloading and imports the required libraries.

In [8]:
# iPython Setting
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
import pandas as pd
from cellstructure import Datasheet
from files import get_list_of_files_with_ext, join

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# String Cleanup

In this section, a single datasheet is loaded and the cleaned-up keywords extracted from each of its tables are inspected.

In [2]:
# Folder holding the labelled Excel exports; later cells read this path too
path_to_excel_folder = "/Volumes/T7/thesis-data/test/selected_excel/"

# List the .xlsx files in that folder. randomise=True is passed, so the first
# entry presumably differs between runs -- TODO confirm against files.py
file_list = get_list_of_files_with_ext(
    path_to_folder=path_to_excel_folder, ext="xlsx", randomise=True, verbose=False
)

# Pick one file to inspect in the next cell
single_file = file_list[0]
print(single_file)

/Volumes/T7/thesis-data/test/selected_excel/59bf894a49f3c.xlsx


In [3]:
# Load every table found in the selected workbook
ds = Datasheet()
ds.load_tables_from_excel(path_to_excel=single_file)
print(ds.name)

# Per table: its name, the raw frame, then the extracted keyword list (`low`),
# each preceded by a blank line
for table in ds.tables:
    print("", table.name, "", table.raw_df, "", table.low, sep="\n")

59bf894a49f3c.xlsx

Table_1

           0      1      2      3       4       5      6
0     Eff(%)  Pm(W)  Vm(V)  Im(A)  Voc(V)  Isc(A)  FF(%)
1  18.5-18.6   2.87  0.528  5.428   0.632   5.751  78.94
2  18.4-18.5   2.86  0.528  5.408   0.632   5.731  78.84
3  18.3-18.4   2.84  0.528  5.378   0.631   5.712  78.77
4  18.2-18.3   2.83  0.527  5.363   0.631   5.694  78.67
5  18.1-18.2   2.81  0.526  5.338   0.630   5.678  78.54
6  18.0-18.1   2.79  0.525  5.322   0.629   5.664  78.39
7  17.8-18.0   2.78  0.523  5.301   0.629   5.640  78.19
8  17.6-17.8   2.74  0.521  5.266   0.628   5.610  77.90

['Eff', 'Pm', 'W', 'Vm', 'V', 'Im', 'A', 'Voc', 'V', 'Isc', 'A', 'FF']

Table_2

           0                                              1          2  \
0  CELL TYPE                                          a(mm)      b(mm)   
1       M652                                       0.5\n125  62    0.5   
2   Comments                    a——Side length of the cell;              
3             b——Center

# Keyword Modelling

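The keywords and their labels are converted to sentences: for each labelled table, its keywords are joined into one space-separated string and paired with the table's class. This is not the most efficient way to do it (the Excel file is loaded again for every labelled table), but it gets the job done. It needs to be reformulated if there is a large number of files and tables.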

In [4]:
# Header-less CSV with one row per labelled table: filename, table name, class
labels_file = "/Volumes/T7/thesis-data/test/selected_excel/labels.csv"
labels_df = pd.read_csv(labels_file, names=['filename', 'tablename', 'class'])
labels_list = labels_df.values.tolist()

# NOTE(review): a single Datasheet instance is reused for every file; this
# assumes load_tables_from_excel replaces (rather than appends to) ds.tables
# -- confirm in cellstructure.py
ds = Datasheet()
useful_list = []

for filename, tablename, table_class in labels_list:
    # The workbook is re-read for every label row, even when consecutive rows
    # point at the same file
    ds.load_tables_from_excel(join(path_to_excel_folder, filename))

    # Keep only the table(s) whose name matches this label row and store the
    # keywords as one space-separated "sentence" next to the class
    useful_list.extend(
        [' '.join(table.low), table_class]
        for table in ds.tables
        if table.name == tablename
    )

useful_df = pd.DataFrame(useful_list, columns=['keywords', 'class'])
useful_df.head()

Unnamed: 0,keywords,class
0,Efficiency Code Unit Efficiency Eff Power Ppm ...,e
1,Dimension mm mm mm Thickness Si m m Front Sili...,d
2,Efficiency Code Unit Efficiency Eff Power Ppm ...,e
3,Light Intensity Dependence Intensity Wm Vpm Ipm,o
4,Temperature Coefficients Current Temperature C...,t


In [5]:
# Persist the keyword/class pairs to disk. The file is written without a
# header row or index column; the "Loading Dataset" cell reads it back and
# supplies the column names itself via `names=`.
path_to_useful_csv = "/Volumes/T7/thesis-data/test/selected_excel/cleaned-up-2.csv"
useful_df.to_csv(path_to_useful_csv, header=False, index=False)

# Fitting the Model

In this section we train and evaluate classifiers that predict a table's class from its keywords.

### Imports

Setting up the imports for word vectorisation and classification

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

### Loading Dataset

The dataset that is to be used for training and evaluation is loaded in this section.

In [5]:
path_to_useful_csv = "/Volumes/T7/thesis-data/test/selected_excel/cleaned-up-2.csv"

# Read the header-less CSV with explicit column names and replace missing
# values with empty strings (an empty keyword field is read back as NaN,
# which the text vectorisers cannot handle)
useful_df = pd.read_csv(
    path_to_useful_csv, names=['keywords', 'class']
).fillna(value="")
useful_df.head()

Unnamed: 0,keywords,class
0,Efficiency Code Unit Efficiency Eff Power Ppm ...,e
1,Dimension mm mm mm Thickness Si m m Front Sili...,d
2,Efficiency Code Unit Efficiency Eff Power Ppm ...,e
3,Light Intensity Dependence Intensity Wm Vpm Ipm,o
4,Temperature Coefficients Current Temperature C...,t


### Word Vectorisation

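The keyword sentences are converted into numeric feature vectors in two ways: raw term counts (`CountVectorizer`) and TF-IDF weights (`TfidfVectorizer`).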

In [32]:
# Inputs (keyword sentences) and targets (class labels)
X = useful_df['keywords']
y = useful_df['class']

# Two bag-of-words encoders, each capped at 5000 vocabulary terms
tf = TfidfVectorizer(max_features=5000)
cv = CountVectorizer(max_features=5000)

# NOTE(review): both vectorisers are fitted on the full dataset, i.e. before
# any train/test or cross-validation split, so vocabulary and IDF statistics
# also reflect the samples later used for testing
X_tfidf = tf.fit_transform(X)
X_cv = cv.fit_transform(X)

In [42]:
# Number of labelled tables per class (shows the class balance).
# A `useful_df.head()` expression that used to precede this line was removed:
# only a cell's last expression is displayed, so its result was discarded.
useful_df.groupby("class").count()

Unnamed: 0_level_0,keywords
class,Unnamed: 1_level_1
d,40
e,62
o,77
t,36


### KFold Training and Test

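In this section we evaluate the K-Nearest-Neighbours and Naive Bayes classifiers with 5-fold cross-validation, once on the TF-IDF features and once on the count-vectorised features, and average the per-fold classification reports.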

In [33]:
def avg_classification_report(classification_reports: list, classes=("d", "e", "t", "o")):
    """
    Average per-class precision/recall/F1 and the overall accuracy over
    several classification reports.

    Each report must be a dict shaped like the output of
    ``sklearn.metrics.classification_report(..., output_dict=True)``: one
    entry per class label holding "precision", "recall" and "f1-score",
    plus a top-level "accuracy" value.

    Parameters
    ----------
    classification_reports : list
        Non-empty list of report dicts (e.g. one per cross-validation fold).
    classes : iterable of str, optional
        Class labels to average, in the order they should appear in the
        result. Defaults to the four labels of this table classification
        task ("d", "e", "t", "o").

    Returns
    -------
    dict
        ``{label: {"precision": ..., "recall": ..., "f1": ...}, ...,
        "accuracy": ...}`` where every value is the arithmetic mean over
        all reports. Note that the F1 key is "f1" here, not "f1-score".

    Raises
    ------
    ValueError
        If ``classification_reports`` is empty.
    KeyError
        If a report has no entry for one of ``classes`` (e.g. the class did
        not occur in that fold) or lacks "accuracy".
    """

    print("Averaging Classifcation Reports")

    total_reports = len(classification_reports)
    if total_reports == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError
        raise ValueError("classification_reports must contain at least one report")

    def _mean(values):
        # Plain left-to-right float sum divided by the report count
        return sum(values, 0.0) / total_reports

    avgd_output = {}

    for label in classes:
        # Direct indexing (not .get) so that a missing class raises a
        # KeyError naming the label rather than failing on None
        avgd_output[label] = {
            "precision": _mean(r[label]["precision"] for r in classification_reports),
            "recall": _mean(r[label]["recall"] for r in classification_reports),
            "f1": _mean(r[label]["f1-score"] for r in classification_reports),
        }

    avgd_output["accuracy"] = _mean(r["accuracy"] for r in classification_reports)

    return avgd_output

In [36]:
# 5 contiguous, unshuffled folds: results are deterministic but depend on the
# row order of the dataset
kf = KFold(n_splits=5, random_state=None, shuffle=False)

knn = KNeighborsClassifier(n_neighbors=7)
nb = MultinomialNB()

# One classification-report dict per fold, per (classifier, feature set) pair
knn_tfidf_folds = []
knn_cv_folds = []
nb_tfidf_folds = []
nb_cv_folds = []

# Each feature matrix is paired with the lists that collect its fold reports,
# so both encodings go through exactly the same evaluation code
feature_sets = [
    (X_tfidf, knn_tfidf_folds, nb_tfidf_folds),
    (X_cv, knn_cv_folds, nb_cv_folds),
]

for features, knn_folds, nb_folds in feature_sets:
    for i, (train_index, test_index) in enumerate(kf.split(features)):
        print(f"Fold {i}:")

        X_train = features[train_index]
        X_test = features[test_index]
        # kf.split yields positional indices, so select labels by position
        # (.iloc) rather than by index label
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        # K Nearest Neighbours (refitted from scratch on every fold)
        knn.fit(X_train, y_train)
        knn_pred = knn.predict(X_test)
        knn_folds.append(classification_report(y_test, knn_pred, output_dict=True))

        # Naive Bayes (refitted from scratch on every fold)
        nb.fit(X_train, y_train)
        nb_pred = nb.predict(X_test)
        nb_folds.append(classification_report(y_test, nb_pred, output_dict=True))



Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:


In [37]:
def pretty(d, indent=0):
    """
    Print a (possibly nested) dict as a tab-indented tree.

    Every key is printed on its own line at the current indent level. A
    dict value is printed recursively one level deeper; any other value is
    printed on the following line, one level deeper than its key.
    """
    key_prefix = '\t' * indent
    value_prefix = '\t' * (indent + 1)

    for key, value in d.items():
        print(key_prefix + str(key))

        if not isinstance(value, dict):
            print(value_prefix + str(value))
            continue

        pretty(value, indent + 1)

# (heading, per-fold reports) pairs, in display order
fold_results = [
    ("KNN TF-IDF Folds", knn_tfidf_folds),
    ("NB TF-IDF", nb_tfidf_folds),
    ("KNN Count Vectoriser", knn_cv_folds),
    ("NB Count Vectoriser", nb_cv_folds),
]

# Heading, averaged report as an indented tree, then a blank separator line
for heading, folds in fold_results:
    print(heading)
    pretty(avg_classification_report(folds))
    print()

KNN TF-IDF Folds
Averaging Classifcation Reports
d
	precision
		1.0
	recall
		0.6561904761904762
	f1
		0.7742857142857142
e
	precision
		0.9550000000000001
	recall
		0.6
	f1
		0.7260453272217979
t
	precision
		1.0
	recall
		0.7878571428571429
	f1
		0.8705128205128204
o
	precision
		0.6099924552098466
	recall
		0.9875
	f1
		0.7477586969747249
accuracy
	0.7720930232558139

NB TF-IDF
Averaging Classifcation Reports
d
	precision
		0.9318181818181819
	recall
		0.975
	f1
		0.9485714285714286
e
	precision
		0.9253621378621378
	recall
		0.9875
	f1
		0.954591575091575
t
	precision
		1.0
	recall
		0.915
	f1
		0.9513725490196079
o
	precision
		0.9491666666666667
	recall
		0.9277272727272727
	f1
		0.9347950439869717
accuracy
	0.944186046511628

KNN Count Vectoriser
Averaging Classifcation Reports
d
	precision
		0.95
	recall
		0.8734126984126984
	f1
		0.8997619047619047
e
	precision
		0.9236363636363636
	recall
		0.8716666666666665
	f1
		0.8948541313758704
t
	precision
		0.975
	recall
		0.715952380

### Traditional - Training and Test

In this section, the training and evaluation of the dataset takes place.

In [9]:
# Hold out 20% of the samples for testing. The TF-IDF matrix is split rather
# than `X` (the raw keyword strings): the classifiers fitted below need
# numeric features, and the Naive Bayes model is later pickled together with
# the TF-IDF vectoriser `tf`.
# random_state=None means the split (and the scores) differ on every run.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=None)

In [10]:
# Fresh estimators fitted on the single train split (fit() returns the
# estimator itself, so construction and fitting can be chained)
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

nb = MultinomialNB()
# Left as the cell's last expression so the fitted estimator is displayed
nb.fit(X_train, y_train)

MultinomialNB()

In [11]:
# Predicted class labels for the held-out test split, one array per model
knn_pred, nb_pred = knn.predict(X_test), nb.predict(X_test)

In [12]:
# Text classification report (precision / recall / F1 / support) per model
for title, predictions in (("K Nearest Neighbours", knn_pred), ("Naive Bayes", nb_pred)):
    print(title)
    print(classification_report(y_test, predictions))

K Nearest Neighbours
              precision    recall  f1-score   support

           d       1.00      0.67      0.80         6
           e       1.00      0.70      0.82        10
           o       0.75      1.00      0.86        18
           t       1.00      0.89      0.94         9

    accuracy                           0.86        43
   macro avg       0.94      0.81      0.86        43
weighted avg       0.90      0.86      0.86        43

Naive Bayes
              precision    recall  f1-score   support

           d       1.00      1.00      1.00         6
           e       0.91      1.00      0.95        10
           o       0.94      0.94      0.94        18
           t       1.00      0.89      0.94         9

    accuracy                           0.95        43
   macro avg       0.96      0.96      0.96        43
weighted avg       0.96      0.95      0.95        43



#  Saving the model

In the following cells, the trained Naive Bayes classifier and the TF-IDF vectoriser are saved with pickle for later use, then loaded back to check that the round trip works.

In [13]:
import pickle

In [18]:
# Save the classification model. The context manager guarantees the file is
# flushed and closed, even if pickling raises.
with open("nb_classifier.pickle", "wb") as clf_file:
    pickle.dump(nb, clf_file)

# Save the Vectoriser (the fitted TF-IDF vectoriser; it must be stored with
# the classifier so new text is encoded with the same vocabulary)
with open("vectoriser.pickle", "wb") as vec_file:
    pickle.dump(tf, vec_file)

In [19]:
# Reload both artefacts to verify the round trip. pickle.load can execute
# arbitrary code, so only load pickle files from a trusted source (here: the
# files written by the previous cell).
with open("nb_classifier.pickle", "rb") as clf_file:
    clf = pickle.load(clf_file)
with open("vectoriser.pickle", "rb") as vec_file:
    vec = pickle.load(vec_file)

print(clf)
print(vec)

MultinomialNB()
TfidfVectorizer(max_features=5000)


In [20]:
# Predict with the classifier reloaded from disk and score *those*
# predictions (not the in-memory model's `nb_pred`), so the report actually
# verifies the pickled model.
clf_pred = clf.predict(X_test)
print(clf_pred)
print("Naive Bayes")
print(classification_report(y_test, clf_pred))

['o' 'o' 'e' 'o' 'o' 'd' 'e' 't' 'o' 'o' 'e' 't' 'd' 't' 'd' 'o' 'e' 'o'
 'd' 'o' 'e' 't' 'o' 't' 'o' 'o' 't' 'e' 't' 'o' 'o' 'e' 'e' 'o' 'd' 'e'
 't' 'e' 'o' 'o' 'd' 'e' 'o']
Naive Bayes
              precision    recall  f1-score   support

           d       1.00      1.00      1.00         6
           e       0.91      1.00      0.95        10
           o       0.94      0.94      0.94        18
           t       1.00      0.89      0.94         9

    accuracy                           0.95        43
   macro avg       0.96      0.96      0.96        43
weighted avg       0.96      0.95      0.95        43



# Visualisation & Testing

In the following cells, the best model (Naive Bayes with TF-IDF features) is tested on tables loaded from a new, unlabelled file.

In [33]:
# Load an unlabelled workbook
sample_ds = Datasheet()
sample_ds.load_tables_from_excel("/Volumes/T7/thesis-data/test/single_excel/test.xlsx")

# Keywords of the first table: as a list (`low`) and joined into a single
# space-separated sentence (`sow`), the form the vectoriser was fitted on
low = sample_ds.tables[0].low
sow = ' '.join(low)
print(sow, low, sep="\n")

# Encode with the reloaded vectoriser and classify with the reloaded model
tow = vec.transform([sow])
print(clf.predict(tow))

Electrical Performance No Efficiency Pmpp W Umpp V Impp A Uoc v Isc A
['Electrical', 'Performance', 'No', 'Efficiency', 'Pmpp', 'W', 'Umpp', 'V', 'Impp', 'A', 'Uoc', 'v', 'Isc', 'A']
['e']


In [43]:
# Build a Datasheet directly from an Excel file, passing the paths of the
# pickled classifier and vectoriser saved earlier in this notebook.
# NOTE(review): the recorded output (one class label per line) suggests the
# constructor classifies every table and prints the predicted class --
# confirm in cellstructure.py.
sample_ds_2 = Datasheet(
    path_to_excel="/Volumes/T7/thesis-data/test/single_excel/Allesun_AV-166-9M.xlsx",
    path_to_clf="nb_classifier.pickle",
    path_to_vec="vectoriser.pickle"
    )

d
o
t
e
