In [22]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from preproces import text_clean

import pickle

In [23]:
# Run-mode switches, declared as a pair because the cells that load the data,
# print the classification report and pickle the predictions all branch on them.
# "TEST_DATSET" keeps its original spelling: every later cell reads that name.
VALIDATION_DATASET, TEST_DATSET = True, False

In [24]:
# Load the annotated split selected by the run-mode flags.
#
# Exactly one flag must be set:
#   * neither set -> `df` would never be defined and the next line would fail
#     with an unrelated NameError;
#   * both set    -> the test CSV would silently overwrite the validation frame,
#     which is later reported on and saved as if it were validation data.
if bool(VALIDATION_DATASET) == bool(TEST_DATSET):
    raise ValueError(
        'Set exactly one of VALIDATION_DATASET / TEST_DATSET to True '
        f'(got VALIDATION_DATASET={VALIDATION_DATASET!r}, TEST_DATSET={TEST_DATSET!r}).'
    )

if VALIDATION_DATASET:
    df = pd.read_csv('../../../Dataset/Annotated_Dataset/valid_annotated.csv')
else:
    df = pd.read_csv('../../../Dataset/Annotated_Dataset/test_annotated.csv')

df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary
0,30777,Financial and operational highlights,161,22,False
1,30777,Strategic report,183,6,True
2,30777,Global network,189,11,False
3,30777,Chairman’s statement,200,4,True
4,30777,Chief Executive’s review,204,4,True


In [25]:
# Per-column count of missing values in the freshly loaded frame.
df.isna().sum(axis=0)

file_id                  0
toc_section              0
toc_section_pos          0
toc_section_len          0
is_section_in_summary    0
dtype: int64

In [26]:
# Derive the text feature: run every table-of-contents title through the
# project's `text_clean` helper and keep the result next to the raw title.
df['toc_section_cleaned'] = df['toc_section'].apply(text_clean)

# Preview the raw and cleaned titles side by side.
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,30777,Financial and operational highlights,161,22,False,financi oper highlight
1,30777,Strategic report,183,6,True,strateg report
2,30777,Global network,189,11,False,global network
3,30777,Chairman’s statement,200,4,True,chairman statement
4,30777,Chief Executive’s review,204,4,True,chief execut review


In [42]:
# Restore the previously pickled label encoder.
# The context manager guarantees the file handle is closed; the original
# `pickle.load(open(...))` left it open until garbage collection.
# NOTE(security): unpickling executes arbitrary code -- only load artefacts
# produced by this project's own pipeline.
with open('model/label_encoder.pkl', 'rb') as label_encoder_file:
    label_encoder = pickle.load(label_encoder_file)

In [28]:
# Replace the target column with the integer codes produced by the restored
# label encoder, so it is directly comparable with the classifier's predictions.
df['is_section_in_summary'] = label_encoder.transform(df['is_section_in_summary'])

df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,30777,Financial and operational highlights,161,22,0,financi oper highlight
1,30777,Strategic report,183,6,1,strateg report
2,30777,Global network,189,11,0,global network
3,30777,Chairman’s statement,200,4,1,chairman statement
4,30777,Chief Executive’s review,204,4,1,chief execut review


In [29]:
# Model inputs: the cleaned title text plus the two numeric section attributes.
feature_list = [
    'toc_section_cleaned',
    'toc_section_pos',
    'toc_section_len',
]

# Same columns with the target appended (built as a new list, not an alias).
feature_label_list = feature_list + ['is_section_in_summary']

# The non-text feature columns.
categorical_cols = [
    'toc_section_pos',
    'toc_section_len',
]

In [30]:
# Give rows with a missing cleaned title an explicit placeholder token.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# column accessor: that form is chained in-place mutation, which pandas warns
# about and which, under Copy-on-Write, updates a temporary object and leaves
# `df` unchanged.
df['toc_section_cleaned'] = df['toc_section_cleaned'].fillna('missing')

In [31]:
# Feature matrix (columns named in `feature_list`) and the encoded target.
X = df.loc[:, feature_list]
y = df['is_section_in_summary']

In [43]:
# Restore the previously pickled TF-IDF vectorizer for section titles.
# The context manager closes the file handle deterministically; the original
# `pickle.load(open(...))` never closed it.
# NOTE(security): unpickling executes arbitrary code -- only load artefacts
# produced by this project's own pipeline.
with open('model/tfidf_vectorizer_toc_section.pkl', 'rb') as tfidf_vectorizer_file:
    tfidf_vectorizer_toc_section = pickle.load(tfidf_vectorizer_file)

In [33]:
# Vectorise the cleaned titles with the already-fitted TF-IDF vocabulary
# (transform only -- nothing is re-fitted on this split).
X_tfidf = tfidf_vectorizer_toc_section.transform(X.toc_section_cleaned)

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`. Prefer the new accessor and fall back to the old
# one so the cell runs on whichever version is installed.
if hasattr(tfidf_vectorizer_toc_section, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer_toc_section.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer_toc_section.get_feature_names()

# Suffix every vocabulary term with '_toc' to form the column names.
X_tfidf_columns = [str(term) + '_' + 'toc' for term in tfidf_terms]

# Reuse X's index so the index-aligned column assignments made when the
# design matrix is assembled cannot misalign if `df` ever carries a
# non-default index.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=X_tfidf_columns, index=X.index)



In [34]:
# Restore the previously pickled section classifier.
# The context manager closes the file handle deterministically; the original
# `pickle.load(open(...))` never closed it.
# NOTE(security): unpickling executes arbitrary code -- only load artefacts
# produced by this project's own pipeline.
# NOTE(review): this artefact is read from the working directory while the
# other two pickles live under 'model/' -- confirm the path is intentional.
with open('section_classification_model.pkl', 'rb') as clf_file:
    clf = pickle.load(clf_file)

In [35]:
# Assemble the design matrix: the TF-IDF columns followed by the two numeric
# section attributes. `assign` works on a copy, so X_tfidf_df is not modified.
X_new = X_tfidf_df.assign(
    toc_section_pos=X['toc_section_pos'],
    toc_section_len=X['toc_section_len'],
)

# Hard class predictions from the restored classifier.
y_pred = clf.predict(X_new)

In [36]:
# The report is only printed for the validation split.
if VALIDATION_DATASET:
    print(classification_report(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85      6947
           1       0.63      0.31      0.41      2650

    accuracy                           0.76      9597
   macro avg       0.71      0.62      0.63      9597
weighted avg       0.74      0.76      0.73      9597



In [37]:
# Class probabilities from the classifier, with one column per label-encoder
# class.
y_pred_prob = clf.predict_proba(X_new)
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=label_encoder.classes_)

# Work on a copy of `df` and append the hard prediction plus both class
# probabilities, in the order pred, 'False', 'True'.
df_predicted = df.assign(
    pred=y_pred,
    **{
        'False': y_pred_prob_df[False],
        'True': y_pred_prob_df[True],
    },
)
df_predicted

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned,pred,False,True
0,30777,Financial and operational highlights,161,22,0,financi oper highlight,1,0.412114,0.587886
1,30777,Strategic report,183,6,1,strateg report,0,0.932721,0.067279
2,30777,Global network,189,11,0,global network,0,0.748624,0.251376
3,30777,Chairman’s statement,200,4,1,chairman statement,1,0.018934,0.981066
4,30777,Chief Executive’s review,204,4,1,chief execut review,1,0.006269,0.993731
...,...,...,...,...,...,...,...,...,...
9592,33155,Audit Committee report,6247,345,0,audit committe report,0,0.975820,0.024180
9593,33155,Directors’ Remuneration report,6592,1406,0,director remuner report,0,0.998092,0.001908
9594,33155,Corporate Governance Code,7998,69,1,corpor govern code,0,0.986258,0.013742
9595,33155,Directors’ biographies,8067,570,0,director biographi,0,0.991681,0.008319


In [44]:
# Persist the prediction frame for the split that was scored.
# Each write goes through a context manager so the file is flushed and closed
# before the cell finishes; the original `pickle.dump(..., open(..., 'wb'))`
# left the handle open, so a complete file on disk was not guaranteed.
if VALIDATION_DATASET:
    with open('out/validation_df_predicted.pkl', 'wb') as validation_out_file:
        pickle.dump(df_predicted, validation_out_file)
if TEST_DATSET:
    with open('out/test_df_predicted.pkl', 'wb') as test_out_file:
        pickle.dump(df_predicted, test_out_file)