In [44]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from preproces import text_clean

import pickle
import os

In [2]:
df = pd.read_csv('../../../Dataset/Annotated_Dataset/train_annotated_label_corrected.csv')
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary
0,10023,Highlights,37,60,True
1,10023,The Company at a glance,97,88,True
2,10023,Our strategy for growth,185,37,False
3,10023,Chairman’s statement,222,160,True
4,10023,Chief Executive’s report,382,187,True


In [3]:
df.isna().sum()

file_id                  0
toc_section              0
toc_section_pos          0
toc_section_len          0
is_section_in_summary    0
dtype: int64

In [6]:
df['toc_section_cleaned'] = df.toc_section.apply(text_clean)
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,10023,Highlights,37,60,True,highlight
1,10023,The Company at a glance,97,88,True,compani glanc
2,10023,Our strategy for growth,185,37,False,strategi growth
3,10023,Chairman’s statement,222,160,True,chairman statement
4,10023,Chief Executive’s report,382,187,True,chief execut report


In [7]:
toc_section_values = df.toc_section_cleaned.unique()
print('Num of sections', len(toc_section_values))
num = 0
for toc_section_value in toc_section_values:
    value_counts = df[df['toc_section_cleaned'] == toc_section_value]['is_section_in_summary'].value_counts()
    try:
        total_counts = value_counts.at[False] + value_counts.at[True]
        majority_value = True
        if value_counts.at[False] > value_counts.at[True]:
            majority_value = False
        
        per_majority = value_counts.at[majority_value]/total_counts
        
        if per_majority >= 0.7:
            df.loc[df['toc_section_cleaned'] == toc_section_value, 'is_section_in_summary'] = majority_value
        
    except:
        pass
    if num%500 ==0:
        print('Processed: ',toc_section_value, ' ',  num+1)
    num = num + 1

Num of sections 8592
Processed:  highlight   1
Processed:  inform   501
Processed:  note form part financi statement   1001
Processed:  st tement tot al recognis   1501
Processed:  balanc sheet sabmil plc   2001
Processed:  unaudit statement net commerci oil ga   2501
Processed:  oper efcienc   3001
Processed:  acquisit complet year   3501
Processed:  lambert smith hampton   4001
Processed:  technolog updat   4501
Processed:  corpor govern continu   5001
Processed:  bodycot   5501
Processed:  c orpor govern   6001
Processed:  energyprocess   6501
Processed:  subsurfac expert   7001
Processed:  autocentr kpi   7501
Processed:  sust ainabl review   8001
Processed:  regulatori challeng   8501


In [8]:
label_encoder = LabelEncoder()
df.is_section_in_summary = label_encoder.fit_transform(df.is_section_in_summary)
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,10023,Highlights,37,60,1,highlight
1,10023,The Company at a glance,97,88,1,compani glanc
2,10023,Our strategy for growth,185,37,0,strategi growth
3,10023,Chairman’s statement,222,160,1,chairman statement
4,10023,Chief Executive’s report,382,187,1,chief execut report


In [12]:
feature_label_list=['toc_section_cleaned', 'toc_section_pos', 'toc_section_len', 'is_section_in_summary']
feature_list=['toc_section_cleaned', 'toc_section_pos', 'toc_section_len']
categorical_cols = ['toc_section_pos', 'toc_section_len']

In [45]:
os.mkdir('model')

In [46]:
pickle.dump(label_encoder, open('model/label_encoder.pkl', 'wb'))

In [13]:
df = df[feature_label_list]
df.head()

Unnamed: 0,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary
0,highlight,37,60,1
1,compani glanc,97,88,1
2,strategi growth,185,37,0
3,chairman statement,222,160,1
4,chief execut report,382,187,1


In [14]:
df.toc_section_cleaned.fillna('missing', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [15]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df.is_section_in_summary)

In [16]:
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [17]:
X_train = train_df[feature_list]
y_train = train_df.is_section_in_summary
X_test = test_df[feature_list]
y_test = test_df.is_section_in_summary

In [18]:
tfidf_vectorizer_toc_section = TfidfVectorizer(decode_error='replace', tokenizer=None, encoding='utf-8', dtype=np.float32,
                                    smooth_idf=True, min_df=2, max_features=200000, binary=True, 
                                    stop_words=None, sublinear_tf=False, norm='l2', analyzer='word', max_df=0.5,
                                    lowercase=True, use_idf=False, ngram_range=(1,2))

In [19]:
X_train_tfidf = tfidf_vectorizer_toc_section.fit_transform(X_train.toc_section_cleaned)
X_train_tfidf_columns = [i + '_' + 'toc' for i in tfidf_vectorizer_toc_section.get_feature_names()]
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=X_train_tfidf_columns)



In [20]:
X_train_new = X_train_tfidf_df.copy()
X_train_new['toc_section_pos'] = X_train.toc_section_pos
X_train_new['toc_section_len'] = X_train.toc_section_len
X_train_new.shape

(54314, 4462)

In [47]:
pickle.dump(tfidf_vectorizer_toc_section, open('model/tfidf_vectorizer_toc_section.pkl', 'wb'))

In [22]:
clf = LogisticRegression(C=10.0, fit_intercept=True, intercept_scaling=1.0, penalty='l2')
clf.fit(X_train_new, y_train)

LogisticRegression(C=10.0, intercept_scaling=1.0)

In [48]:
pickle.dump(clf, open('model/section_classification_model.pkl', 'wb'))

In [24]:
X_test_tfidf = tfidf_vectorizer_toc_section.transform(X_test.toc_section_cleaned)
X_test_tfidf_columns = [i + '_' + 'toc' for i in tfidf_vectorizer_toc_section.get_feature_names()]
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=X_test_tfidf_columns)



In [25]:
X_test_new = X_test_tfidf_df.copy()
X_test_new['toc_section_pos'] = X_test.toc_section_pos
X_test_new['toc_section_len'] = X_test.toc_section_len
y_pred = clf.predict(X_test_new)

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96     11022
           1       0.90      0.68      0.78      2557

    accuracy                           0.93     13579
   macro avg       0.92      0.83      0.87     13579
weighted avg       0.92      0.93      0.92     13579



In [28]:
y_pred_tain = clf.predict(X_train_new)
train_df_predicted = train_df.copy()
train_df_predicted['pred'] = y_pred_tain
y_pred_prob_train = clf.predict_proba(X_train_new)
y_pred_prob_train_df = pd.DataFrame(y_pred_prob_train,columns=label_encoder.classes_)
train_df_predicted['False'] = y_pred_prob_train_df[False]
train_df_predicted['True'] = y_pred_prob_train_df[True]
train_df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,39176,chief execut report,316,311,1,1,0.040802,0.959198
1,67000,consolid compani,4139,52,0,0,0.993476,0.006524
2,17816,approach busi,1289,129,1,0,0.801273,0.198727
3,50371,consolid balanc sheet,3227,52,0,0,0.994828,0.005172
4,17050,notic meet,1641,104,0,0,0.851071,0.148929
...,...,...,...,...,...,...,...,...
54309,21588,consolid statement chang equiti,6775,58,0,0,0.997763,0.002237
54310,64849,notic annual gener meet,2467,292,0,0,0.883332,0.116668
54311,47941,director report,2538,80,0,0,0.998567,0.001433
54312,41442,group cash flow statement,12523,2135,0,0,0.994963,0.005037


In [29]:
y_pred_test = clf.predict(X_test_new)
test_df_predicted = test_df.copy()
test_df_predicted['pred'] = y_pred_test
y_pred_prob_test = clf.predict_proba(X_test_new)
y_pred_prob_test_df = pd.DataFrame(y_pred_prob_test,columns=label_encoder.classes_)
test_df_predicted['False'] = y_pred_prob_test_df[False]
test_df_predicted['True'] = y_pred_prob_test_df[True]
test_df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,35012,,307,61,0,0,0.752522,0.247478
1,28434,independ auditor report,1161,7,0,0,0.982338,0.017662
2,49671,conni de lang,2585,0,0,0,0.798756,0.201244
3,67139,lpa group plc,422,9,0,0,0.716284,0.283716
4,15509,consolid cash flow statement,1869,76,0,0,0.995952,0.004048
...,...,...,...,...,...,...,...,...
13574,32560,consolid statement,5372,40,0,0,0.997447,0.002553
13575,58932,consolid statement comprehens incom,1056,14,0,0,0.993358,0.006642
13576,58118,decad deliveri beyond,751,90,0,0,0.764780,0.235220
13577,36879,notic annual gener meet,3437,73,0,0,0.897562,0.102438


In [37]:
df_predicted = pd.concat([train_df_predicted, test_df_predicted], axis=0)
df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,39176,chief execut report,316,311,1,1,0.040802,0.959198
1,67000,consolid compani,4139,52,0,0,0.993476,0.006524
2,17816,approach busi,1289,129,1,0,0.801273,0.198727
3,50371,consolid balanc sheet,3227,52,0,0,0.994828,0.005172
4,17050,notic meet,1641,104,0,0,0.851071,0.148929
...,...,...,...,...,...,...,...,...
13574,32560,consolid statement,5372,40,0,0,0.997447,0.002553
13575,58932,consolid statement comprehens incom,1056,14,0,0,0.993358,0.006642
13576,58118,decad deliveri beyond,751,90,0,0,0.764780,0.235220
13577,36879,notic annual gener meet,3437,73,0,0,0.897562,0.102438


In [49]:
os.mkdir('out')
pickle.dump(df_predicted, open('out/training_df_predicted.pkl', 'wb'))

In [41]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [42]:
y_test_pred_proba = clf.predict_proba(X_test_new)
print('Logistic test roc-auc: {}'.format(roc_auc_score(y_test, y_test_pred_proba[:,1])))
fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba[:,1])
accuracy_ls = []
for thre in thresholds:
    y_pred = np.where(y_test_pred_proba[:,1]>thre,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))

accuracy_ls = pd.concat([pd.Series(thresholds), pd.Series(accuracy_ls)], axis=1)

accuracy_ls.columns = ['threshold', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls.head()

Logistic test roc-auc: 0.9459936386337788


Unnamed: 0,threshold,accuracy
168,0.744291,0.930113
167,0.762621,0.930113
166,0.766665,0.930113
163,0.777389,0.930113
159,0.781724,0.930039


In [39]:
scores = cross_val_score(clf, X_train_new, y_train, cv=5, verbose=1, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.5min finished


0.93 accuracy with a standard deviation of 0.00
