In [1]:
%load_ext autoreload
%autoreload 2

from fastai.text.all import *
from fastai.vision.all import *
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from os.path import join, split, splitext
from pathlib import Path

import pandas as pd
import pickle

import tqdm

In [3]:
# Seed every random number generator used in this notebook so repeated runs
# produce the same results.
seed = 42

# python RNG
import random
random.seed(seed)

# pytorch RNGs
import torch
torch.manual_seed(seed)
# Deterministic cuDNN kernels are only guaranteed when the autotuner
# (benchmark mode) is disabled as well, so both flags are set together.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# numpy RNG
import numpy as np
np.random.seed(seed)

In [4]:
OUT_DIM=2

In [5]:
# Prefer GPU 1 when the machine has more than one GPU; fall back to GPU 0 on
# single-GPU machines and do nothing on CPU-only machines instead of raising.
if torch.cuda.is_available():
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)

In [6]:
SEQUENCE_LEN = 500 # Size of input arrays

In [7]:
models_path = Path("./models/")
weights_path = models_path/"stf_no_weights.keras"
json_path = models_path/"cnn_text.json"
tokenizer_path = models_path/"tokenizer.pickle"

In [8]:
# Rebuild the Keras model architecture from its JSON description. The context
# manager closes the file handle even if reading raises.
with open(json_path, 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

In [9]:
model.load_weights(weights_path)

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
data_path = Path("/mnt/nas/databases/Tobacco800/unziped/")

In [12]:
# Read the training CSV (semicolon separated), keep the four columns needed
# here and expose the `text` column under the name `body`.
train = (
    pd.read_csv(data_path/"train.csv", delimiter=';', usecols=['binder','docid','class', 'text'])
      .rename(columns={'text': 'body'})
)
print(train.shape)
# Remove every row that has a missing value and report the resulting size.
train = train.dropna()
print(train.shape)

(1031, 4)
(1019, 4)


In [13]:
# Hold out the last 200 training rows as the validation set. `reset_index`
# returns a fresh frame, so nothing is mutated in place on a slice of `train`.
val = train.iloc[-200:].reset_index(drop=True)
print(val.shape)
val.head()

(200, 4)


Unnamed: 0,binder,docid,class,body
0,Tobacco800,pkc56d00,FirstPage,".>->aa, Mailand 20014 / (301) 654-3400 February 20, 1973 Robert C. Hockett, Ph.D. Associate Scientific Director The Gouncil For Tobacco Research - U.S.A. 110 East 59th Street New York, N.Y. 10022 Dear Doctor Hockett: Per your request, we are making available 20 copies of the enclosed reprint for distribution to your staff and Scientific Advisory Board. Sincerely yours Carrie E. Whitmire, Ph.D. Project Director CEW:saf enclosure 6RANCH OFFICE / 503 San Pablo Avenue, Albany, California 94706 (415) 526-6228 CTR C0NTRRCTS 00..."
1,Tobacco800,pkj90c00,FirstPage,"AMERICAN '93 THE NEETROnilER FIELD SALES INFORMATION October 15, 1993 RE: MISTY BILI,BOARP SUPPORT TO: SALES SUPERVISORS IN THE MARKETS EWOLVED MISTY will be supported with billboard advertising during the 4th Quarter in 42 highly developed MISTY markets across the country. Please see the reverse side of this letter for the markets involved and the approximate posting date. These attractive 4-color posters show off MISTY at its fashionable bes..."
2,Tobacco800,ply60e00,FirstPage,"LORILLARD INC. • ONE PARK AVENUE. NEW YORK. N.Y. 10016-5895 • (212) 545-3000 October 5, 1989 Mr. Jeffrey Bluestein Vice President Harley-Davidson 3700 W. Juneau Avenue P.O. Box 653 Milwaukee, WI 53201 RE: Harley-Davidson Cigarettes - POS Material Dear Mr. Bluestein: Enclosed, for your files, are samples of the point-of-sale material we have produced to date in support of Harley-Davidson Cigarettes. I will forward additional samples as they become available. /ec Enclosure cc: R. Goldbrenner V. Lindsley T. Mau A. Pasheluk Sincere..."
3,Tobacco800,pmx82f00-page04_1,FirstPage,"4 4 -/ NEWELL W. ELLISON H. THOMAS AUSTERN HOWARD C. WESTWOOD CHARLES A. HORSKY DONALD HISS JOHN T. SAPIENZA JAMES H. McGLOTHLIN ERNEST W. JENNES STANLEY L.TEMKO JAMES C. MeKAY JOHN W. DOUGLAS HAMILTON CAROTHERS J. RANDOLPH WILSON ROBERTS B. OWEN COOAR F. CZARRA.JR. WILLIAM H. ALLEN DAVID B. ISBELL * JOHN B. JONES. JR. H. EOWARD DUNKELBERGER, JR. BRICE McADOO CLAGETT JOHN S. KOCH ROBERT E. O’MALLEY EUGENE I. LAMBERT JOHN VANDERSTAR NEWMAN T. HALVORSON. JR. HARVEY M. APPLEBAUM MICHAEL S. HORNE JONATHAN D. BLAKE CHARLES E. BUFFON ROBERT N. SAYLER E. EDWARD BRUCE DAVID N. BROWN ..."
4,Tobacco800,pmx82f00-page04_2,NextPage,"COVINGTON & BURLING CONFIDENTIAL: MINNESOTA TOBACCO LITIGATION Committee of Counsel April 3, 1975 Page Two The Chairman assured us that the matter would be e'xamined in depth, and as to the second request, it was not the usual practice but he would talk with his fellow Commissioners. 2. The questions of Publicity about the compliance investigation were reviewed. The original agreement in 1974 for no Publicity had been fully honored. Since the ANPA Government Relations Committee had scheduled a meeting for March 12, and there had been a great deal..."


In [14]:
# Read the test CSV with the same columns as the training data and expose the
# `text` column under the name `body`.
test_data = (
    pd.read_csv(data_path/"test.csv", delimiter=';', usecols=['binder','docid','class', 'text'])
      .rename(columns={'text': 'body'})
)
print(test_data.shape)
# Remove rows with missing values.
# NOTE(review): the image pipeline reads test.csv without dropping rows, so any
# row removed here would misalign text and image predictions — confirm none are.
test_data = test_data.dropna()

(259, 4)


In [15]:
# Restore the pickled tokenizer object (used below through `texts_to_sequences`).
# NOTE(review): unpickling can execute arbitrary code — only load tokenizer
# files that come from a trusted source.
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle, encoding="utf-8")

In [16]:
# Convert the `body` text of each split into token-id sequences with the
# loaded tokenizer (same order as the rows of each frame).
sequences_train, sequences_validation, sequences_test = (
    tokenizer.texts_to_sequences(frame['body'])
    for frame in (train, val, test_data)
)

In [17]:
# Pad/truncate every token sequence to SEQUENCE_LEN entries, padding at the end.
X_train, X_val, X_test = (
    sequence.pad_sequences(token_ids, maxlen=SEQUENCE_LEN, padding='post')
    for token_ids in (sequences_train, sequences_validation, sequences_test)
)

In [18]:
encoder = LabelEncoder()

In [19]:
# Fit the label encoder ONCE, on the training labels, and reuse that mapping for
# the other splits. Re-fitting per split could assign different integers to the
# same class (or yield a different one-hot width) whenever a split does not
# contain exactly the same set of classes.
train_label_toTest = encoder.fit_transform(train['class'])
num_classes = len(encoder.classes_)
train_label = to_categorical(train_label_toTest, num_classes=num_classes)

# `transform` raises ValueError on a class unseen during fitting instead of
# silently remapping it.
valid_label_toTest = encoder.transform(val['class'])
valid_label = to_categorical(valid_label_toTest, num_classes=num_classes)

test_label_toTest = encoder.transform(test_data['class'])
test_label = to_categorical(test_label_toTest, num_classes=num_classes)

# Make sure the padded inputs are plain numpy arrays for model.predict.
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

In [20]:
text_probs_val = model.predict(X_val, verbose=1)



In [21]:
text_probs_test = model.predict(X_test, verbose=1)



In [22]:
# Folder with the raw page images, derived from `data_path` so the dataset
# root is configured in a single place.
path = data_path/"page_imgs"/"raw"

In [23]:
# Training metadata for the image pipeline, read from the same `data_path`
# root as the text pipeline instead of a duplicated absolute path.
df = pd.read_csv(data_path/"train.csv", delimiter=';',  usecols=['binder','docid','class', 'text'])

In [24]:
# Report the frame size before and after removing incomplete rows.
print(df.shape)
# Drops, in place, every row that contains a missing value (mutates `df`).
df.dropna(inplace=True)
print(df.shape)

(1031, 4)
(1019, 4)


In [25]:
# Label every row as training data by default, then show the new shape and a
# preview of the frame.
df['split'] = 'train'
print(df.shape)
df.head()

(1019, 5)


Unnamed: 0,binder,docid,class,text,split
0,Tobacco800,aah97e00-page02_1,FirstPage,"Dr. M.A. Manzelli, PHILIP MORRIS INC., Research Center, P.O. Box 3 D Richmond, Va. 23206 U.S.A. Dear Art, Please find enclosed a proposed program for your visit in Europe. As you will see, this program contains two major points: a) A two-days meeting in Neuchätel with all people in Europe involved in infestation control (proposed program included). b) Visit of all PM Europe production centers with the purpose of: - training people in Charge of the insect indenti-fication - inspection of the sanitary conditions One of our main problems is the insect Identification and ...",train
1,Tobacco800,aah97e00-page02_2,NextPage,"- 2 - Please let me krow if you have any changes for the proposed program. Looking forward to seeing you again, cc: Messrs. B. Kuhn E. Stoop Encl.: mentioned 1000035625",train
2,Tobacco800,aam09c00,FirstPage,"I NOIJ-04-97 13 = 25 FROM = I D : PAGE 10/17 Wachtell, Lipton, Rosen & Katz MARTIN L1PTON HERBERT M. WaCHYCLL THEODORE GEWSRTZ »ERNA*» W, NUSSBAUM RICHARD O- HATCHER DOUGLAS Sr LieBKAfSKV STEVEN M. SAR NA CMARt.CS T. MEPCRR'GK CHAIN J. FOPTCANG pBTRR C. 1AMCLLDS MICHAEL w. SCHWÄRT* Al.LAR A- MARTIN BARRV A. BRrCR IAWREHCE □. PEDQrWITZ ROBERT 0. MAZUR PAUL VIZGARRÖMSCK JR. PCTCR G- HCIM hAROlD s. «OViKOFF DAVID M. EINHORN KENNET« B. FQRREST METER C. KOPLOW THEODORE M- MIRVIS EDWARD D. MERL1HT RICHARD D. FEINTUCH DANIEL A. NEPP ERIC M. ROTR WAHREN R. STERN ANDREW R. BROWN ST EIN MICHAEL H...",train
3,Tobacco800,aao54e00_1,FirstPage,"i PHILIP .MORRIS INCORPORATED 120 PARK AVENUE, N EW YORK. N. Y. 10017 John T.-Landhy SENIOR VICC PRESIDENT _ , r . n n - . oirector of Marketing February 15, 19 83 Dear Ms. Ober: Your letter of February 3 was received by this Office yesterday. We attempted to reach you by telephone iinmediately but were unable to find a listed number in New Haven. I had hoped to arrange an appointment for one of our executives to discuss with you and your concerned ...",train
4,Tobacco800,aao54e00_2,NextPage,"In the meantime, I hope you and your friends are continuing to enjoy your Marlboro Lights ciga-rettes. We don't think you can find a better"" cigarette on the market. ■Thank you. Ms. Angela Ober 221 Farnham Avenue New Haven, Connecticut 06515",train


In [26]:
# Flag the last 200 rows as the validation split. A single positional `.iloc`
# assignment is used because chained indexing (`df['split'][-200:] = ...`)
# writes through an intermediate Series: it emits SettingWithCopyWarning and
# does nothing at all under pandas Copy-on-Write.
df.iloc[-200:, df.columns.get_loc('split')] = 'valid'
df.iloc[-202:-198];

In [27]:
df

Unnamed: 0,binder,docid,class,text,split
0,Tobacco800,aah97e00-page02_1,FirstPage,"Dr. M.A. Manzelli, PHILIP MORRIS INC., Research Center, P.O. Box 3 D Richmond, Va. 23206 U.S.A. Dear Art, Please find enclosed a proposed program for your visit in Europe. As you will see, this program contains two major points: a) A two-days meeting in Neuchätel with all people in Europe involved in infestation control (proposed program included). b) Visit of all PM Europe production centers with the purpose of: - training people in Charge of the insect indenti-fication - inspection of the sanitary conditions One of our main problems is the insect Identification and ...",train
1,Tobacco800,aah97e00-page02_2,NextPage,"- 2 - Please let me krow if you have any changes for the proposed program. Looking forward to seeing you again, cc: Messrs. B. Kuhn E. Stoop Encl.: mentioned 1000035625",train
2,Tobacco800,aam09c00,FirstPage,"I NOIJ-04-97 13 = 25 FROM = I D : PAGE 10/17 Wachtell, Lipton, Rosen & Katz MARTIN L1PTON HERBERT M. WaCHYCLL THEODORE GEWSRTZ »ERNA*» W, NUSSBAUM RICHARD O- HATCHER DOUGLAS Sr LieBKAfSKV STEVEN M. SAR NA CMARt.CS T. MEPCRR'GK CHAIN J. FOPTCANG pBTRR C. 1AMCLLDS MICHAEL w. SCHWÄRT* Al.LAR A- MARTIN BARRV A. BRrCR IAWREHCE □. PEDQrWITZ ROBERT 0. MAZUR PAUL VIZGARRÖMSCK JR. PCTCR G- HCIM hAROlD s. «OViKOFF DAVID M. EINHORN KENNET« B. FQRREST METER C. KOPLOW THEODORE M- MIRVIS EDWARD D. MERL1HT RICHARD D. FEINTUCH DANIEL A. NEPP ERIC M. ROTR WAHREN R. STERN ANDREW R. BROWN ST EIN MICHAEL H...",train
3,Tobacco800,aao54e00_1,FirstPage,"i PHILIP .MORRIS INCORPORATED 120 PARK AVENUE, N EW YORK. N. Y. 10017 John T.-Landhy SENIOR VICC PRESIDENT _ , r . n n - . oirector of Marketing February 15, 19 83 Dear Ms. Ober: Your letter of February 3 was received by this Office yesterday. We attempted to reach you by telephone iinmediately but were unable to find a listed number in New Haven. I had hoped to arrange an appointment for one of our executives to discuss with you and your concerned ...",train
4,Tobacco800,aao54e00_2,NextPage,"In the meantime, I hope you and your friends are continuing to enjoy your Marlboro Lights ciga-rettes. We don't think you can find a better"" cigarette on the market. ■Thank you. Ms. Angela Ober 221 Farnham Avenue New Haven, Connecticut 06515",train
...,...,...,...,...,...
1026,Tobacco800,thl51a00-page02_2,NextPage,"«AR 18 »Sg 14t99 FROM B/W LAU DEPT PAGE.Q03 P*fl® t Maroh 18, 1992 Letter fco K. fii yeeterday 1» which American would represent that it does not m)w any itlai of the wcxt STRIKs «ad fall mall producta in the United States, inoiuding tu® U.S. duty free narbet# fot re-oacpott to »ttd resale in any country where Brom * wlllieason# or a mamber of the b.a.t. Group of 0Oa&anies, has registered theso tradomarks# and that any sudü aale by any parson woUld be oonaidered an infringement of Brown & Williamson*® tredemark rights. 4 3* Pr...",valid
1027,Tobacco800,tji44a00,FirstPage,"MOV 2 11994 Si iook.J Iakdyü. Bacon November 18, 1994 FEDERAL EXPRESS TO: ETS/IAQ Current Developments Report Distribution Enclosed you will find a copy of the most recent issue of the ETS/XACl Current Developments Report. If you have questions or information rega.rd.ing items contained in this Report, please feel free to contact either of us at (816) 4*74-6550. Sincerely MWC : KAM : lew Enclosure cc: Report: Team 88325694",valid
1028,Tobacco800,tjr72f00-page02_1,FirstPage,"THE TOBACCO INSTITUTE, inc. ■1776 K STREET, NORTHWEST! WASHINGTON, D.C. 2000b i 202.'296-8434 HORACE R. KORNEGAY President May 13, 1975 Mr. Colin Stokes Chairman of the Board of Directors R. J. Reynolds Industries, Inc. 401 North Main Street Winston-Salem, North Carolina 27102 Dear Colin: The terms on which your and other Companies agreed to participate in the establishment and Operation of the Tobacco Institute Testing Laboratory (T.I.T.L.) were set förth in the Institute letter of November 1, 1966, which was accepted by each participating Company. Based on recent discussions which I ha...",valid
1029,Tobacco800,tjr72f00-page02_2,NextPage,"two billion units. Each Company responsible for additional expenses shall be assessed for the per-centage of such additional expenses as is determined by using that Company's total annual sales of USA tax-paid cigarettes as the numerator and the total annual sales of USA tax-paid cigarettes of all Companies responsible for additional expenses as the denominator. In determining the allocation for additional expenses, the total sales of USA tax-paid cigarettes shall be determined by using, as sales figures, the estimated domestic cigarette consumption by Company based on the latest ann...",valid


In [28]:
# Test metadata for the image pipeline (no text column needed), read from the
# shared `data_path` root instead of a duplicated absolute path.
df_test = pd.read_csv(data_path/"test.csv", delimiter=';',  usecols=['binder','docid','class'])

In [29]:
df_test['split'] = 'test'

In [30]:
# Append the test rows below the train/valid rows and renumber the index from 0.
df = pd.concat([df, df_test], axis=0, ignore_index=True)
df

Unnamed: 0,binder,docid,class,text,split
0,Tobacco800,aah97e00-page02_1,FirstPage,"Dr. M.A. Manzelli, PHILIP MORRIS INC., Research Center, P.O. Box 3 D Richmond, Va. 23206 U.S.A. Dear Art, Please find enclosed a proposed program for your visit in Europe. As you will see, this program contains two major points: a) A two-days meeting in Neuchätel with all people in Europe involved in infestation control (proposed program included). b) Visit of all PM Europe production centers with the purpose of: - training people in Charge of the insect indenti-fication - inspection of the sanitary conditions One of our main problems is the insect Identification and ...",train
1,Tobacco800,aah97e00-page02_2,NextPage,"- 2 - Please let me krow if you have any changes for the proposed program. Looking forward to seeing you again, cc: Messrs. B. Kuhn E. Stoop Encl.: mentioned 1000035625",train
2,Tobacco800,aam09c00,FirstPage,"I NOIJ-04-97 13 = 25 FROM = I D : PAGE 10/17 Wachtell, Lipton, Rosen & Katz MARTIN L1PTON HERBERT M. WaCHYCLL THEODORE GEWSRTZ »ERNA*» W, NUSSBAUM RICHARD O- HATCHER DOUGLAS Sr LieBKAfSKV STEVEN M. SAR NA CMARt.CS T. MEPCRR'GK CHAIN J. FOPTCANG pBTRR C. 1AMCLLDS MICHAEL w. SCHWÄRT* Al.LAR A- MARTIN BARRV A. BRrCR IAWREHCE □. PEDQrWITZ ROBERT 0. MAZUR PAUL VIZGARRÖMSCK JR. PCTCR G- HCIM hAROlD s. «OViKOFF DAVID M. EINHORN KENNET« B. FQRREST METER C. KOPLOW THEODORE M- MIRVIS EDWARD D. MERL1HT RICHARD D. FEINTUCH DANIEL A. NEPP ERIC M. ROTR WAHREN R. STERN ANDREW R. BROWN ST EIN MICHAEL H...",train
3,Tobacco800,aao54e00_1,FirstPage,"i PHILIP .MORRIS INCORPORATED 120 PARK AVENUE, N EW YORK. N. Y. 10017 John T.-Landhy SENIOR VICC PRESIDENT _ , r . n n - . oirector of Marketing February 15, 19 83 Dear Ms. Ober: Your letter of February 3 was received by this Office yesterday. We attempted to reach you by telephone iinmediately but were unable to find a listed number in New Haven. I had hoped to arrange an appointment for one of our executives to discuss with you and your concerned ...",train
4,Tobacco800,aao54e00_2,NextPage,"In the meantime, I hope you and your friends are continuing to enjoy your Marlboro Lights ciga-rettes. We don't think you can find a better"" cigarette on the market. ■Thank you. Ms. Angela Ober 221 Farnham Avenue New Haven, Connecticut 06515",train
...,...,...,...,...,...
1273,Tobacco800,zrz94a00-page02_2,NextPage,,test
1274,Tobacco800,zss86d00,FirstPage,,test
1275,Tobacco800,ztz52d00-page02_1,FirstPage,,test
1276,Tobacco800,ztz52d00-page02_2,NextPage,,test


In [31]:
def splitter(df):
    """Return the row index labels of the train, valid and test subsets.

    Rows are assigned according to the value of the `split` column; the result
    is a `(train, valid, test)` tuple of plain Python lists.
    """
    def _labels_for(subset):
        # Index labels of the rows whose `split` value equals `subset`.
        mask = (df['split'] == subset).to_numpy()
        return df.index[mask].tolist()

    return _labels_for('train'), _labels_for('valid'), _labels_for('test')

In [32]:
def get_x(r):
    """Build the path of the `.tif` image for row `r` from its `docid` field."""
    filename = f'{r["docid"]}.tif'
    return path/filename
def get_y(r):
    """Return the target label stored under the `class` key of row `r`."""
    label = r['class']
    return label

In [33]:
# Batch-level augmentation: random-resized crop to 224 with at least 90% of the
# image kept; flipping, rotation and warping are all disabled.
page_augmentations = aug_transforms(size=224, min_scale=0.9,
                                    do_flip=False, max_rotate=0,
                                    max_warp=0)
# Normalisation with the ImageNet channel statistics.
imagenet_norm = Normalize.from_stats(*imagenet_stats)

# Image -> category pipeline: inputs/targets come from get_x/get_y, subsets from
# the `split` column via `splitter`, and every item is first resized to 460.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_x=get_x,
    get_y=get_y,
    splitter=splitter,
    item_tfms=Resize(460),
    batch_tfms=[*page_augmentations, imagenet_norm],
)

In [34]:
dls = dblock.dataloaders(df, bs=64)

In [35]:
learn = cnn_learner(dls, resnet50, loss_func=CrossEntropyLossFlat())

In [36]:
learn.load("best_image_no_weights_224")

<fastai.learner.Learner at 0x7f915427dbb0>

In [37]:
# Image-model predictions and targets for the validation rows.
# NOTE(review): relies on get_preds defaulting to the validation subset when
# called without arguments — confirm against the installed fastai version.
img_probs_val, labels_val = learn.get_preds()

In [38]:
# Image-model predictions and targets for dataset index 2, i.e. the third index
# list returned by `splitter` (the rows whose `split` is 'test').
img_probs_test, labels_test = learn.get_preds(ds_idx=2)

In [50]:
# The text and image pipelines must see the samples in the same order, otherwise
# combining their probabilities element-wise is meaningless. Check both the
# validation split (used to tune the fusion weights) and the test split.
assert (tensor(np.argmax(valid_label, axis=1)) == labels_val).all(), \
    "text and image validation labels are not aligned"
assert (tensor(np.argmax(test_label, axis=1)) == labels_test).all(), \
    "text and image test labels are not aligned"

In [39]:
tensor([0.3,0.2])**2*tensor([0.1, 0.3])**2

tensor([0.0009, 0.0036])

In [40]:
def late_fusion(pred_image, pred_text, img_weight=1, text_weight=1):
    """Fuse two prediction arrays as a weighted element-wise product.

    Each input is raised to its weight (used as an exponent) and the two
    results are multiplied: `pred_image**img_weight * pred_text**text_weight`.
    """
    image_term = pred_image**img_weight
    text_term = pred_text**text_weight
    return image_term * text_term

In [41]:
def evaluate(preds_image, preds_text, targets, img_weight=1, text_weight=1):
    """Print a classification report for the fused image/text predictions.

    The two prediction arrays are combined with `late_fusion` using the given
    weights, the arg-max along axis 1 is taken as the predicted class, and the
    report is printed with 4 digits and `dls.vocab` as the class names.
    """
    fused = late_fusion(preds_image, preds_text, img_weight, text_weight)
    predicted_classes = np.argmax(fused, axis=1)
    report = classification_report(targets, predicted_classes,
                                   target_names=dls.vocab, digits=4)
    print(report)

In [42]:
weights = [0, 1e-2, 3e-2, 5e-2, 7e-2, 1e-1, 3e-1, 5e-1, 7e-1, 1]

In [43]:
def cross_val(preds_image, preds_text, targets, weights):
    """Grid-search the late-fusion weights that give the highest accuracy.

    Every (img_weight, text_weight) pair drawn from `weights` is tried: the
    predictions are fused with `late_fusion`, the arg-max along axis 1 is taken
    as the predicted class and accuracy against `targets` is computed.

    Args:
        preds_image: image-model predictions, one row per sample.
        preds_text: text-model predictions, aligned row-for-row with `preds_image`.
        targets: true class indices.
        weights: candidate weight values, used for both modalities.

    Returns:
        Tuple `(max_acc, best_img_weight, best_text_weight)`; on ties the first
        pair encountered (outer loop over image weights) is kept.

    Raises:
        ValueError: if `weights` is empty.
    """
    weights = list(weights)
    if not weights:
        raise ValueError("weights must contain at least one candidate value")

    # Start below any attainable accuracy so the first pair is always recorded;
    # otherwise the best_* names stay unbound when no pair scores above 0.
    max_acc = -1.0
    best_img_weight = best_text_weight = None
    for img_weight in weights:
        for text_weight in weights:
            probs = late_fusion(preds_image, preds_text, img_weight, text_weight)
            preds = np.argmax(probs, axis=1)
            acc = accuracy_score(targets, preds)
            # Strict comparison keeps the earliest pair among equal scores.
            if acc > max_acc:
                max_acc = acc
                best_img_weight = img_weight
                best_text_weight = text_weight
    return max_acc, best_img_weight, best_text_weight

In [44]:
max_acc, best_img_weight, best_text_weight = cross_val(img_probs_val, text_probs_val, labels_val, weights)

In [45]:
max_acc, best_img_weight, best_text_weight

(0.795, 0.03, 0.05)

In [46]:
evaluate(img_probs_val, text_probs_val, labels_val, best_img_weight, best_text_weight)

              precision    recall  f1-score   support

   FirstPage     0.7297    0.8804    0.7980        92
    NextPage     0.8764    0.7222    0.7919       108

    accuracy                         0.7950       200
   macro avg     0.8031    0.8013    0.7950       200
weighted avg     0.8089    0.7950    0.7947       200



In [47]:
evaluate(img_probs_test, text_probs_test, labels_test, best_img_weight, best_text_weight)

              precision    recall  f1-score   support

   FirstPage     0.8192    0.9667    0.8869       150
    NextPage     0.9390    0.7064    0.8063       109

    accuracy                         0.8571       259
   macro avg     0.8791    0.8365    0.8466       259
weighted avg     0.8696    0.8571    0.8529       259

