In [4]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [5]:
# Load the pre-split train and test sets, then stack them (train rows first)
# so both can be tokenized and embedded in a single pass.
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. The original index labels are kept,
# so the stacked frame is addressed by position, not by label.
df = pd.concat([df_train, df_test])

In [6]:
# Encoder used for feature extraction: DistilBERT (uncased).
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# Alternative: full-size BERT. Uncomment to swap it in.
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [7]:
# Instantiate the tokenizer and the encoder from the same pretrained
# checkpoint name (`pretrained_weights`).
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [8]:
# Turn every sentence into a list of token ids, including the special
# tokens the tokenizer adds around the sequence.
tokenized = df['sentence'].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [9]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every id list with 0 up to max_len so the rows stack into a
# rectangular array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [10]:
# (number of sentences, padded sequence length)
padded.shape

(4287, 166)

In [11]:
# 1 where a real token id sits, 0 on the zero padding added above.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(4287, 166)

In [12]:
# Wrap the padded ids and their mask as tensors for the encoder.
# `padded` is already an ndarray, so no extra np.array copy is needed.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Inference only: disable autograd so no gradient buffers are kept.
# NOTE(review): the whole corpus goes through in one forward pass; consider
# batching if memory becomes a problem.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [13]:
# Peek at the first element of the model output: for every row, the first
# hidden dimension at sequence position 0.
last_hidden_states[0][:,0,0]

tensor([-0.5774, -0.4186, -0.6535,  ..., -0.7833, -0.2927, -0.3983])

In [14]:
# Keep only the vector at sequence position 0 of each row (the leading
# special token) and hand it to scikit-learn as a NumPy array.
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()

In [15]:
# Sanity check: one row per sentence, one column per hidden dimension.
features.shape

(4287, 768)

In [16]:
# Number of training rows; this is the split point used to slice the
# stacked features back into train and test.
df_train.shape

(2788, 2)

In [17]:
# The stacked frame was built train-first, so the first n_train feature rows
# belong to the training set and the remainder to the test set.
n_train = df_train.shape[0]
train_feat, test_feat = features[:n_train], features[n_train:]

train_labels = df_train['target'].to_numpy()
test_labels = df_test['target'].to_numpy()

In [32]:
# RBF-kernel SVM on the sentence embeddings; 'balanced' re-weights classes
# inversely to their frequency in the training labels.
svc = SVC(
    kernel='rbf',
    class_weight='balanced',
    random_state=0,
)
svc.fit(train_feat, train_labels)

SVC(class_weight='balanced', random_state=0)

In [33]:
# Per-class weights for the two classes:
# n_samples / (2 * samples_in_class).
class_counts = np.bincount(train_labels)
train_labels.size / (2 * class_counts)

array([ 0.51667902, 15.48888889])

In [34]:
# Mean accuracy on the held-out test rows.
# NOTE(review): the classes are heavily imbalanced, so accuracy alone says
# little; read it together with the confusion matrix and per-class metrics.
svc.score(test_feat, test_labels)

0.9352901934623082

In [35]:
from sklearn.dummy import DummyClassifier

# Chance-level baseline that guesses labels in proportion to the training
# class frequencies. The strategy is pinned because the library default
# changed from 'stratified' to 'prior' in scikit-learn 0.24, and it is seeded
# because stratified guessing is random; without both, the printed score is
# not reproducible across versions or runs.
# NOTE(review): this baseline is cross-validated on the training split while
# the SVC is scored on the test split, so the two numbers are not strictly
# comparable.
clf = DummyClassifier(strategy='stratified', random_state=0)

scores = cross_val_score(clf, train_feat, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.940 (+/- 0.01)




In [36]:
# Predicted labels for the test features; these feed the confusion matrix
# and the per-class metrics.
preds = svc.predict(test_feat)

In [37]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=test_labels, y_pred=preds)
print(cm)

[[1382   86]
 [  11   20]]


In [38]:
# Unpack the 2x2 confusion matrix once: row = true label, column = predicted.
tn, fp, fn, tp = cm.ravel()

# Metrics for the positive class (label 1).
recall = tp/(tp + fn)        # share of true positives that were found
precision = tp/(tp + fp)     # share of positive predictions that were right
f1_score = 2/((1/recall) + (1/precision))  # harmonic mean of the two

print("Recall: ", recall)
print("Precision: ", precision)
print("F1-Score: ", f1_score)

Recall:  0.6451612903225806
Precision:  0.18867924528301888
F1-Score:  0.29197080291970806


In [39]:
# Positions in the test set that the classifier flagged as class 1,
# followed by how many there are.
positive_idx = np.flatnonzero(preds == 1)
count = len(positive_idx)
for idx in positive_idx:
    print(idx)
print("Count")
print(count)

27
80
81
83
84
87
261
411
412
413
415
416
417
418
419
420
421
422
423
424
447
577
640
642
643
644
652
761
874
977
1015
1016
1017
1018
1020
1032
1056
1058
1061
1065
1066
1067
1068
1069
1070
1314
1315
1316
1317
1319
1320
1321
1322
1323
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1338
1339
1340
1342
1343
1344
1346
1347
1348
1349
1350
1351
1352
1356
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1372
1373
1374
1375
1376
1377
1378
1380
1381
1382
1383
1385
1386
1387
1388
Count
106
