In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set_florida.csv')
df = df_train.append(df_test)

In [5]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [6]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [7]:
tokenized = df['sentence'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [9]:
np.array(padded).shape

(3366, 166)

In [10]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3366, 166)

In [11]:
input_ids = torch.tensor(np.array(padded))
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [12]:
last_hidden_states[0][:,0,0]

tensor([-0.5774, -0.4186, -0.6535,  ..., -0.5505, -0.5987, -0.3164])

In [13]:
features = last_hidden_states[0][:,0,:].numpy()

In [14]:
features.shape

(3366, 768)

In [15]:
df_train.shape

(2788, 2)

In [17]:
train_feat = features[:df_train.shape[0]]
train_labels = df_train['target'].to_numpy()
test_feat = features[df_train.shape[0]:]
test_labels = df_test['target'].to_numpy()

In [18]:
## train data
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(train_feat), 
                       torch.FloatTensor(train_labels))
## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(test_feat))

In [19]:
BATCH_SIZE=64
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [21]:
class ANNModel(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ANNModel, self).__init__()
        
        
        self.fc1 = nn.Linear(input_dim, hidden_dim) 
        self.relu1 = nn.ReLU()
        
        #self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        #self.relu2 = nn.ReLU()
        
        #self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        #self.relu3 = nn.ReLU()
        
        self.fc4 = nn.Linear(hidden_dim, output_dim)  
    
    def forward(self, x):
    
        out = self.fc1(x)
        out = self.relu1(out)
        
        
        #out = self.fc2(out)
        #out = self.tanh2(out)
        
        #out = self.fc3(out)
        #out = self.elu3(out)
        
        out = self.fc4(out)
        return out


input_dim = 768
hidden_dim = 150 
output_dim = 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = ANNModel(input_dim, hidden_dim, output_dim)
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
learning_rate = 0.02
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
EPOCHS=50

cpu
ANNModel(
  (fc1): Linear(in_features=768, out_features=150, bias=True)
  (relu1): ReLU()
  (fc4): Linear(in_features=150, out_features=1, bias=True)
)


In [22]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [23]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.27334 | Acc: 91.955
Epoch 002: | Loss: 0.08330 | Acc: 96.727
Epoch 003: | Loss: 0.08041 | Acc: 96.727
Epoch 004: | Loss: 0.06620 | Acc: 96.818
Epoch 005: | Loss: 0.06227 | Acc: 97.182
Epoch 006: | Loss: 0.05498 | Acc: 97.409
Epoch 007: | Loss: 0.06068 | Acc: 97.250
Epoch 008: | Loss: 0.06689 | Acc: 97.068
Epoch 009: | Loss: 0.04767 | Acc: 97.705
Epoch 010: | Loss: 0.05303 | Acc: 97.455
Epoch 011: | Loss: 0.04624 | Acc: 97.818
Epoch 012: | Loss: 0.05424 | Acc: 97.455
Epoch 013: | Loss: 0.04022 | Acc: 98.136
Epoch 014: | Loss: 0.03507 | Acc: 98.182
Epoch 015: | Loss: 0.04709 | Acc: 98.000
Epoch 016: | Loss: 0.03766 | Acc: 98.205
Epoch 017: | Loss: 0.03469 | Acc: 98.341
Epoch 018: | Loss: 0.02996 | Acc: 98.795
Epoch 019: | Loss: 0.03449 | Acc: 98.250
Epoch 020: | Loss: 0.02702 | Acc: 98.750
Epoch 021: | Loss: 0.02219 | Acc: 98.932
Epoch 022: | Loss: 0.03512 | Acc: 98.591
Epoch 023: | Loss: 0.04280 | Acc: 98.045
Epoch 024: | Loss: 0.02553 | Acc: 98.727
Epoch 025: | Los

In [24]:
#len(train_labels)/(2*np.bincount(train_labels)) #class_weights

In [25]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [26]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_feat, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.935 (+/- 0.01)




In [27]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_labels, y_pred_list)
print(cm)

[[565   5]
 [  5   3]]


In [28]:
recall = cm[1][1]/(cm[1][1] + cm[1][0])
precision = cm[1][1]/(cm[1][1] + cm[0][1])
f1_score = 2/((1/recall) + (1/precision))
print("Recall: ", recall)
print("Precision: ", precision)
print("F1-Score: ", f1_score)

Recall:  0.375
Precision:  0.375
F1-Score:  0.375


In [30]:
count=0
summary_indices=[]
for idx,i in enumerate(y_pred_list):
    if i==1:
        count+=1
        print(idx)
        summary_indices.append(idx)
print("Count")
print(count)

64
127
139
543
552
554
555
557
Count
8


In [31]:
actual_summary_indices=[]
for idx,i in enumerate(test_labels):
    if i==1:
        count+=1
        print(idx)
        actual_summary_indices.append(idx)
print("Count")
print(count)

543
547
548
549
550
552
553
554
Count
16


In [32]:
actual_summary_indices

[543, 547, 548, 549, 550, 552, 553, 554]

In [33]:
summary_indices

[64, 127, 139, 543, 552, 554, 555, 557]

In [34]:
summary_list = [df_test['sentence'][i] for i in summary_indices]
summary_output = ' '.join(summary_list)

In [35]:
actual_summary_list = [df_test['sentence'][i] for i in actual_summary_indices]
actual_summary = ' '.join(actual_summary_list)

In [36]:
actual_summary

"Florida's highway system contains 1,495\xa0mi (2,406\xa0km) of interstate highway, and 10,601\xa0mi (17,061\xa0km) of non-interstate highway, such as state highways and U.S. In 2011, there were about 9,000 retail gas stations in the state. Floridians consumed 21\xa0million gallons of gasoline daily in 2011, ranking it third in national use behind California and Texas.Motorists have the 45th lowest rate of car insurance in the U.S. 24% are uninsured. Drivers between 15 and 19 years of age averaged 364 car crashes a year per ten thousand licensed Florida drivers in 2010. Drivers 70 and older averaged 95 per 10,000 during the same time frame. Intercity bus travel, which utilizes Florida's highway system, is provided by Greyhound, Megabus, and Amtrak Thruway Motorcoach. Before the construction of routes under the Federal Aid Highway Act of 1956, Florida began construction of a long cross-state toll road, Florida's Turnpike. The first section, from Fort Pierce south to the Golden Glades In

In [37]:
summary_output

'The road crossed the St. Johns River at a narrow point called Wacca Pilatka, or the British name "Cow Ford", reflecting the fact that cattle were brought across the river there. In the pre-automobile era, railroads played a key role in the state\'s development, particularly in coastal areas. In 1925, the Seaboard Air Line broke the FEC\'s southeast Florida monopoly and extended its freight and passenger service to West Palm Beach; two years later it extended passenger service to Miami. Florida\'s highway system contains 1,495\xa0mi (2,406\xa0km) of interstate highway, and 10,601\xa0mi (17,061\xa0km) of non-interstate highway, such as state highways and U.S. Intercity bus travel, which utilizes Florida\'s highway system, is provided by Greyhound, Megabus, and Amtrak Thruway Motorcoach. The first section, from Fort Pierce south to the Golden Glades Interchange was completed in 1957. After a second section north through Orlando to Wildwood (near present-day The Villages), and a southward

In [38]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeLsum'], use_stemmer=True)
scores = scorer.score(actual_summary,summary_output)

In [39]:
scores

{'rouge1': Score(precision=0.45023696682464454, recall=0.536723163841808, fmeasure=0.4896907216494845),
 'rouge2': Score(precision=0.319047619047619, recall=0.3806818181818182, fmeasure=0.3471502590673575),
 'rougeLsum': Score(precision=0.3127962085308057, recall=0.3728813559322034, fmeasure=0.3402061855670103)}