In [3]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [4]:
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set_florida.csv')
df = df_train.append(df_test)

In [5]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [6]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [7]:
tokenized = df['sentence'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [9]:
np.array(padded).shape

(3366, 166)

In [10]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3366, 166)

In [11]:
input_ids = torch.tensor(np.array(padded))
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [12]:
last_hidden_states[0][:,0,0]

tensor([-0.5774, -0.4186, -0.6535,  ..., -0.5505, -0.5987, -0.3164])

In [13]:
features = last_hidden_states[0][:,0,:].numpy()

In [14]:
features.shape

(3366, 768)

In [15]:
df_train.shape

(2788, 2)

In [16]:
train_feat = features[:df_train.shape[0]]
train_labels = df_train['target'].to_numpy()
test_feat = features[df_train.shape[0]:]
test_labels = df_test['target'].to_numpy()

In [17]:
lr = LogisticRegression(C=1.0, max_iter=2000, class_weight='balanced')
lr.fit(train_feat, train_labels)

LogisticRegression(class_weight='balanced', max_iter=2000)

In [18]:
len(train_labels)/(2*np.bincount(train_labels)) #class_weights

array([ 0.51667902, 15.48888889])

In [19]:
lr.score(test_feat, test_labels)

0.9809688581314879

In [20]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_feat, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.931 (+/- 0.01)




In [21]:
preds = lr.predict(test_feat)

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_labels, preds)
print(cm)

[[563   7]
 [  4   4]]


In [23]:
recall = cm[1][1]/(cm[1][1] + cm[1][0])
precision = cm[1][1]/(cm[1][1] + cm[0][1])
f1_score = 2/((1/recall) + (1/precision))
print("Recall: ", recall)
print("Precision: ", precision)
print("F1-Score: ", f1_score)

Recall:  0.5
Precision:  0.36363636363636365
F1-Score:  0.42105263157894735


In [26]:
count=0
summary_indices=[]
for idx,i in enumerate(preds):
    if i==1:
        count+=1
        print(idx)
        summary_indices.append(idx)
print("Count")
print(count)

64
127
139
284
361
458
543
552
553
554
557
Count
11


In [35]:
actual_summary_indices=[]
for idx,i in enumerate(test_labels):
    if i==1:
        count+=1
        print(idx)
        actual_summary_indices.append(idx)
print("Count")
print(count)

543
547
548
549
550
552
553
554
Count
19


In [36]:
actual_summary_indices

[543, 547, 548, 549, 550, 552, 553, 554]

In [27]:
summary_indices

[64, 127, 139, 284, 361, 458, 543, 552, 553, 554, 557]

In [37]:
summary_list = [df_test['sentence'][i] for i in summary_indices]
summary_output = ' '.join(summary_list)

In [41]:
actual_summary_list = [df_test['sentence'][i] for i in actual_summary_indices]
actual_summary = ' '.join(actual_summary_list)

In [42]:
actual_summary

"Florida's highway system contains 1,495\xa0mi (2,406\xa0km) of interstate highway, and 10,601\xa0mi (17,061\xa0km) of non-interstate highway, such as state highways and U.S. In 2011, there were about 9,000 retail gas stations in the state. Floridians consumed 21\xa0million gallons of gasoline daily in 2011, ranking it third in national use behind California and Texas.Motorists have the 45th lowest rate of car insurance in the U.S. 24% are uninsured. Drivers between 15 and 19 years of age averaged 364 car crashes a year per ten thousand licensed Florida drivers in 2010. Drivers 70 and older averaged 95 per 10,000 during the same time frame. Intercity bus travel, which utilizes Florida's highway system, is provided by Greyhound, Megabus, and Amtrak Thruway Motorcoach. Before the construction of routes under the Federal Aid Highway Act of 1956, Florida began construction of a long cross-state toll road, Florida's Turnpike. The first section, from Fort Pierce south to the Golden Glades In

In [38]:
summary_output

'The road crossed the St. Johns River at a narrow point called Wacca Pilatka, or the British name "Cow Ford", reflecting the fact that cattle were brought across the river there. In the pre-automobile era, railroads played a key role in the state\'s development, particularly in coastal areas. In 1925, the Seaboard Air Line broke the FEC\'s southeast Florida monopoly and extended its freight and passenger service to West Palm Beach; two years later it extended passenger service to Miami. In 2012, 75% of the population lived within 10 miles (16\xa0km) of the coastline. The fast-growing I-4 corridor area, which runs through Central Florida and connects the cities of Daytona Beach, Orlando, and Tampa/St. Tourism makes up one of the largest sectors of the state economy, with nearly 1.4\xa0million people employed in the tourism industry in 2016 (a record for the state, surpassing the 1.2\xa0million employment from 2015). Florida\'s highway system contains 1,495\xa0mi (2,406\xa0km) of interst

In [44]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeLsum'], use_stemmer=True)
scores = scorer.score(actual_summary,summary_output)

In [45]:
scores

{'rouge1': Score(precision=0.4160839160839161, recall=0.672316384180791, fmeasure=0.5140388768898487),
 'rouge2': Score(precision=0.3263157894736842, recall=0.5284090909090909, fmeasure=0.40347071583514105),
 'rougeLsum': Score(precision=0.32167832167832167, recall=0.519774011299435, fmeasure=0.39740820734341253)}