In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set_florida.csv')
df = df_train.append(df_test)

In [3]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [4]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [5]:
tokenized = df['sentence'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [6]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [7]:
np.array(padded).shape

(3366, 166)

In [8]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3366, 166)

In [9]:
input_ids = torch.tensor(np.array(padded))
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [10]:
last_hidden_states[0][:,0,0]

tensor([-0.5774, -0.4186, -0.6535,  ..., -0.5505, -0.5987, -0.3164])

In [11]:
features = last_hidden_states[0][:,0,:].numpy()

In [12]:
features.shape

(3366, 768)

In [13]:
df_train.shape

(2788, 2)

In [14]:
train_feat = features[:df_train.shape[0]]
train_labels = df_train['target'].to_numpy()
test_feat = features[df_train.shape[0]:]
test_labels = df_test['target'].to_numpy()

In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
max_features = ['auto']
max_depth = [int(x) for x in np.linspace(1, 8, num = 8)]
max_depth.append(None)
min_samples_split = [1,2,3,4,5,8, 10]
min_samples_leaf = [1,2,3,4,5,8, 10]
bootstrap = [True, False]
class_weight = ['balanced','balanced_subsample']
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
                'class_weight':class_weight}
rf = RandomForestClassifier()
# Instantiate the grid search model
#grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
 #                         cv = 3, n_jobs = -1, verbose = 2, scoring='f1',refit=True)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                               n_iter = 200, cv = 3, verbose=2, random_state=0, n_jobs = -1, scoring='recall')

In [63]:
rf_random.get_params()
#grid_search.get_params()

{'cv': 3,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(),
 'iid': 'deprecated',
 'n_iter': 200,
 'n_jobs': -1,
 'param_distributions': {'n_estimators': [100,
   200,
   300,
   400,
   500,
   600,
   700,
   800,
   900,
   1000,
   1100,
   1200,
   1300,
   1400,
   1500,
   1600,
   1700,
   1800,
   1900,
   2000],
  'max_features':

In [64]:
#grid_search.fit(train_feat, train_labels)
rf_random.fit(train_feat, train_labels)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  7.7min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'class_weight': ['balanced',
                                                         'balanced_subsample'],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8,
                                                      None],
                                        'max_features': ['auto'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 8,
                                                             10],
                                        'min_samples_split': [1, 2, 3, 4, 5, 8,
                                                              10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                             

In [65]:
rf_random.best_params_

{'n_estimators': 1100,
 'min_samples_split': 5,
 'min_samples_leaf': 8,
 'max_features': 'auto',
 'max_depth': 1,
 'class_weight': 'balanced',
 'bootstrap': False}

In [66]:
rf_random.best_score_

0.7666666666666666

In [84]:
rf = RandomForestClassifier(n_estimators = 1100, random_state = 0, max_depth=3, 
                            min_samples_leaf=5, min_samples_split=5, max_features='auto',
                            class_weight='balanced',bootstrap=False)
#scoring=F1
#rf = RandomForestClassifier(n_estimators = 2000, random_state = 0, max_depth=10, 
 #                           min_samples_leaf=4, min_samples_split=5, max_features='auto',
  #                          class_weight='balanced',bootstrap=False)
rf.fit(train_feat, train_labels);

In [85]:
len(train_labels)/(2*np.bincount(train_labels)) #class_weights

array([ 0.51667902, 15.48888889])

In [86]:
rf.score(test_feat, test_labels)

0.9740484429065744

In [87]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_feat, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.933 (+/- 0.01)




In [88]:
preds = rf.predict(test_feat)

In [89]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_labels, preds)
print(cm)

[[559  11]
 [  4   4]]


In [90]:
recall = cm[1][1]/(cm[1][1] + cm[1][0])
precision = cm[1][1]/(cm[1][1] + cm[0][1])
f1_score = 2/((1/recall) + (1/precision))
print("Recall: ", recall)
print("Precision: ", precision)
print("F1-Score: ", f1_score)

Recall:  0.5
Precision:  0.26666666666666666
F1-Score:  0.34782608695652173


In [91]:
count=0
summary_indices=[]
for idx,i in enumerate(preds):
    if i==1:
        count+=1
        print(idx)
        summary_indices.append(idx)
print("Count")
print(count)

139
361
464
503
505
507
543
545
546
552
553
554
555
556
557
Count
15


In [92]:
actual_summary_indices=[]
for idx,i in enumerate(test_labels):
    if i==1:
        count+=1
        print(idx)
        actual_summary_indices.append(idx)
print("Count")
print(count)

543
547
548
549
550
552
553
554
Count
23


In [93]:
actual_summary_indices

[543, 547, 548, 549, 550, 552, 553, 554]

In [94]:
summary_indices

[139, 361, 464, 503, 505, 507, 543, 545, 546, 552, 553, 554, 555, 556, 557]

In [95]:
summary_list = [df_test['sentence'][i] for i in summary_indices]
summary_output = ' '.join(summary_list)

In [96]:
actual_summary_list = [df_test['sentence'][i] for i in actual_summary_indices]
actual_summary = ' '.join(actual_summary_list)

In [97]:
actual_summary

"Florida's highway system contains 1,495\xa0mi (2,406\xa0km) of interstate highway, and 10,601\xa0mi (17,061\xa0km) of non-interstate highway, such as state highways and U.S. In 2011, there were about 9,000 retail gas stations in the state. Floridians consumed 21\xa0million gallons of gasoline daily in 2011, ranking it third in national use behind California and Texas.Motorists have the 45th lowest rate of car insurance in the U.S. 24% are uninsured. Drivers between 15 and 19 years of age averaged 364 car crashes a year per ten thousand licensed Florida drivers in 2010. Drivers 70 and older averaged 95 per 10,000 during the same time frame. Intercity bus travel, which utilizes Florida's highway system, is provided by Greyhound, Megabus, and Amtrak Thruway Motorcoach. Before the construction of routes under the Federal Aid Highway Act of 1956, Florida began construction of a long cross-state toll road, Florida's Turnpike. The first section, from Fort Pierce south to the Golden Glades In

In [98]:
summary_output

"In 1925, the Seaboard Air Line broke the FEC's southeast Florida monopoly and extended its freight and passenger service to West Palm Beach; two years later it extended passenger service to Miami. The fast-growing I-4 corridor area, which runs through Central Florida and connects the cities of Daytona Beach, Orlando, and Tampa/St. Amusement parks, especially in the Greater Orlando area, make up a significant portion of tourism. Major ports in Florida include Port Tampa Bay in Tampa, Port Everglades in Fort Lauderdale, Port of Jacksonville in Jacksonville, PortMiami in Miami, Port Canaveral in Brevard County, Port Manatee in Manatee County, and Port of Palm Beach in Riviera Beach. Port Tampa Bay meanwhile is the largest in the state, having the most tonnage. It is the largest, most diversified port in Florida, has an economic impact of more than $15.1\xa0billion, and supports more than 80,000 jobs. Florida's highway system contains 1,495\xa0mi (2,406\xa0km) of interstate highway, and 1

In [99]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeLsum'], use_stemmer=True)
scores = scorer.score(actual_summary,summary_output)

In [100]:
scores

{'rouge1': Score(precision=0.3562874251497006, recall=0.672316384180791, fmeasure=0.4657534246575343),
 'rouge2': Score(precision=0.2852852852852853, recall=0.5397727272727273, fmeasure=0.3732809430255403),
 'rougeLsum': Score(precision=0.2934131736526946, recall=0.5536723163841808, fmeasure=0.3835616438356164)}