In [1]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer,AutoModelForSequenceClassification,AutoModel
import torch
import numpy as np

In [2]:
# loading data

In [3]:
train_set=pd.read_csv("../../datasets/classification/train.csv")

In [4]:
train_set=train_set[['text','target']]

In [5]:
train_set

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [6]:
test_set=pd.read_csv("../../datasets/classification/test.csv")

In [7]:
test_set=test_set[['text']]

In [8]:
len(test_set)

3263

In [9]:
test_set

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [10]:
# generating encoding via tokenizer

In [11]:
checkpoint='distilbert-base-uncased'

In [12]:
tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [13]:
texts_train=list(train_set['text'])

In [14]:
text_encodings_train=tokenizer(texts_train,return_tensors='pt',padding=True,truncation=True)

In [15]:
text_encodings_train.keys()

dict_keys(['input_ids', 'attention_mask'])

In [16]:
len(text_encodings_train['input_ids'])

7613

In [17]:
texts_test=list(test_set['text'])

In [18]:
len(texts_test)

3263

In [19]:
text_encodings_test=tokenizer(texts_test,return_tensors='pt',padding=True,truncation=True)

In [20]:
text_encodings_test.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
len(text_encodings_test['input_ids'])

3263

In [22]:
# loading model

In [23]:
model=AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
text_encodings_train={k:v for k,v in text_encodings_train.items()}

In [25]:
text_encodings_test={k:v for k,v in text_encodings_test.items()}

In [26]:
with torch.no_grad():
    outputs=model(**text_encodings_train)

In [27]:
outputs.last_hidden_state.shape

torch.Size([7613, 84, 768])

In [28]:
with torch.no_grad():
    outputs_test=model(**text_encodings_test)

In [29]:
outputs.last_hidden_state.shape

torch.Size([7613, 84, 768])

In [30]:
# we will be taking embedding of start token

In [31]:
train_embeddings=outputs.last_hidden_state[:,0,:]

In [32]:
test_embeddings=outputs_test.last_hidden_state[:,0,:]

In [33]:
train_embeddings.shape

torch.Size([7613, 768])

In [34]:
test_embeddings.shape

torch.Size([3263, 768])

In [35]:
# training ML model

In [36]:
train_set['embeddings']=train_embeddings.tolist()

In [37]:
train_set

Unnamed: 0,text,target,embeddings
0,Our Deeds are the Reason of this #earthquake M...,1,"[-0.0077901557087898254, 0.24342800676822662, ..."
1,Forest fire near La Ronge Sask. Canada,1,"[-0.5075259208679199, 0.1033860296010971, -0.6..."
2,All residents asked to 'shelter in place' are ...,1,"[-0.14549508690834045, -0.16572265326976776, 0..."
3,"13,000 people receive #wildfires evacuation or...",1,"[-0.1734803318977356, -0.2343343198299408, -0...."
4,Just got sent this photo from Ruby #Alaska as ...,1,"[0.13774585723876953, -0.15708087384700775, -0..."
...,...,...,...
7608,Two giant cranes holding a bridge collapse int...,1,"[-0.01304292306303978, -0.053651973605155945, ..."
7609,@aria_ahrary @TheTawniest The out of control w...,1,"[-0.006948217749595642, -0.11735769361257553, ..."
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[-0.30757129192352295, -0.13051368296146393, 0..."
7611,Police investigating after an e-bike collided ...,1,"[-0.37730127573013306, -0.4288996756076813, -0..."


In [38]:
test_set['embeddings']=test_embeddings.tolist()

In [39]:
from sklearn.linear_model import LogisticRegression

In [51]:
xgb_classifier=LogisticRegression(solver='liblinear')

In [52]:
X=np.array(list(train_set['embeddings']))
X_test=np.array(list(test_set['embeddings']))

In [53]:
X.shape

(7613, 768)

In [54]:
y=np.array(list(train_set['target']))

In [55]:
y.shape

(7613,)

In [56]:
xgb_classifier.fit(X,y)

LogisticRegression(solver='liblinear')

In [62]:
predictions=list(xgb_classifier.predict(X_test))

In [63]:
len(predictions)

3263

In [64]:
predictions.count(0)

2014

In [65]:
test_set=pd.read_csv("../../datasets/classification/test.csv")

In [66]:
result=pd.DataFrame()
result['id']=test_set['id']
result['target']=predictions

In [67]:
result

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [68]:
result.to_csv("../../results/classification/feature_extraction_ml_model.csv",index=False)

In [69]:
# got F1 score to be 0.80784 on kaggle