In [1]:
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import spacy

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Load the corpus CSV into a DataFrame (path is relative to the working directory).
df=pd.read_csv("../data/newcorp.csv")

In [4]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   article_id          266 non-null    int64 
 1   title               266 non-null    object
 2   article_text        266 non-null    object
 3   summary_1_text      266 non-null    object
 4   summary_2_text      266 non-null    object
 5   summary_3_text      266 non-null    object
 6   summary_4_text      266 non-null    object
 7   summary_5_text      266 non-null    object
 8   thesis_1_text       266 non-null    object
 9   thesis_2_text       266 non-null    object
 10  thesis_3_text       266 non-null    object
 11  thesis_4_text       266 non-null    object
 12  thesis_5_text       266 non-null    object
 13  lead                266 non-null    object
 14  body                266 non-null    object
 15  conclusion          266 non-null    object
 16  article_segments    266 no

In [5]:
# Notebook-level accumulators that datasetmaker() appends to:
# one entry per segment, texts and labels kept in parallel.
text_list, label_list = [], []

In [6]:
def datasetmaker(x, texts=None, labels=None):
    """Flatten one article's JSON-encoded segments into parallel text/label lists.

    Parameters
    ----------
    x : str
        JSON document with a "paragraphs" key holding a list of paragraphs,
        each paragraph being a list of objects with "text" and "label" keys.
    texts, labels : list, optional
        Lists to append to. When omitted, the notebook-level ``text_list`` and
        ``label_list`` are used, which is the original behaviour.

    Returns
    -------
    None
        Results are delivered by appending to the target lists.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is not valid JSON.
    KeyError
        If "paragraphs", "text" or "label" is missing.
    """
    # Resolve the globals lazily so callers passing their own lists never touch them.
    if texts is None:
        texts = text_list
    if labels is None:
        labels = label_list

    for paragraph in json.loads(x)["paragraphs"]:
        for segment in paragraph:
            texts.append(segment["text"])
            labels.append(segment["label"])

In [7]:
# Empty the accumulators first so re-running this cell does not append every
# segment a second time (datasetmaker only ever appends).
text_list.clear()
label_list.clear()
# apply() is used purely for datasetmaker's side effects; it returns a Series of None.
df["article_segments"].apply(datasetmaker)

0      None
1      None
2      None
3      None
4      None
       ... 
261    None
262    None
263    None
264    None
265    None
Name: article_segments, Length: 266, dtype: object

In [8]:
# Number of entries accumulated in text_list.
len(text_list)

28772

In [9]:
# Number of entries accumulated in label_list (should equal len(text_list)).
len(label_list)

28772

In [10]:
# Column-name -> values mapping; turned into a DataFrame in the next cell.
df_main = dict(text=text_list, labels=label_list)

In [11]:
# Replace the dict with a two-column DataFrame ("text", "labels").
df_main = pd.DataFrame(data=df_main)

In [12]:
# Preview the first rows of the segment table.
df_main.head()

Unnamed: 0,text,labels
0,"2015: Beyond Obama, new Congress, we need a re...",title
1,"In the film, ""Girl Interrupted,"" Winona Ryder ...",anecdote
2,.,no-unit
3,The year is 1967,anecdote
4,and,no-unit


In [13]:
# Save the segment table to disk as a pickle file.
df_main.to_pickle("../data/df_main.pkl")

In [None]:
# Number of collected segments (same check as earlier in the notebook).
len(text_list)

In [None]:
# Count how many segments carry each label.
df_main["labels"].value_counts()

In [None]:
# Bar chart of segments per label; tick labels are tilted and right-aligned.
sns.countplot(x="labels", data=df_main)
plt.xticks(ha='right', rotation=45)

In [None]:
# Texts of all segments labelled "no-unit".
df_main.loc[df_main["labels"] == "no-unit", "text"]

In [None]:
# Number of space-separated pieces in each "no-unit" segment.
len_no_unit = df_main.loc[df_main["labels"] == "no-unit", "text"].map(
    lambda segment: len(segment.split(" "))
)

In [None]:
# Most frequent piece counts among the "no-unit" segments.
len_no_unit.value_counts().head()

#### Many of the "no-unit" segments appear to consist of just a single punctuation mark.
#### Discarding these segments should be considered,
#### because they make the dataset very imbalanced.

In [None]:
# dividing the dataset
from sklearn import preprocessing

# Encode the string labels as integers. The encoded values go into a new Series
# instead of overwriting df_main["labels"]: the earlier cells that filter on the
# string labels keep working, and re-running this cell never re-fits the encoder
# on already-encoded integers (which would lose the label names in le.classes_).
le = preprocessing.LabelEncoder()
X = df_main["text"]
# Named "labels" so the exported CSVs keep the "text"/"labels" columns.
y = pd.Series(le.fit_transform(df_main["labels"]), index=df_main.index, name="labels")

# 80/20 split, seeded and stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Re-join text and label columns and write both splits without the index column.
Train = pd.concat([X_train, y_train], axis=1)
Test = pd.concat([X_test, y_test], axis=1)
Train.to_csv("../data/train.csv", index=False)
Test.to_csv("../data/test.csv", index=False)

In [None]:
# Fine-tune a pre-trained Transformers model (DistilBERT) with a manual PyTorch training loop

In [None]:
from datasets import load_dataset

In [None]:
# Load the exported CSV files as the 'train' and 'test' splits.
dataset = load_dataset('csv', data_files={'train': ['../data/train.csv'],'test': '../data/test.csv'})

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the 'distilbert-base-uncased' checkpoint (same checkpoint name as the model loaded later).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Display the loaded splits.
dataset

In [None]:
def tokenize_data(example):
    """Tokenize the 'text' field of ``example`` with the notebook-level tokenizer.

    Sequences are truncated and padded to the maximum length; the tokenizer's
    output is returned unchanged.
    """
    texts = example['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

In [None]:
# Apply tokenize_data to every split, in batches.
tokenized_datasets = dataset.map(tokenize_data, batched=True)

In [None]:
# Display the splits after tokenization.
tokenized_datasets

In [None]:
# Drop the raw "text" column now that it has been tokenized.
# NOTE(review): this reassigns tokenized_datasets from itself, so re-running the
# cell presumably fails once "text" is gone — confirm.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# Display the splits after dropping the "text" column.
tokenized_datasets

In [None]:
# Make the datasets return PyTorch tensors. Without this the DataLoader's default
# collation turns the token columns into Python lists, and `v.to(device)` in the
# training loop fails.
tokenized_datasets.set_format("torch")

In [None]:
# Shuffled subsets (seed 42): the first 1000 training and first 500 test examples.
# NOTE(review): presumably subsampled to keep the run short — confirm before reporting results.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

In [None]:
from torch.utils.data import DataLoader

# Batches of 4; only the training loader reshuffles its data.
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=4)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=4)

In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the fitted LabelEncoder instead of a
# hard-coded 8, so it always matches the number of distinct labels in the data.
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Show which device was selected.
print(device)

In [None]:
# Move the model's parameters to the selected device.
model.to(device)

In [None]:
from datasets import load_metric

In [None]:
# --- Training: three passes over the training loader ---
model.train()

for epoch in range(3):
    for batch in train_dataloader:
        # Move every tensor in the batch onto `device` before the forward pass.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# --- Evaluation: accumulate accuracy over the eval loader ---
metric = load_metric("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])


In [None]:
# Final accuracy over all batches added during evaluation.
metric.compute()