<a href="https://colab.research.google.com/github/mathguy-r/sentiment-analysis-with-transformers/blob/temp/sentiment_analysis_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Initialization

In [None]:
!pip install transformers datasets torch --no-cache-dir
!pip install tensorboard --no-cache-dir


ModuleNotFoundError: No module named 'torch'

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorWithPadding, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


## Data Handling

In [None]:
# Labelled examples used for fine-tuning. Later cells read the `text` and
# `label` columns of this frame.
# (A previous variant of this cell read from `obj['Body']` instead of a local file.)
DATA_PATH = 'data_tagged_labelled_7k_2.csv'
data = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of a bare KeyError further down.
missing_columns = {'text', 'label'} - set(data.columns)
assert not missing_columns, f"{DATA_PATH} is missing required columns: {sorted(missing_columns)}"

data.head(2)



## Preprocessing

In [None]:
## preprocessing
from sklearn.model_selection import train_test_split

# Texts and labels are taken from the frame unchanged, one list per column.
# (An earlier, now-disabled variant mapped 'Low'/'Medium'/'High' to 0/1/2 in a
# `binary_labelled` column and kept at most 8000 `VALUE2` rows per class.)
# NOTE(review): the classifier below is created with num_labels=3, so `label`
# presumably already holds integer class ids - confirm against the CSV.
X = data['text'].tolist()
y = data['label'].tolist()
print(len(X), len(y))


## Model Training

In [None]:
from transformers import DataCollatorWithPadding, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Which classes to instantiate and where to load the checkpoint from.
# '.' is passed straight to from_pretrained, i.e. the current directory.
model_class = AutoModelForSequenceClassification
tokenizer_class = AutoTokenizer
pretrained_weights = '.'

tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)
# Pads each batch on the fly and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
model = model_class.from_pretrained(pretrained_weights, num_labels=3)

# 70/30 split, stratified on the label, with a fixed seed.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Keep a copy of the held-out split on disk.
test_df = pd.DataFrame({'text': Xtest, 'label': ytest})
test_df.to_csv('test_data.csv', index=False)

train_dataset = Dataset.from_dict({'text': Xtrain, 'label': ytrain})
test_dataset = Dataset.from_dict({'text': Xtest, 'label': ytest})


def preprocess_function(x):
    """Tokenize the `text` field of a batch, truncating over-long inputs."""
    return tokenizer(x['text'], truncation=True)


tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred, average='macro'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores); the predicted class is the argmax over the
            last axis of `predictions`.
        average: Averaging mode forwarded to
            `precision_recall_fscore_support`. Defaults to 'macro'.
            With 'micro' on single-label multi-class data, precision, recall
            and F1 all collapse to the accuracy value, so the four reported
            numbers would carry no extra information.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
model_output_dir = "./results"
model_saving_dir = "./saved"
model_logging_dir = "./logs"

%load_ext tensorboard
%tensorboard --logdir '{model_logging_dir}'/runs
training_args = TrainingArguments(
    # where checkpoints and metric files go
    output_dir=model_output_dir,
    # optimisation schedule
    num_train_epochs=2,
    learning_rate=2e-6,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # logging: one TensorBoard record every 100 steps
    report_to="tensorboard",
    logging_dir=model_logging_dir,
    logging_strategy="steps",
    logging_steps=100,
)

# The held-out split doubles as the evaluation set; metrics come from
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()


saving_path = model_saving_dir
trainer.save_model(saving_path)
tokenizer.save_pretrained(saving_path)

#### Logging
for obj in trainer.state.log_history:
    print(obj)
    
# save train results
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# save eval results
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

%reload_ext tensorboard
%tensorboard --logdir '{model_logging_dir}'


## Inference

In [None]:
### Load finetuned Model for prediction

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned weights and tokenizer from the 'saved' directory.
model_class, tokenizer_class, pretrained_weights = (AutoModelForSequenceClassification, AutoTokenizer, 'saved')

finetuned_tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)
finetuned_model = model_class.from_pretrained(pretrained_weights, num_labels=3)
from transformers import TextClassificationPipeline
text = "75 thousands"
pipe = TextClassificationPipeline(model=finetuned_model, tokenizer=finetuned_tokenizer, return_all_scores=False)
# A notebook cell only echoes its last expression, so the prediction has to be
# printed explicitly or it is silently discarded.
print(pipe(text))

# NOTE(review): this uses the `trainer` object from the training cell (and its
# in-memory model, not `finetuned_model`), so this cell fails in a kernel where
# training has not been run - confirm whether it is still wanted here.
trainer.evaluate()