In [1]:
# classifier = TextClassifier.load('en-sentiment')


In [None]:
# %pip install flair
# %pip install transformers torch
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
# Sample DataFrame preparation
all_df = pd.read_csv("data.csv")

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Splitting the dataset
train_df, test_df = train_test_split(all_df, test_size=0.2, random_state=42)

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import numpy as np
from torch.utils.data import Dataset

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the labels
label_encoder = LabelEncoder()
train_df['encoded_labels'] = label_encoder.fit_transform(train_df['sentiment'])
test_df['encoded_labels'] = label_encoder.transform(test_df['sentiment'])

# Preparing the dataset for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        texts = [str(text) for text in texts if pd.notnull(text) and text != '']
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)  # Example: Reduced to 128
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Convert DataFrame to Dataset
train_df.dropna(subset=['ProcessedComments'], inplace=True)
train_df = train_df[train_df['ProcessedComments'] != '']
train_df['ProcessedComments'] = train_df['ProcessedComments'].astype(str)

test_df.dropna(subset=['ProcessedComments'], inplace=True)
test_df = test_df[test_df['ProcessedComments'] != '']
test_df['ProcessedComments'] = test_df['ProcessedComments'].astype(str)

# Now proceed to create the dataset instances
train_dataset = SentimentDataset(train_df['ProcessedComments'].to_list(), train_df['encoded_labels'].to_list(), tokenizer)
test_dataset = SentimentDataset(test_df['ProcessedComments'].to_list(), test_df['encoded_labels'].to_list(), tokenizer)

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    # fp16=True,  # Enable mixed precision
    # gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    output_dir='./results',          
    num_train_epochs=1,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=8,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
)

# Train
trainer.train()

: 

In [25]:
import pandas as pd

from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings, DocumentPoolEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Sentence
from flair.models import TARSClassifier
from flair.embeddings import FlairEmbeddings, PooledFlairEmbeddings
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score, precision_score, recall_score

import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')


all_df = pd.read_csv('data.csv')

all_df=all_df[["ProcessedComments","sentiment"]]
all_df.dropna(subset=['ProcessedComments'], inplace=True)

all_df['formatted'] = all_df.apply(lambda row: row['ProcessedComments'] + ' __label__' + str(row['sentiment']), axis=1)


train, test = train_test_split(all_df, train_size = 0.8, shuffle = True, stratify=all_df["sentiment"])

In [39]:
data_folder = 'data'  # Folder where you want to save the files

# Save files
train.to_csv(f'{data_folder}/train.csv', index=False, header=False)
test.to_csv(f'{data_folder}/test.csv', index=False, header=False)
test.to_csv(f'{data_folder}/dev.csv', index=False, header=False)

In [40]:
all_df

Unnamed: 0,ProcessedComments,sentiment,formatted
0,everyone died one person got rich seems like s...,1,everyone died one person got rich seems like s...
1,u attentive sleuth knew something wrong since ...,-1,u attentive sleuth knew something wrong since ...
2,setup brave presearch swash provide liquidity ...,1,setup brave presearch swash provide liquidity ...
3,love tweet week profiting btc,1,love tweet week profiting btc __label__1
4,tldr member congress hold trade digital asset ...,1,tldr member congress hold trade digital asset ...
...,...,...,...
8495,vote feg coin buy billion early cheap possible...,0,vote feg coin buy billion early cheap possible...
8496,feeling people talking b bitcoin fee never rea...,-1,feeling people talking b bitcoin fee never rea...
8497,lmao nailed crowd well knowing enough rambling...,1,lmao nailed crowd well knowing enough rambling...
8498,sure diligence assume every project posted pro...,1,sure diligence assume every project posted pro...


In [41]:
from flair.datasets import CSVClassificationCorpus
from flair.data import Corpus
from pathlib import Path

# Directory where the train, test, and dev CSV files are located
data_folder = Path('data')

# Assuming the formatted data has two columns: the text and '__label__' prefixed label
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map={0: "ProcessedComments", 1: "sentiment"},
                                         train_file='train.csv',
                                         test_file='test.csv',
                                         dev_file='dev.csv',
                                         label_type='sentiment')  # If you don't have a dev set, Flair will split the train set

# Check what the corpus looks like
print(corpus)


2024-04-08 01:36:48,033 Reading data from data
2024-04-08 01:36:48,034 Train: data/train.csv
2024-04-08 01:36:48,034 Dev: data/dev.csv
2024-04-08 01:36:48,034 Test: data/test.csv


RuntimeError: No data provided when initializing corpus object.

In [None]:
from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings

# Use Flair's pre-trained embeddings
word_embeddings = [FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward')]

# Initialize document embeddings (consider using DocumentPoolEmbeddings or TransformerDocumentEmbeddings instead)
document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            hidden_size=512,
                                            reproject_words=True,
                                            reproject_words_dimension=256)
from flair.models import TextClassifier

# Initialize the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)

from flair.trainers import ModelTrainer

# Initialize the model trainer
trainer = ModelTrainer(classifier, corpus)

# Start the training process
trainer.train('models/classification',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)

