In [61]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset

dataset = load_dataset("csv", data_files="feedingdata.csv",split="train")
dataset = dataset.rename_column("is_recommend", "label")
dataset = dataset.train_test_split(test_size=0.2)


In [63]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [64]:
def preprocess_function(examples):
    return tokenizer(examples["text"] , truncation=True)

In [65]:
tokenized = dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 80/80 [00:00<00:00, 2020.99 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 862.77 examples/s]


In [66]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [67]:
import evaluate
accuracy = evaluate.load('accuracy')

In [68]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [69]:
id2label = {0: "NO", 1: "YES"}
label2id = {"NO":0, "YES":1}

In [70]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


[A
[A

[A[A                                       
                                              
  0%|          | 0/10 [05:45<?, ?it/s]
[ACheckpoint destination directory my_awesome_model\checkpoint-5 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.5823211669921875, 'eval_accuracy': 0.75, 'eval_runtime': 6.8153, 'eval_samples_per_second': 2.935, 'eval_steps_per_second': 0.293, 'epoch': 1.0}



[A
[A
                                               

  0%|          | 0/10 [07:55<?, ?it/s]       
[A
[ACheckpoint destination directory my_awesome_model\checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.4569928050041199, 'eval_accuracy': 0.9, 'eval_runtime': 6.6858, 'eval_samples_per_second': 2.991, 'eval_steps_per_second': 0.299, 'epoch': 2.0}



[A
[A
                                               

  0%|          | 0/10 [10:45<?, ?it/s]       
[A
[A

{'eval_loss': 0.3677009642124176, 'eval_accuracy': 0.95, 'eval_runtime': 6.7576, 'eval_samples_per_second': 2.96, 'eval_steps_per_second': 0.296, 'epoch': 3.0}



[A
[A
                                               

  0%|          | 0/10 [12:57<?, ?it/s]       
[A
[A

{'eval_loss': 0.31894955039024353, 'eval_accuracy': 1.0, 'eval_runtime': 6.8633, 'eval_samples_per_second': 2.914, 'eval_steps_per_second': 0.291, 'epoch': 4.0}



[A
[A
                                               

  0%|          | 0/10 [15:42<?, ?it/s]       
[A
[A

{'eval_loss': 0.2826727032661438, 'eval_accuracy': 1.0, 'eval_runtime': 6.6816, 'eval_samples_per_second': 2.993, 'eval_steps_per_second': 0.299, 'epoch': 5.0}



[A
[A
                                               

  0%|          | 0/10 [18:47<?, ?it/s]       
[A
[A

{'eval_loss': 0.25267094373703003, 'eval_accuracy': 1.0, 'eval_runtime': 6.9562, 'eval_samples_per_second': 2.875, 'eval_steps_per_second': 0.288, 'epoch': 6.0}



[A
[A
                                               

  0%|          | 0/10 [20:59<?, ?it/s]       
[A
[A

{'eval_loss': 0.2326316088438034, 'eval_accuracy': 1.0, 'eval_runtime': 6.6343, 'eval_samples_per_second': 3.015, 'eval_steps_per_second': 0.301, 'epoch': 7.0}



[A
[A
                                               

  0%|          | 0/10 [23:14<?, ?it/s]       
[A
[A

{'eval_loss': 0.21717670559883118, 'eval_accuracy': 1.0, 'eval_runtime': 6.7072, 'eval_samples_per_second': 2.982, 'eval_steps_per_second': 0.298, 'epoch': 8.0}



[A
[A
                                               

  0%|          | 0/10 [24:53<?, ?it/s]       
[A
[A

{'eval_loss': 0.20838549733161926, 'eval_accuracy': 1.0, 'eval_runtime': 7.5182, 'eval_samples_per_second': 2.66, 'eval_steps_per_second': 0.266, 'epoch': 9.0}



[A
[A
                                               

  0%|          | 0/10 [27:24<?, ?it/s]       
[A
[A

{'eval_loss': 0.20403368771076202, 'eval_accuracy': 1.0, 'eval_runtime': 6.4782, 'eval_samples_per_second': 3.087, 'eval_steps_per_second': 0.309, 'epoch': 10.0}



100%|██████████| 50/50 [24:24<00:00, 29.29s/it]

{'train_runtime': 1464.2915, 'train_samples_per_second': 0.546, 'train_steps_per_second': 0.034, 'train_loss': 0.3988014602661133, 'epoch': 10.0}





TrainOutput(global_step=50, training_loss=0.3988014602661133, metrics={'train_runtime': 1464.2915, 'train_samples_per_second': 0.546, 'train_steps_per_second': 0.034, 'train_loss': 0.3988014602661133, 'epoch': 10.0})

In [72]:
trainer.save_model("intent_recognition_model")

In [5]:
from transformers import pipeline
classifier = pipeline('text-classification', model='./intent_recognition_model',max_length=512)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'label': 'YES', 'score': 0.927436351776123}]

In [14]:
import pandas as pd
df = pd.read_csv('laptop_found.csv')
df = df.dropna(subset=['text'])
comments = df['text'].tolist()

In [18]:
x = classifier(comments)

In [29]:
df['is_recommend'] = res

In [26]:
x

[{'label': 'NO', 'score': 0.6992284059524536},
 {'label': 'YES', 'score': 0.895643413066864},
 {'label': 'YES', 'score': 0.9632229804992676},
 {'label': 'YES', 'score': 0.9624307751655579},
 {'label': 'NO', 'score': 0.6937205195426941},
 {'label': 'NO', 'score': 0.671192467212677},
 {'label': 'NO', 'score': 0.6963024735450745},
 {'label': 'YES', 'score': 0.9474329948425293},
 {'label': 'YES', 'score': 0.9145470857620239},
 {'label': 'NO', 'score': 0.6990416646003723},
 {'label': 'NO', 'score': 0.6990416646003723},
 {'label': 'NO', 'score': 0.7022157907485962},
 {'label': 'NO', 'score': 0.6990416646003723},
 {'label': 'YES', 'score': 0.7094553709030151},
 {'label': 'NO', 'score': 0.6874908804893494},
 {'label': 'YES', 'score': 0.9663295149803162},
 {'label': 'YES', 'score': 0.8819924592971802},
 {'label': 'NO', 'score': 0.7135627269744873},
 {'label': 'NO', 'score': 0.6941422820091248},
 {'label': 'NO', 'score': 0.7242318391799927},
 {'label': 'NO', 'score': 0.6526045799255371},
 {'labe

In [27]:
res = []
for i in x:
    if i['label'] == 'YES':
        res.append(1)
    else:
        res.append(0)


In [30]:
df.head()

Unnamed: 0,score,date,author,link,text,thread_id,id,parent_id,laptop_found,is_recommend
0,1,1/1/2023,u/super1s,https://www.reddit.com/r/SuggestALaptop/commen...,What are you looking to do? If it is just scho...,1009n9k,j2gictr,t3_1009n9k,[],0
1,2,1/1/2023,u/MakarsViolin,https://www.reddit.com/r/SuggestALaptop/commen...,I've got a galaxy flex2 alpha for school and p...,10074il,j2gti48,t3_10074il,[],1
2,-1,1/1/2023,u/wikiwiki62,https://www.reddit.com/r/SuggestALaptop/commen...,I would recommend [**Acer Swift X**]( because ...,1009lra,j2gwx8i,t3_1009lra,['Acer Swift X 14 SFX14-71G'],1
3,2,1/1/2023,u/wikiwiki62,https://www.reddit.com/r/SuggestALaptop/commen...,# I would recommend [Acer Swift X]( \n\nbecau...,10074il,j2gx1bd,t3_10074il,['Acer Swift X 14 SFX14-71G'],1
4,4,1/1/2023,u/tylerwatt12,https://www.reddit.com/r/SuggestALaptop/commen...,Please use the laptop form so we can get more ...,100c19m,j2h3alr,t3_100c19m,[],0


In [31]:
df.to_csv('laptop_found_intent.csv', index=False)