In [1]:
from datasets import load_dataset
import pandas as pd
from feature.selector import Selective, SelectionMethod
from textwiser import TextWiser, Embedding, Transformation


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
args={}
args['selection_percentage'] = 0.1

In [45]:
def get_selected_data(df, args, text_column="text", label_column="category", max_clusters=2000):
    num_rows = len(df)
    df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
    df_T = df.loc[:, text_column].to_frame().T
    
    labels = pd.get_dummies(df[label_column], dtype=int)
    labels.columns = ["label_" + str(i) for i in range(1, len(labels.columns)+1)] 
    labels_T = labels.T
    
    # TextWiser featurization method to create text embeddings
    textwiser = TextWiser(Embedding.TfIdf(), Transformation.NMF(n_components=20))

    # Text-based selection
    # The goal is to select a subset of articles 
    # that is most diverse in the text embedding space of articles
    # and covers the most labels in each topic
    selector = Selective(SelectionMethod.TextBased(num_features=min(round(num_rows*args["selection_percentage"]), max_clusters), 
                                                   featurization_method=textwiser,
                                                   optimization_method='kmeans'))

    # Feature reduction
    subset = selector.fit_transform(df_T, labels_T)
    
    return df.loc[subset.columns, ["instruction", "input", "output"]]

In [5]:
dataset = load_dataset("conceptofmind/flan2021_submix_original", split="train")

Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 536/536 [00:00<00:00, 403kB/s]
Resolving data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 148.10it/s]
Downloading data files:   0%|                                                                                                                                                    | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                                                                                                                                                     | 0.00/301M [00:00<?, ?B/s][A
Downloading data:   1%|█▉                                                                                                                                          | 4.19M/301M [00:00<00:55, 5.36MB/

Downloading data:   0%|                                                                                                                                                     | 0.00/305M [00:00<?, ?B/s][A
Downloading data:   1%|█▉                                                                                                                                          | 4.19M/305M [00:00<00:39, 7.58MB/s][A
Downloading data:   4%|█████▊                                                                                                                                      | 12.6M/305M [00:01<00:31, 9.43MB/s][A
Downloading data:   7%|█████████▋                                                                                                                                  | 21.0M/305M [00:02<00:28, 9.88MB/s][A
Downloading data:  10%|█████████████▍                                                                                                                              | 29.4M/305M [00:02<00:25

Downloading data:   4%|█████▊                                                                                                                                      | 12.6M/306M [00:01<00:23, 12.6MB/s][A
Downloading data:   7%|█████████▌                                                                                                                                  | 21.0M/306M [00:01<00:20, 13.9MB/s][A
Downloading data:  10%|█████████████▍                                                                                                                              | 29.4M/306M [00:02<00:19, 14.2MB/s][A
Downloading data:  12%|█████████████████▎                                                                                                                          | 37.7M/306M [00:02<00:19, 13.7MB/s][A
Downloading data:  15%|█████████████████████▏                                                                                                                      | 46.1M/306M [00:03<00:18

Downloading data:  10%|█████████████▍                                                                                                                              | 29.4M/305M [00:03<00:27, 9.89MB/s][A
Downloading data:  12%|█████████████████▎                                                                                                                          | 37.7M/305M [00:03<00:22, 12.1MB/s][A
Downloading data:  15%|█████████████████████▏                                                                                                                      | 46.1M/305M [00:04<00:26, 9.87MB/s][A
Downloading data:  18%|█████████████████████████                                                                                                                   | 54.5M/305M [00:05<00:22, 11.1MB/s][A
Downloading data:  21%|████████████████████████████▊                                                                                                               | 62.9M/305M [00:06<00:20

Downloading data:  15%|█████████████████████▏                                                                                                                      | 46.1M/305M [00:03<00:17, 14.4MB/s][A
Downloading data:  18%|█████████████████████████                                                                                                                   | 54.5M/305M [00:04<00:21, 11.6MB/s][A
Downloading data:  21%|████████████████████████████▉                                                                                                               | 62.9M/305M [00:05<00:20, 12.0MB/s][A
Downloading data:  23%|████████████████████████████████▊                                                                                                           | 71.3M/305M [00:06<00:18, 12.9MB/s][A
Downloading data:  26%|████████████████████████████████████▌                                                                                                       | 79.7M/305M [00:06<00:16

Downloading data:  21%|████████████████████████████▊                                                                                                               | 62.9M/305M [00:04<00:15, 15.4MB/s][A
Downloading data:  23%|████████████████████████████████▋                                                                                                           | 71.3M/305M [00:04<00:15, 14.8MB/s][A
Downloading data:  26%|████████████████████████████████████▌                                                                                                       | 79.7M/305M [00:05<00:14, 15.4MB/s][A
Downloading data:  29%|████████████████████████████████████████▍                                                                                                   | 88.1M/305M [00:05<00:13, 16.0MB/s][A
Downloading data:  32%|████████████████████████████████████████████▎                                                                                               | 96.5M/305M [00:06<00:12

Downloading data:  26%|████████████████████████████████████▌                                                                                                       | 79.7M/305M [00:05<00:14, 16.1MB/s][A
Downloading data:  29%|████████████████████████████████████████▎                                                                                                   | 88.1M/305M [00:06<00:13, 16.5MB/s][A
Downloading data:  32%|████████████████████████████████████████████▏                                                                                               | 96.5M/305M [00:06<00:12, 17.2MB/s][A
Downloading data:  34%|████████████████████████████████████████████████▍                                                                                            | 105M/305M [00:07<00:13, 15.4MB/s][A
Downloading data:  37%|████████████████████████████████████████████████████▎                                                                                        | 113M/305M [00:07<00:13

Downloading data:  32%|████████████████████████████████████████████▍                                                                                               | 96.5M/304M [00:06<00:13, 15.4MB/s][A
Downloading data:  34%|████████████████████████████████████████████████▌                                                                                            | 105M/304M [00:07<00:14, 13.6MB/s][A
Downloading data:  37%|████████████████████████████████████████████████████▍                                                                                        | 113M/304M [00:08<00:13, 14.0MB/s][A
Downloading data:  40%|████████████████████████████████████████████████████████▍                                                                                    | 122M/304M [00:09<00:14, 12.5MB/s][A
Downloading data:  43%|████████████████████████████████████████████████████████████▎                                                                                | 130M/304M [00:09<00:13

Downloading data:  37%|████████████████████████████████████████████████████▍                                                                                        | 113M/304M [00:08<00:12, 15.8MB/s][A
Downloading data:  40%|████████████████████████████████████████████████████████▎                                                                                    | 122M/304M [00:08<00:11, 16.0MB/s][A
Downloading data:  43%|████████████████████████████████████████████████████████████▏                                                                                | 130M/304M [00:09<00:10, 16.0MB/s][A
Downloading data:  45%|████████████████████████████████████████████████████████████████                                                                             | 138M/304M [00:09<00:10, 15.4MB/s][A
Downloading data:  48%|████████████████████████████████████████████████████████████████████                                                                         | 147M/304M [00:10<00:09

Downloading data:  43%|████████████████████████████████████████████████████████████▏                                                                                | 130M/305M [00:08<00:10, 16.7MB/s][A
Downloading data:  45%|████████████████████████████████████████████████████████████████                                                                             | 138M/305M [00:09<00:09, 18.0MB/s][A
Downloading data:  48%|███████████████████████████████████████████████████████████████████▉                                                                         | 147M/305M [00:09<00:08, 18.0MB/s][A
Downloading data:  51%|███████████████████████████████████████████████████████████████████████▊                                                                     | 155M/305M [00:10<00:11, 13.4MB/s][A
Downloading data:  54%|███████████████████████████████████████████████████████████████████████████▋                                                                 | 164M/305M [00:11<00:10

Downloading data:  48%|███████████████████████████████████████████████████████████████████▊                                                                         | 147M/305M [00:10<00:10, 15.2MB/s][A
Downloading data:  51%|███████████████████████████████████████████████████████████████████████▋                                                                     | 155M/305M [00:11<00:11, 13.2MB/s][A
Downloading data:  54%|███████████████████████████████████████████████████████████████████████████▌                                                                 | 164M/305M [00:11<00:10, 14.0MB/s][A
Downloading data:  56%|███████████████████████████████████████████████████████████████████████████████▍                                                             | 172M/305M [00:12<00:08, 15.8MB/s][A
Downloading data:  59%|███████████████████████████████████████████████████████████████████████████████████▎                                                         | 180M/305M [00:12<00:07

Downloading data:  54%|███████████████████████████████████████████████████████████████████████████▌                                                                 | 164M/305M [00:11<00:08, 15.9MB/s][A
Downloading data:  56%|███████████████████████████████████████████████████████████████████████████████▍                                                             | 172M/305M [00:12<00:10, 13.0MB/s][A
Downloading data:  59%|███████████████████████████████████████████████████████████████████████████████████▎                                                         | 180M/305M [00:12<00:08, 14.3MB/s][A
Downloading data:  62%|███████████████████████████████████████████████████████████████████████████████████████▏                                                     | 189M/305M [00:13<00:07, 16.2MB/s][A
Downloading data:  65%|███████████████████████████████████████████████████████████████████████████████████████████                                                  | 197M/305M [00:13<00:06

Downloading data:  59%|███████████████████████████████████████████████████████████████████████████████████                                                          | 180M/306M [00:13<00:09, 13.9MB/s][A
Downloading data:  62%|██████████████████████████████████████████████████████████████████████████████████████▉                                                      | 189M/306M [00:14<00:08, 14.0MB/s][A
Downloading data:  64%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 197M/306M [00:14<00:07, 14.5MB/s][A
Downloading data:  67%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 206M/306M [00:15<00:07, 12.7MB/s][A
Downloading data:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 214M/306M [00:15<00:06

Downloading data:  65%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 197M/305M [00:15<00:06, 15.8MB/s][A
Downloading data:  67%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 206M/305M [00:16<00:06, 15.1MB/s][A
Downloading data:  70%|███████████████████████████████████████████████████████████████████████████████████████████████████                                          | 214M/305M [00:16<00:06, 14.3MB/s][A
Downloading data:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 222M/305M [00:17<00:05, 13.9MB/s][A
Downloading data:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 231M/305M [00:18<00:05

Downloading data:  70%|███████████████████████████████████████████████████████████████████████████████████████████████████                                          | 214M/305M [00:16<00:06, 14.0MB/s][A
Downloading data:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 222M/305M [00:16<00:05, 14.9MB/s][A
Downloading data:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 231M/305M [00:17<00:05, 14.4MB/s][A
Downloading data:  78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 239M/305M [00:17<00:04, 16.0MB/s][A
Downloading data:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 247M/305M [00:18<00:03

Downloading data:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 231M/305M [00:16<00:04, 16.0MB/s][A
Downloading data:  78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 239M/305M [00:17<00:03, 17.2MB/s][A
Downloading data:  81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 247M/305M [00:17<00:03, 18.5MB/s][A
Downloading data:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 256M/305M [00:17<00:02, 19.7MB/s][A
Downloading data:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 264M/305M [00:18<00:02

Downloading data:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 247M/306M [00:17<00:04, 14.4MB/s][A
Downloading data:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 256M/306M [00:17<00:03, 14.1MB/s][A
Downloading data:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 264M/306M [00:18<00:02, 16.1MB/s][A
Downloading data:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 273M/306M [00:18<00:01, 17.1MB/s][A
Downloading data:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 281M/306M [00:18<00:01

Downloading data:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 264M/305M [00:18<00:02, 14.6MB/s][A
Downloading data:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 273M/305M [00:19<00:02, 14.8MB/s][A
Downloading data:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 281M/305M [00:19<00:01, 16.5MB/s][A
Downloading data:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 289M/305M [00:20<00:00, 16.2MB/s][A
Downloading data:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 298M/305M [00:20<00:00

In [6]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,inputs,targets,task_source,task_name,template_type
0,"Aleksandr Chumakov, club, FC Torpedo Moscow; F...",Aleksandr Chumakov has played for FC Torpedo M...,Flan2021,gem/web_nlg_en:1.1.0,zs_noopt
1,"I kill Kurtz, they kill me, the brass running ...","Já zabiju Kurtze, oni zabijou mě, hlavouni, kt...",Flan2021,wmt16_translate/cs-en:1.0.0,zs_noopt
2,i'm 10x cooler than all of you! \nWhat is the ...,positive,Flan2021,sentiment140:1.0.0,zs_noopt
3,"test: Tämä tarkoittaa, että avustuksia on jaet...","Returning to the Palestinian elections, I do n...",Flan2021,wmt16_translate/fi-en:1.0.0,fs_noopt
4,"How do you say ""In this regard, we would like ...","En este sentido, quisiéramos brindar un agrade...",Flan2021,para_crawl_enes,zs_opt


In [24]:
len(df['task_name'].unique())

70

In [26]:
len(df['template_type'].unique())

4

In [23]:
df['task_name'].value_counts()

task_name
glue/mnli:2.0.0                216560
wmt14_translate/fr-en:1.0.0    109197
trivia_qa/rc:1.1.0             109120
paws_wiki:1.1.0                109019
wmt16_translate/fi-en:1.0.0    108923
                                ...  
unified_qa_science_inst          2135
glue/wnli:2.0.0                  2111
super_glue/wsc.fixed:1.0.2       1832
super_glue/copa:1.0.2            1302
super_glue/cb:1.0.2               745
Name: count, Length: 70, dtype: int64

In [25]:
task_name_counts = df['task_name'].value_counts().to_frame()
task_name_counts.head()

Unnamed: 0_level_0,count
task_name,Unnamed: 1_level_1
glue/mnli:2.0.0,216560
wmt14_translate/fr-en:1.0.0,109197
trivia_qa/rc:1.1.0,109120
paws_wiki:1.1.0,109019
wmt16_translate/fi-en:1.0.0,108923


In [27]:
df["input"] = ""
df.rename(columns={"inputs":"instruction", "targets":"output"}, inplace=True)
df.head()

Unnamed: 0,instruction,output,task_source,task_name,template_type,input
0,"Aleksandr Chumakov, club, FC Torpedo Moscow; F...",Aleksandr Chumakov has played for FC Torpedo M...,Flan2021,gem/web_nlg_en:1.1.0,zs_noopt,
1,"I kill Kurtz, they kill me, the brass running ...","Já zabiju Kurtze, oni zabijou mě, hlavouni, kt...",Flan2021,wmt16_translate/cs-en:1.0.0,zs_noopt,
2,i'm 10x cooler than all of you! \nWhat is the ...,positive,Flan2021,sentiment140:1.0.0,zs_noopt,
3,"test: Tämä tarkoittaa, että avustuksia on jaet...","Returning to the Palestinian elections, I do n...",Flan2021,wmt16_translate/fi-en:1.0.0,fs_noopt,
4,"How do you say ""In this regard, we would like ...","En este sentido, quisiéramos brindar un agrade...",Flan2021,para_crawl_enes,zs_opt,


In [28]:
unique_task_names = list(task_name_counts.index)

In [38]:
from tqdm.notebook import tqdm

In [49]:
selected_dfs = []
for i, task_name in enumerate(unique_task_names):
    df_to_select = df[df['task_name']==task_name]
    selected_dfs.append(get_selected_data(df_to_select, args, text_column="text", 
                                          label_column="task_name", max_clusters=2000)) # label_column is not used
    print(i)
merged_df = pd.concat(selected_dfs, ignore_index=True)
merged_df.to_csv("../data/flan2021.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


23


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


26


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


28


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


29


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


31


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


33


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


36


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


37


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


39


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


41


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


42


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


43


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


44


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


45


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


46


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


47


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


49


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


52


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


54


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


56


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


57


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


58


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


59


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


60


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


62


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


63


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


65


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


66


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


67


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


68


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]


69
