In [1]:
from datasets import load_dataset
import pandas as pd
from feature.selector import Selective, SelectionMethod
from textwiser import TextWiser, Embedding, Transformation


In [2]:
args={}
args['selection_percentage'] = 0.1

In [3]:
def get_selected_data(df, args, text_column="text", label_column="category", max_clusters=2000):
    num_rows = len(df)
    df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
    df_T = df.loc[:, text_column].to_frame().T
    
    labels = pd.get_dummies(df[label_column], dtype=int)
    labels.columns = ["label_" + str(i) for i in range(1, len(labels.columns)+1)] 
    labels_T = labels.T
    
    # TextWiser featurization method to create text embeddings
    textwiser = TextWiser(Embedding.TfIdf(), Transformation.NMF(n_components=20))

    # Text-based selection
    # The goal is to select a subset of articles 
    # that is most diverse in the text embedding space of articles
    # and covers the most labels in each topic
    selector = Selective(SelectionMethod.TextBased(num_features=min(round(num_rows*args["selection_percentage"]), max_clusters), 
                                                   featurization_method=textwiser,
                                                   optimization_method='kmeans'))

    # Feature reduction
    subset = selector.fit_transform(df_T, labels_T)
    
    return df.loc[subset.columns, ["instruction", "input", "output"]]

In [4]:
dataset = load_dataset("conceptofmind/t0_submix_original", split="train")

Downloading readme:   0%|          | 0.00/530 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/272M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/274M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/275M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/276M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/274M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/275M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/274M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1650308 [00:00<?, ? examples/s]

In [5]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,inputs,targets,task_source,task_name,template_type
0,Q: What type of molecules sit within a membran...,the abdomen\n--,P3,sciq_Direct_Question_Closed_Book_,fs_noopt
1,- headercolor is lightsteelblue - birth dat...,Ans: wandee kameaim,P3,wiki_bio_guess_person,fs_noopt
2,Read the following article and select the best...,Thomas Jefferson,P3,race_high_Select_the_best_answer_generate_span_,fs_noopt
3,"Information: - Chand Bardai ( , September 30...",rajasthan,P3,wiki_hop_original_choose_best_object_affirmati...,zs_opt
4,When Patch finally found his way out of the se...,D,P3,quail_no_prompt_id,zs_noopt


In [6]:
len(df['task_name'].unique())

193

In [7]:
len(df['template_type'].unique())

4

In [8]:
df['task_name'].value_counts()

task_name
wiki_qa_Decide_good_answer                              12635
social_i_qa_Show_choices_and_generate_index             12597
amazon_polarity_Is_this_review_negative                 12560
social_i_qa_Check_if_a_random_answer_is_valid_or_not    12554
race_middle_Select_the_best_answer_generate_span_       12551
                                                        ...  
wiki_qa_Topic_Prediction_Question_Only                    640
wiki_qa_Topic_Prediction_Answer_Only                      638
wiki_qa_Jeopardy_style                                    637
wiki_qa_Topic_Prediction_Question_and_Answer_Pair         616
wiki_qa_Direct_Answer_to_Question                         610
Name: count, Length: 193, dtype: int64

In [9]:
task_name_counts = df['task_name'].value_counts().to_frame()
task_name_counts.head()

Unnamed: 0_level_0,count
task_name,Unnamed: 1_level_1
wiki_qa_Decide_good_answer,12635
social_i_qa_Show_choices_and_generate_index,12597
amazon_polarity_Is_this_review_negative,12560
social_i_qa_Check_if_a_random_answer_is_valid_or_not,12554
race_middle_Select_the_best_answer_generate_span_,12551


In [10]:
df["input"] = ""
df.rename(columns={"inputs":"instruction", "targets":"output"}, inplace=True)
df.head()

Unnamed: 0,instruction,output,task_source,task_name,template_type,input
0,Q: What type of molecules sit within a membran...,the abdomen\n--,P3,sciq_Direct_Question_Closed_Book_,fs_noopt,
1,- headercolor is lightsteelblue - birth dat...,Ans: wandee kameaim,P3,wiki_bio_guess_person,fs_noopt,
2,Read the following article and select the best...,Thomas Jefferson,P3,race_high_Select_the_best_answer_generate_span_,fs_noopt,
3,"Information: - Chand Bardai ( , September 30...",rajasthan,P3,wiki_hop_original_choose_best_object_affirmati...,zs_opt,
4,When Patch finally found his way out of the se...,D,P3,quail_no_prompt_id,zs_noopt,


In [11]:
unique_task_names = list(task_name_counts.index)

In [12]:
from tqdm.notebook import tqdm as log_progress

In [13]:
selected_dfs = []
for task_name in log_progress(unique_task_names):
    df_to_select = df[df['task_name']==task_name]
    selected_dfs.append(get_selected_data(df_to_select, args, text_column="text", 
                                          label_column="task_name", max_clusters=2000)) # label_column is not used
merged_df = pd.concat(selected_dfs, ignore_index=True)
merged_df.to_csv("../data/t0.csv", index=False)

  0%|          | 0/193 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["input"] + " " + df["output"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df["instruction"] + " " + df["inp