In [None]:
import pandas as pd
# Load the cleaned LLM ratings and force `iteration` to a numeric dtype;
# values that cannot be parsed become NaN instead of raising.
df = pd.read_csv('importance-cleaned.csv')
df['iteration'] = pd.to_numeric(df['iteration'], errors='coerce')

# Count the distinct non-null iteration values recorded for each base model.
iteration_counts = df.groupby('base_model')['iteration'].nunique(dropna=True)
iters_per_base = (
    iteration_counts
    .reset_index(name='unique_iterations')
    .sort_values('base_model')
)
print(iters_per_base)

# OPTIONAL: for each base model, list which of the expected iterations 0..49
# never occur (every list is empty when nothing is missing).
all_iters = set(range(50))


def _absent_iterations(values):
    """Return the sorted members of `all_iters` that do not occur in `values`."""
    return sorted(all_iters.difference(values.astype(int).unique()))


# Rows whose iteration could not be parsed are excluded before the int cast.
rows_with_iteration = df.dropna(subset=['iteration'])
missing_by_base = (
    rows_with_iteration.groupby('base_model')['iteration']
    .apply(_absent_iterations)
    .reset_index(name='missing_iterations')
    .sort_values('base_model')
)
print(missing_by_base)


   base_model  unique_iterations
0  gemma3-12b                 50
1   llama-pro                 50
2     mistral                 50
3        phi4                 50
   base_model missing_iterations
0  gemma3-12b                 []
1   llama-pro                 []
2     mistral                 []
3        phi4                 []


In [6]:
# Inspect the human ratings: load the survey workbook.
df = pd.read_excel("importance-human.xlsx")

# Keep only the rows whose `condition` is exactly 'human'.
is_human = df['condition'] == 'human'
df_new = df.loc[is_human]

# Write the filtered rows to CSV, leaving the DataFrame index out of the file.
df_new.to_csv("importance-humanOnly.csv", index=False)

# Summarise columns, non-null counts and dtypes of the filtered frame.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   response_id     162 non-null    object
 1   respondent_id   162 non-null    object
 2   condition       162 non-null    object
 3   variant_id      162 non-null    object
 4   technology      162 non-null    object
 5   rating_numeric  162 non-null    int64 
 6   rating_text     162 non-null    object
 7   justification   0 non-null      object
dtypes: int64(1), object(7)
memory usage: 11.4+ KB


In [None]:
import pandas as pd

# Merge the LLM ratings and the human ratings into one long CSV with a shared
# column layout, tagging each row with the source it came from.
LLM_IN  = "importance-cleaned.csv"
HUM_IN  = "importance-humanOnly.csv"
OUT_FILE = "merged_importance_zeroshot.csv"

# load
llm = pd.read_csv(LLM_IN)
hum = pd.read_csv(HUM_IN)

# set source
llm['source'] = 'llm'
hum['source'] = 'human'

# add condition (this overwrites any `condition` values already in either frame)
llm['condition'] = 'ZEROSHOT'
hum['condition'] = 'ZEROSHOT'

# keep exactly these shared columns (in this order)
cols = [
    'row_id','source', 'condition', 'base_model','variant_id','model', 'dc_solution',
    'rating','label','iteration','timestamp'
]

# Some columns may be missing from an input (for the human file, e.g.
# base_model/variant_id/model/iteration/timestamp) → create them as all-NA so
# both frames can be selected with the same column list.
# The fill is reported instead of done silently, so a schema mismatch is
# visible rather than turning into blank columns in the output.
# NOTE(review): the saved info() output of the human-only export lists
# response_id / technology / rating_numeric / rating_text, not
# row_id / dc_solution / rating / label — confirm the human file is renamed to
# the names in `cols` before this cell runs.
for frame_name, frame in (('llm', llm), ('human', hum)):
    absent = [c for c in cols if c not in frame.columns]
    if absent:
        print(f"Note: {frame_name} input has no columns {absent}; filling with NA")
    for c in absent:
        frame[c] = pd.NA

# Human rows first, then LLM rows, with a fresh 0..n-1 index.
merged = pd.concat([hum[cols], llm[cols]], ignore_index=True)


merged.to_csv(OUT_FILE, index=False)
print("Saved:", OUT_FILE, "rows:", len(merged))
print("Merged DataFrame info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
merged.info()

In [1]:
import pandas as pd
# Read the merged CSV back from disk and summarise its columns,
# non-null counts and dtypes.
merged_csv_path = "merged_importance_zeroshot.csv"
df = pd.read_csv(merged_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13362 entries, 0 to 13361
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   row_id       13362 non-null  object 
 1   source       13362 non-null  object 
 2   condition    13362 non-null  object 
 3   base_model   13200 non-null  object 
 4   variant_id   13362 non-null  object 
 5   model        13362 non-null  object 
 6   dc_solution  13362 non-null  object 
 7   rating       13362 non-null  int64  
 8   label        13362 non-null  object 
 9   iteration    13200 non-null  float64
 10  timestamp    13200 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.1+ MB
