In [136]:
from __future__ import division
import pandas as pd
from itertools import combinations_with_replacement
import string


# Read-in files

In [137]:
a = pd.read_csv("../data/adjs_conc-abs.csv")
a2 = pd.read_csv("../data/adjs_phys-soc.csv")
n = pd.read_csv("../data/nouns_conc-abs.csv")
n2 = pd.read_csv("../data/nouns_ani.csv")
s = pd.read_csv("../data/syntax.csv")
v = pd.read_csv("../data/verbs_conc-abs.csv")
v2 = pd.read_csv("../data/verbs_phys-psych.csv")

# Make unordered pairs with self pairs

If you have 40 unique words, and you want all unordered pairs without self-pairs, the math is:

$$
\binom{41}{2} = \frac{40 \times 41}{2} = 820
$$


Explanation:
$$
\binom{n + 1}{2} = \frac{n(n-1)}{2}
$$

is the number of combinations of 
𝑛
n elements taken 2 at a time.

This avoids:
- self-pairs like (apple, apple),
- duplicate unordered pairs like (apple, banana) and (banana, apple).

# unordered pairs

In [144]:
import pandas as pd
import itertools
import string

def add_feature_match_specific(df_list, save_path_prefix):
    results = {}  # To store split DataFrames

    for df in df_list:
        base_name = getattr(df, 'name', 'df')
        unique_values = df['FeatureMatch'].unique()

        for val in unique_values:
            split_df = df[df['FeatureMatch'] == val].copy()

            # Save to CSV
            save_path = f"{save_path_prefix}{base_name}_{val}.csv"
            split_df.to_csv(save_path, index=False)
            print(f"Saved: {save_path}")

            # Store for later use
            results[f"{base_name}_{val}"] = split_df

    return results


def process_and_save_pairwise_dfs(df_list, save_path_prefix="../exp_files/"):
    all_split_results = {}  # To collect all final split DataFrames

    for df in df_list:
        if not hasattr(df, 'name'):
            raise ValueError("Each DataFrame in df_list must have a `.name` attribute.")
        print(f"Processing {df.name}...")

        # Use combinations to avoid ordered pairs (no (A,B) and (B,A), only one of them)
        pairs = list(itertools.combinations(df.itertuples(index=False), 2))
        print(f"{df.name}: {len(pairs)} unique unordered pairs")

        df_pairs = pd.DataFrame([
            {
                'Word1': p1.Word,
                'Word2': p2.Word,
                'FeatureCombo1': p1.FeatureCombo,
                'FeatureCombo2': p2.FeatureCombo
            }
            for p1, p2 in pairs
        ])
        df_pairs.name = df.name
        print(f"{df.name}: {len(df_pairs)} rows in pair DataFrame")

        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: classify_feature_match(row['FeatureCombo1'], row['FeatureCombo2']),
            axis=1
        )

        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: 'SelfPair' if row['Word1'] == row['Word2'] else row['FeatureMatch'],
            axis=1
        )

        # Collect results from each processed df
        split_results = add_feature_match_specific([df_pairs], save_path_prefix)
        all_split_results.update(split_results)

    return all_split_results


In [148]:
# Set .name attributes
for df, name in zip([v, v2, n, n2, s, a, a2], ['v', 'v2', 'n', 'n2', 's', 'a', 'a2']):
    df.name = name

# Process and capture results
results = process_and_save_pairwise_dfs([v, v2, n, n2, s, a, a2])

# Now `results` is a dictionary where keys are like 'v_SomeFeatureMatch'
# and values are the corresponding split DataFrames

# Example usage:
# results['v_MatchType1'].head()


Processing v...
v: 780 unique unordered pairs
v: 780 rows in pair DataFrame
Saved: ../exp_files/v_MaxMatch.csv
Saved: ../exp_files/v_ConceptualMatchingOnly.csv
Saved: ../exp_files/v_MaxMismatch.csv
Saved: ../exp_files/v_ValenceMatchingOnly.csv
Processing v2...
v2: 780 unique unordered pairs
v2: 780 rows in pair DataFrame
Saved: ../exp_files/v2_MaxMatch.csv
Saved: ../exp_files/v2_ValenceMatchingOnly.csv
Saved: ../exp_files/v2_ConceptualMatchingOnly.csv
Saved: ../exp_files/v2_MaxMismatch.csv
Processing n...
n: 780 unique unordered pairs
n: 780 rows in pair DataFrame
Saved: ../exp_files/n_MaxMatch.csv
Saved: ../exp_files/n_MaxMismatch.csv
Saved: ../exp_files/n_ConceptualMatchingOnly.csv
Saved: ../exp_files/n_ValenceMatchingOnly.csv
Processing n2...
n2: 780 unique unordered pairs
n2: 780 rows in pair DataFrame
Saved: ../exp_files/n2_MaxMatch.csv
Saved: ../exp_files/n2_ConceptualMatchingOnly.csv
Saved: ../exp_files/n2_ValenceMatchingOnly.csv
Saved: ../exp_files/n2_MaxMismatch.csv
Processing

KeyError: 'v_MatchType1'

In [150]:
results.head()

AttributeError: 'dict' object has no attribute 'head'

In [138]:

# You want to cycle only within the first n letters (A to J), 
# and then loop back around to A (not K, L, etc.) 
# but track how many loops have occurred to produce AA, BB, etc.

# You want to cycle through A–J repeatedly for each Word1 group — just with a shifting starting letter per group.
# The label length should stay 1 letter (A–J) 
# until the 10th unique value, 
# at which point we start doubling (AA–JJ), and later tripling, and so on.

def generate_group_labels_strict(n, group_index, base_letters):
    """
    Generate n group labels from a base_letters subset (e.g., A-J), shifting the start
    per group_index, and repeating letters (A, AA, AAA...) only after 10 groups.
    """
    base_len = len(base_letters)
    labels = []

    for i in range(n):
        pos = (group_index + i) % base_len
        repeat = (group_index) // base_len + 1  # only increase length every 10 groups
        label = base_letters[pos] * repeat
        labels.append(label)

    return labels




def add_feature_match_specific(df_list, save_path_prefix):
    base_letters = list(string.ascii_uppercase[:10])  # A–J

    for df in df_list:
        base_name = getattr(df, 'name', 'df')
        unique_values = df['FeatureMatch'].unique()

        for val in unique_values:
            split_df = df[df['FeatureMatch'] == val].copy()

            group_labels = []
            for group_index, word1_val in enumerate(split_df['Word1'].unique()):
                group_size = (split_df['Word1'] == word1_val).sum()
                labels = generate_group_labels_strict(group_size, group_index, base_letters)
                group_labels.extend(labels)

            split_df['Group'] = group_labels

            save_path = f"{save_path_prefix}{base_name}_{val}.csv"
            split_df.to_csv(save_path, index=False)
            print(f"Saved: {save_path}")



def process_and_save_pairwise_dfs(df_list, save_path_prefix="../exp_files/"):
    for df in df_list:
        if not hasattr(df, 'name'):
            raise ValueError("Each DataFrame in df_list must have a `.name` attribute.")
        print(f"Processing {df.name}...")

        # Use combinations to avoid ordered pairs (no (A,B) and (B,A), only one of them)
        pairs = list(itertools.combinations(df.itertuples(index=False), 2))
        print(f"{df.name}: {len(pairs)} unique unordered pairs")

        df_pairs = pd.DataFrame([
            {
                'Word1': p1.Word,
                'Word2': p2.Word,
                'FeatureCombo1': p1.FeatureCombo,
                'FeatureCombo2': p2.FeatureCombo
            }
            for p1, p2 in pairs
        ])
        df_pairs.name = df.name
        print(f"{df.name}: {len(df_pairs)} rows in pair DataFrame")

        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: classify_feature_match(row['FeatureCombo1'], row['FeatureCombo2']),
            axis=1
        )

        # This line is now optional, since (A, A) won't be included unless you use combinations_with_replacement
        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: 'SelfPair' if row['Word1'] == row['Word2'] else row['FeatureMatch'],
            axis=1
        )

        # Call your function with a list containing one df_pairs
        add_feature_match_specific([df_pairs], save_path_prefix)



In [139]:
for df, name in zip([v, v2, n, n2, s, a, a2], ['v', 'v2', 'n', 'n2', 's', 'a', 'a2']):
    df.name = name

# Then run the function
process_and_save_pairwise_dfs([v, v2, n, n2, s, a, a2])

Processing v...
v: 780 unique unordered pairs
v: 780 rows in pair DataFrame
Saved: ../exp_files/v_MaxMatch.csv
Saved: ../exp_files/v_ConceptualMatchingOnly.csv
Saved: ../exp_files/v_MaxMismatch.csv
Saved: ../exp_files/v_ValenceMatchingOnly.csv
Processing v2...
v2: 780 unique unordered pairs
v2: 780 rows in pair DataFrame
Saved: ../exp_files/v2_MaxMatch.csv
Saved: ../exp_files/v2_ValenceMatchingOnly.csv
Saved: ../exp_files/v2_ConceptualMatchingOnly.csv
Saved: ../exp_files/v2_MaxMismatch.csv
Processing n...
n: 780 unique unordered pairs
n: 780 rows in pair DataFrame
Saved: ../exp_files/n_MaxMatch.csv
Saved: ../exp_files/n_MaxMismatch.csv
Saved: ../exp_files/n_ConceptualMatchingOnly.csv
Saved: ../exp_files/n_ValenceMatchingOnly.csv
Processing n2...
n2: 780 unique unordered pairs
n2: 780 rows in pair DataFrame
Saved: ../exp_files/n2_MaxMatch.csv
Saved: ../exp_files/n2_ConceptualMatchingOnly.csv
Saved: ../exp_files/n2_ValenceMatchingOnly.csv
Saved: ../exp_files/n2_MaxMismatch.csv
Processing

# Ordered Pairs

In [None]:
df_list = [v,v2,n,n2,s,a,a2]

In [134]:

# You want to cycle only within the first n letters (A to J), 
# and then loop back around to A (not K, L, etc.) 
# but track how many loops have occurred to produce AA, BB, etc.

# You want to cycle through A–J repeatedly for each Word1 group — just with a shifting starting letter per group.
# The label length should stay 1 letter (A–J) 
# until the 10th unique value, 
# at which point we start doubling (AA–JJ), and later tripling, and so on.

def generate_group_labels_strict(n, group_index, base_letters):
    """
    Generate n group labels from a base_letters subset (e.g., A-J), shifting the start
    per group_index, and repeating letters (A, AA, AAA...) only after 10 groups.
    """
    base_len = len(base_letters)
    labels = []

    for i in range(n):
        pos = (group_index + i) % base_len
        repeat = (group_index) // base_len + 1  # only increase length every 10 groups
        label = base_letters[pos] * repeat
        labels.append(label)

    return labels




def add_feature_match_specific(df_list, save_path_prefix):
    base_letters = list(string.ascii_uppercase[:10])  # A–J

    for df in df_list:
        base_name = getattr(df, 'name', 'df')
        unique_values = df['FeatureMatch'].unique()

        for val in unique_values:
            split_df = df[df['FeatureMatch'] == val].copy()

            group_labels = []
            for group_index, word1_val in enumerate(split_df['Word1'].unique()):
                group_size = (split_df['Word1'] == word1_val).sum()
                labels = generate_group_labels_strict(group_size, group_index, base_letters)
                group_labels.extend(labels)

            split_df['Group'] = group_labels

            save_path = f"{save_path_prefix}{base_name}_{val}.csv"
            split_df.to_csv(save_path, index=False)
            print(f"Saved: {save_path}")



def process_and_save_pairwise_dfs(df_list, save_path_prefix="../exp_files/"):
    for df in df_list:
        if not hasattr(df, 'name'):
            raise ValueError("Each DataFrame in df_list must have a `.name` attribute.")
        print(f"Processing {df.name}...")

        pairs = list(itertools.product(df.itertuples(index=False), repeat=2))
        print(f"{df.name}: {len(pairs)} pairs")

        df_pairs = pd.DataFrame([
            {
                'Word1': p1.Word,
                'Word2': p2.Word,
                'FeatureCombo1': p1.FeatureCombo,
                'FeatureCombo2': p2.FeatureCombo
            }
            for p1, p2 in pairs
        ])
        df_pairs.name = df.name
        print(f"{df.name}: {len(df_pairs)} rows in pair DataFrame")

        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: classify_feature_match(row['FeatureCombo1'], row['FeatureCombo2']),
            axis=1
        )

        df_pairs['FeatureMatch'] = df_pairs.apply(
            lambda row: 'SelfPair' if row['Word1'] == row['Word2'] else row['FeatureMatch'],
            axis=1
        )

        # Call your function with a list containing one df_pairs
        add_feature_match_specific([df_pairs], save_path_prefix)


In [135]:
for df, name in zip([v, v2, n, n2, s, a, a2], ['v', 'v2', 'n', 'n2', 's', 'a', 'a2']):
    df.name = name

# Then run the function
process_and_save_pairwise_dfs([v, v2, n, n2, s, a, a2])

Processing v...
v: 1600 pairs
v: 1600 rows in pair DataFrame
Saved: ../exp_files/v_SelfPair.csv
Saved: ../exp_files/v_MaxMatch.csv
Saved: ../exp_files/v_ConceptualMatchingOnly.csv
Saved: ../exp_files/v_MaxMismatch.csv
Saved: ../exp_files/v_ValenceMatchingOnly.csv
Processing v2...
v2: 1600 pairs
v2: 1600 rows in pair DataFrame
Saved: ../exp_files/v2_SelfPair.csv
Saved: ../exp_files/v2_MaxMatch.csv
Saved: ../exp_files/v2_ValenceMatchingOnly.csv
Saved: ../exp_files/v2_ConceptualMatchingOnly.csv
Saved: ../exp_files/v2_MaxMismatch.csv
Processing n...
n: 1600 pairs
n: 1600 rows in pair DataFrame
Saved: ../exp_files/n_SelfPair.csv
Saved: ../exp_files/n_MaxMatch.csv
Saved: ../exp_files/n_MaxMismatch.csv
Saved: ../exp_files/n_ConceptualMatchingOnly.csv
Saved: ../exp_files/n_ValenceMatchingOnly.csv
Processing n2...
n2: 1600 pairs
n2: 1600 rows in pair DataFrame
Saved: ../exp_files/n2_SelfPair.csv
Saved: ../exp_files/n2_MaxMatch.csv
Saved: ../exp_files/n2_ConceptualMatchingOnly.csv
Saved: ../exp_