In [1]:
import pandas as pd

def sample(df, group_cols, n):
    return df.groupby(group_cols, group_keys=False).apply(lambda x: x.sample(n=min(len(x), n)))

def get_value_counts(df, cols):
    for c in cols:
        print(df[c].value_counts())

def replace_col_name(df, col, old, new):
    df[col] = df[col].replace(old, new)
    return df

def fix_entities(df):
    df['entity'] = df['entity'].str.strip().str.lower()
    return df

## handle full_data

#### full_data/sentiment_terms.csv

In [6]:
sentiment_full = pd.read_csv('full_data/sentiment_terms.csv')
print(sentiment_full.columns)

Index(['sentiment', 'term'], dtype='object')


In [7]:
get_value_counts(sentiment_full, ['sentiment'])

sentiment
positive    80
negative    80
Name: count, dtype: int64


In [9]:
sentiment_sampled = sample(sentiment_full, ['sentiment'], 20)
get_value_counts(sentiment_sampled, ['sentiment'])

sentiment
negative    20
positive    20
Name: count, dtype: int64


In [10]:
sentiment_sampled.to_csv('data/sentiment_terms.csv', index=False)

#### full_data/culture_terms.csv

In [11]:
culture_full = pd.read_csv('full_data/culture_terms.csv')
print(culture_full.columns)

Index(['culture', 'entity', 'term'], dtype='object')


In [12]:
get_value_counts(culture_full, ['culture', 'entity'])

culture
Arab       200
Western    200
Name: count, dtype: int64
entity
Authors             40
Beverage            40
Clothing-Female     40
Clothing-Male       40
Food                40
Location            40
Names-Female        40
Names-Male          40
Religious Places    40
Sports Clubs        40
Name: count, dtype: int64


In [15]:
fix_entities(culture_full)
get_value_counts(culture_full, ['culture', 'entity'])

culture
Arab       200
Western    200
Name: count, dtype: int64
entity
authors             40
beverage            40
clothing-female     40
clothing-male       40
food                40
location            40
names-female        40
names-male          40
religious places    40
sports clubs        40
Name: count, dtype: int64


In [16]:
culture_sampled = sample(culture_full, ['culture', 'entity'], 5)
get_value_counts(culture_sampled, ['culture', 'entity'])

culture
Arab       50
Western    50
Name: count, dtype: int64
entity
authors             10
beverage            10
clothing-female     10
clothing-male       10
food                10
location            10
names-female        10
names-male          10
religious places    10
sports clubs        10
Name: count, dtype: int64


In [19]:
culture_sampled.to_csv('data/culture_terms.csv', index=False)
culture_full.to_csv('full_data/culture_terms.csv', index=False)

#### full_data/context_sentences.csv

In [21]:
context_full = pd.read_csv('full_data/context_sentences.csv')
print(context_full.columns)

Index(['Entity Type', 'Original Prompt', 'Processed Prompt', 'Sentiment',
       'culture', 'term'],
      dtype='object')


In [23]:
context_full = context_full.rename(columns={'Entity Type': 'entity'})
context_full = context_full.rename(columns={'Processed Prompt': 'sentence'})
context_full = context_full.drop(columns=['Original Prompt', 'term', 'Sentiment'])

print(context_full.columns)

Index(['entity', 'sentence', 'culture'], dtype='object')


In [25]:
fix_entities(context_full)
get_value_counts(context_full, ['culture', 'entity'])

culture
Arab       1000
Western    1000
Name: count, dtype: int64
entity
authors             200
beverage            200
clothing-female     200
clothing-male       200
food                200
locations           200
names-female        200
names-male          200
religious places    200
sports clubs        200
Name: count, dtype: int64


In [26]:
context_full.to_csv('full_data/context_sentences.csv', index=False)

#### full_data/neutral_sentences.xlsx

In [29]:
neutral_full = pd.read_excel('full_data/neutral_sentences.xlsx')
print(neutral_full.columns)

Index(['Entity Type', 'Prompt', 'Sentiment'], dtype='object')


In [30]:
neutral_full = neutral_full.rename(columns={'Entity Type': 'entity'})
neutral_full = neutral_full.rename(columns={'Prompt': 'sentence'})
neutral_full = neutral_full.drop(columns=['Sentiment'])

print(neutral_full.columns)

Index(['entity', 'sentence'], dtype='object')


In [32]:
fix_entities(neutral_full)
get_value_counts(neutral_full, ['entity'])

entity
authors             5
beverage            5
clothing-female     5
clothing-male       5
food                5
location            5
names-female        5
names-male          5
religious places    5
sports clubs        5
Name: count, dtype: int64


In [33]:
neutral_full.to_csv('full_data/neutral_sentences.csv', index=False)

In [36]:
neutral_sampled = sample(neutral_full, ['entity'], 3)
get_value_counts(neutral_sampled, ['entity'])

entity
authors             3
beverage            3
clothing-female     3
clothing-male       3
food                3
location            3
names-female        3
names-male          3
religious places    3
sports clubs        3
Name: count, dtype: int64


In [37]:
neutral_sampled.to_csv('data/neutral_sentences.csv', index=False)

## generate sampled context sentences

In [3]:
culture_sampled = pd.read_csv('data/culture_terms.csv')
get_value_counts(culture_sampled, ['culture', 'entity'])

culture
Arab       50
Western    50
Name: count, dtype: int64
entity
authors             10
beverage            10
clothing-female     10
clothing-male       10
food                10
location            10
names-female        10
names-male          10
religious places    10
sports clubs        10
Name: count, dtype: int64


In [7]:
neutral_sampled = pd.read_csv('data/neutral_sentences.csv')
print(neutral_sampled.columns)
get_value_counts(neutral_sampled, ['entity'])

Index(['entity', 'sentence'], dtype='object')
entity
authors             3
beverage            3
clothing-female     3
clothing-male       3
food                3
location            3
names-female        3
names-male          3
religious places    3
sports clubs        3
Name: count, dtype: int64


In [8]:
rows_list = []

for _, row in neutral_sampled.iterrows():
    entity_type = row['entity']
    prompt = row['sentence']
    
    filtered_terms = culture_sampled[culture_sampled['entity'] == entity_type]
    
    for _, term_row in filtered_terms.iterrows():
        culture = term_row['culture']
        entity = term_row['entity']
        term = term_row['term']
        
        sentence = prompt.replace('[MASK]', term)
        
        rows_list.append({'culture': culture, 'entity': entity, 'sentence': sentence})

context_sampled = pd.DataFrame(rows_list)

In [9]:
get_value_counts(context_sampled, ['culture', 'entity'])

culture
Arab       150
Western    150
Name: count, dtype: int64
entity
authors             30
beverage            30
clothing-female     30
clothing-male       30
food                30
location            30
names-female        30
names-male          30
religious places    30
sports clubs        30
Name: count, dtype: int64


In [10]:
context_sampled.shape

(300, 3)

In [11]:
context_sampled.to_csv('data/context_sentences.csv', index=False)