In [1]:
import pandas as pd


df = pd.read_csv("data/study (3).csv")
bios = list(df['bio_string'])
bios[:3]


['"America will never be destroyed from the outside. If we falter & lose our freedoms, it will be because we destroyed ourselves." ~ Abraham Lincoln',
 '"In a world of thieves, the only final sin is stupidity."',
 '"One man gathers what another man spills" ~ St. Stephen ~ Aoxomoxoa, 1969\n\nhttps://t.co/i9JEm8qjl4\nhttps://t.co/XHgSaaWUAQ\nhttps://t.co/VZn53sxeFo\nhttps://t.co/1HLdXQ4izK']

In [2]:
from src.util import generate_personal_identifiers

pis = [generate_personal_identifiers(b) for b in bios]
for p in pis:
    assert len(p)>0
    
pis = [', '.join(pi) for pi in pis]
pis[:10]

['"america will never be destroyed from the outside, if we falter, lose our freedoms, it will be because we destroyed ourselves ", abraham lincoln',
 '"in a world of thieves, only final sin is stupidity "',
 '"one man gathers what another man spills", st, stephen, aoxomoxoa, 1969',
 '#1, news, weather site in southwest florida',
 '#argirlslead, is a movement started by the women serving in the arkansas house, promoting positive image, leadership for young girls across the state',
 '#b1, #cutthecheck, #nonfbaally',
 '#bitcoin, btc, #ethereum, eth, #solana, sol, #avalanche, avax',
 '#defundthebbc, #backboris, conservative, proud of my country',
 '#donotcomply',
 '#enhypen, my ray of sunshine, lifes too short to waste a second']

In [3]:
from src.inference.projection import load_sbert_based_model, get_sentence_projections
from sentence_transformers import SentenceTransformer

sbertft = SentenceTransformer('navidmadani/mpnet-twitter-freq100', device='cuda', cache_folder='../../hf-cache')
sbertft.eval()

sbert = SentenceTransformer('all-mpnet-base-v2', device='cuda', cache_folder='../../hf-cache')
sbert.eval()


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

## probing

In [3]:
from src.inference.projection import projection_measures
projection_measures[-1]

{'group': 'gender',
 'names': ['woman', 'man'],
 'sets': [['woman',
   'girl',
   'she',
   'mother',
   'daughter',
   'gal',
   'female',
   'her',
   'herself'],
  ['man', 'boy', 'he', 'father', 'son', 'guy', 'male', 'his', 'himself']],
 'paper': 'bolukbasi_words',
 'is_paired': True}

In [6]:
res = get_sentence_projections(['woman', 'girl', 'she', 'mother', 'wife',
                                'man', 'boy', 'he', 'father', 'husband'], model=sbertft, as_dict=True,
                               measures={'gender': [['mother of x', 'grand mother'], ['father of x', 'grand father']]})
[r['gender'] for r in res]

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[-0.26586914,
 -0.1255384,
 -0.16264248,
 -0.2253285,
 -0.23157203,
 0.07829218,
 0.0005588483,
 0.0013886724,
 0.19419289,
 0.21773551]

In [37]:
res = get_sentence_projections(['faithful father of five, its time we do something for this country'], model=sbertft, as_dict=True,
                               measures={'gender': [['mother of x', 'grand mother'], ['father of x', 'grand father']]})
[r['gender'] for r in res]

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[0.014541577]

In [9]:
res = get_sentence_projections(['woman', 'girl', 'she', 'mother', 'wife',
                                'man', 'boy', 'he', 'father', 'husband'], model=sbertft, as_dict=True)
[r['gender'] for r in res]

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[-0.15033233,
 0.1774702,
 0.06439619,
 -0.25659806,
 -0.26021755,
 0.300084,
 0.32820556,
 0.21511394,
 0.042706218,
 0.05499517]

In [11]:
res = get_sentence_projections(['dad, ph d'], model=sbertft, as_dict=True,measures={'gender': [['mother of x', 'grand mother'], ['father of x', 'grand father']]})
[r['gender'] for r in res]

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[0.15582924]

In [12]:
get_sentence_projections(['trump supporter', '18yo girl', '18yo boy'], model=sbertft, as_dict=True,measures={'gender': [['mother of x', 'grand mother'], ['father of x', 'grand father']]})

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[{'age': 0.24847963,
  'politics': 0.364953,
  'religion': 0.1880905,
  'education': -0.032502428,
  'employment status': 0.10967165,
  'gender': -0.041096084},
 {'age': -0.0054445732,
  'politics': -0.20800136,
  'religion': -0.32240912,
  'education': 0.41466582,
  'employment status': 0.13156968,
  'gender': -0.040934924},
 {'age': -0.0059722736,
  'politics': -0.15180863,
  'religion': -0.28775397,
  'education': 0.3873757,
  'employment status': 0.13521032,
  'gender': 0.047617458}]

## gender analysis

In [23]:
male_bios = [x for x in bios_str if 'father' in x or 'dad' in x or 'husband' in x]
female_bios = [x for x in bios_str if 'mother' in x or 'mom' in x or 'wife' in x]

In [31]:
sbert_results = get_sentence_projections(female_bios, model=sbert, as_dict=True, device='cuda:1')
sbertft_results = get_sentence_projections(female_bios, model=sbertft, as_dict=True, device='cuda:1')
sbert_df = pd.DataFrame(sbert_results)
sbertft_df = pd.DataFrame(sbertft_results)

Getting sentence embeddings


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Projecting embeddings
Getting sentence embeddings


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Projecting embeddings


In [32]:
len(male_bios), len(female_bios)

(5941, 7521)

In [34]:
sbert_gen, sbertft_gen = sbert_df['gender'], sbertft_df['gender']

vis_df = pd.DataFrame({'bio': female_bios, 'sbert': sbert_gen, 'sbertft': sbertft_gen})
vis_df


Unnamed: 0,bio,sbert,sbertft
0,"artist formerly known as darealyest, nvm, some...",-0.186775,0.021444
1,"mom, iconoclast, restless mind, politics, news...",-0.141225,-0.059513
2,"married bisexual gen-x, #fella, forger mom of ...",-0.185388,-0.160634
3,"mommy illustrator cosmic hyphy, jonathan major...",-0.082297,-0.060898
4,"mum, wife, daughter, sister, housekeeper, is i...",-0.154423,-0.171180
...,...,...,...
7516,"happy atheist, humanitarian, wife to retired 2...",-0.022137,-0.184630
7517,fuck off mom im 13 now,-0.117799,-0.052867
7518,"retired, grandmother, we need less government ...",-0.103996,-0.214518
7519,"entrepreneur, activist, christian, wife, mothe...",-0.044189,-0.103803


In [30]:
import plotly.express as px


fig = px.scatter(vis_df, x="sbert", y="sbertft", hover_name="bio")

fig.update_traces(textposition='top center')

fig.update_layout(
    height=800,
    title_text='Male bios'
)

fig.show()

In [35]:
import plotly.express as px


fig = px.scatter(vis_df, x="sbert", y="sbertft", hover_name="bio")

fig.update_traces(textposition='top center')

fig.update_layout(
    height=800,
    title_text='Female bios'
)

fig.show()

## full analysis

In [8]:
sbert_results = get_sentence_projections(bios, model=sbert, as_dict=True, device='cuda:1')
sbertft_results = get_sentence_projections(bios, model=sbertft, as_dict=True, device='cuda:1')
pd.DataFrame(sbert_results).to_csv('sbert_prj_raw.csv', header=True, index=False)
pd.DataFrame(sbertft_results).to_csv('sbertft_prj_raw.csv', header=True, index=False)

Getting sentence embeddings


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Projecting embeddings
Getting sentence embeddings


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Projecting embeddings


In [8]:
get_sentence_projections(['trump supporter', 'Aides to former guy Trump say he tears up pieces of paper after he’s done reading them. Melania continues to try and hand him their prenup. :ocean: #resist #BLM'], model=sbert, as_dict=True)

Getting sentence embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Projecting embeddings


[{'age': -0.034933075,
  'politics': 0.08879271,
  'religion': 0.0074828803,
  'education': 0.097981974,
  'employment status': 0.11890643},
 {'age': 0.06299456,
  'politics': 0.058707878,
  'religion': -0.02140664,
  'education': 0.122805804,
  'employment status': 0.081292465}]

In [None]:
pd.read_parquet('sampled_bios (1).parquet')

## generating pi projections for all bios

In [38]:
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window
import pickle

from glob import glob
from pyspark.sql import SparkSession
from tqdm import tqdm


spark = (
    SparkSession
    .builder
    .master("local[{}]".format(30))
    .config("spark.driver.memory", "{}g".format(90))
    .config("spark.driver.maxResultSize", f"{10}g")
    .getOrCreate()
)


In [39]:
df = spark.read.parquet('/user/smadani/navid/data/share_quality_pi_dataset2.parquet/share_quality_pi_dataset2.parquet/')

bios = [x.pi for x in df.select("pi").collect()]
uids = [x.uid for x in df.select("uid").collect()]

                                                                                

In [40]:
len(bios), bios[0], uids[0]

(143883,
 ['lrearning cicerone level 2',
  'beer judge',
  'taster w',
  'camra',
  'studying beer judge with bjcp',
  '#coys',
  '@beckenhamtown',
  'goal w',
  'out a deadline is a dream'],
 3145921)

In [41]:
bios_str = [", ".join(bio) for bio in bios]
bios_str[0]

'lrearning cicerone level 2, beer judge, taster w, camra, studying beer judge with bjcp, #coys, @beckenhamtown, goal w, out a deadline is a dream'

In [42]:
from src.inference.projection import load_sbert_based_model, get_sentence_projections
from sentence_transformers import SentenceTransformer

sbertft = SentenceTransformer('navidmadani/mpnet-twitter-freq100', device='cuda:1', cache_folder='../../hf-cache')
sbertft.eval()

sbert = SentenceTransformer('all-mpnet-base-v2', device='cuda:1', cache_folder='../../hf-cache')
sbert.eval()


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [43]:
from src.inference.projection import projection_measures
projection_measures

[{'group': 'age',
  'names': ['young', 'old'],
  'sets': [['young', 'new', 'youthful', 'young'],
   ['old', 'old', 'elderly', 'aged']],
  'paper': 'this_long',
  'is_paired': True},
 {'group': 'politics',
  'names': ['democrat', 'republican'],
  'sets': [['democratic party supporter', 'left-leaning', 'democrat'],
   ['republican party supporter', 'right-leaning', 'republican']],
  'paper': 'unk',
  'is_paired': True},
 {'group': 'religion',
  'names': ['atheist', 'religious'],
  'sets': [['atheistic', 'agnostic', 'non-believing', 'skeptical'],
   ['religious', 'faithful', 'christian', 'believe in lord']],
  'paper': 'unk',
  'is_paired': True},
 {'group': 'education',
  'names': ['educated', 'uneducated'],
  'sets': [['educated', 'higher education'], ['uneducated', 'unschooled']],
  'paper': 'unk',
  'is_paired': True},
 {'group': 'employment status',
  'names': ['employed', 'unemployed'],
  'sets': [['employed', 'hired', 'working', 'on the job'],
   ['unemployed', 'jobless', 'out of w

In [44]:
sbert_results = get_sentence_projections(bios_str, model=sbert, as_dict=True, device='cuda:1')
sbertft_results = get_sentence_projections(bios_str, model=sbertft, as_dict=True, device='cuda:1')

for uid, bio, d1, d2 in zip(uids, bios_str, sbert_results, sbertft_results):
    d1['bio'] = bio
    d1['uid'] = uid
    d2['bio'] = bio
    d2['uid'] = uid

pd.DataFrame(sbert_results).to_csv('sbert_gendermod.csv', header=True, index=False)
pd.DataFrame(sbertft_results).to_csv('sbertft_gendermod.csv', header=True, index=False)

Getting sentence embeddings


Batches:   0%|          | 0/1125 [00:00<?, ?it/s]

Projecting embeddings
Getting sentence embeddings


Batches:   0%|          | 0/1125 [00:00<?, ?it/s]

Projecting embeddings


In [8]:
!head sbert_olddims.csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
age,politics,religion,education,employment status,gender,bio,uid
-0.046513535,-0.0846096,0.034728557,-0.17119646,0.029989576,0.045206904,"lrearning cicerone level 2, beer judge, taster w, camra, studying beer judge with bjcp, #coys, @beckenhamtown, goal w, out a deadline is a dream",3145921
-0.0083937785,0.052987766,-0.06494491,-0.031109286,-0.019493954,-0.09101539,"rt news, current affairs, irish, international news",8973062
0.0013707591,-0.07861272,-0.13330252,-0.11803298,0.029220426,0.07293843,"trade corporate, private asset management ceo ethical philosophy of science dissident",10614242
-0.06515519,0.0012780693,0.009247474,0.0076627443,-0.020567544,-0.0123394355,"omni-american, staff writer at the a