In [1]:
import pandas as pd
import pyterrier as pt
if not pt.started():
    pt.init()
import re
import os

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Load the author table, drop the stale saved-index column, and expose the
# fresh positional row index as a string `docno` column.
# NOTE(review): `clean_bio` is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be executed before this one.
author_df = (
    pd.read_csv('../all_author_df.csv')
    .drop(columns='Unnamed: 0')
    .reset_index()
    .rename(columns={'index': 'docno'})
)
author_df['docno'] = author_df['docno'].astype(str)
# Normalised biography text; `clean_bio` yields None when the bio is not a string.
author_df['bio_cleaned'] = author_df['author_bio'].apply(clean_bio)

In [5]:
def clean_bio(bio):
    """Normalise a biography string to space-separated alphanumeric tokens.

    Every run of characters outside ``[a-zA-Z0-9]`` (punctuation, emoji,
    whitespace, newlines, ...) is replaced by a single space. Leading and
    trailing spaces that result from this replacement are kept.

    Args:
        bio: The raw biography text. Non-string values (e.g. ``None`` or the
            float NaN that pandas uses for missing cells) are accepted.

    Returns:
        The cleaned string, or ``None`` when ``bio`` is not a ``str``.
    """
    # Explicit type check instead of a bare ``except``: missing values still
    # map to None, but unrelated errors are no longer silently swallowed.
    if not isinstance(bio, str):
        return None
    # One pass is enough: replacing each run of non-alphanumerics with a single
    # space gives the same result as replacing every such character with a
    # space and then collapsing repeated whitespace.
    return re.sub(r'[^a-zA-Z0-9]+', ' ', bio)

In [7]:
# Evaluation queries. Each query text gets a sequential id "q0", "q1", ...
# in the order listed here.
query_texts = [
    "healthcare professional",
    "baseball analyst",
    "talk show host",
    "mechanical engineer",
    "neurosurgeon",
    "movie director",
    "accountant",
    "social media influencer",
    "lawyer",
    "coffee barista",
    "industrial engineer",
    "statistician",
    "data scientist",
    "cryptocurrency investor",
    "investment banker",
    "olympian",
    "software engineer",
    "NLP expert",
    "NFT investor",
    "marine biologist",
    "doctor",
    "stockbroker",
    "veterinarian",
    "dentist",
    "receptionist",
    "pharmacist",
    "teacher",
    "architect",
    "police officer",
    "technology CEO",
]
queries = pd.DataFrame(
    {
        "qid": [f"q{position}" for position in range(len(query_texts))],
        "query": query_texts,
    }
)

In [10]:
# Positional indexing: open the previously built index and print its
# collection statistics.
index_dir = '../bio_index_positions_final'
index_ref = f'{index_dir}/data.properties'
# Two handles are created from the same index reference.
index_pos = pt.IndexFactory.of(index_ref)
index = pt.IndexFactory.of(index_ref)
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

Number of documents: 10008
Number of terms: 19591
Number of postings: 71051
Number of fields: 0
Number of tokens: 73441
Field names: []
Positions:   true



In [11]:
# One retriever per weighting model, all reading from the same index.
bm25, dlh, pl2, tfidf = (
    pt.BatchRetrieve(index, wmodel=weighting_model)
    for weighting_model in ("BM25", "DLH", "PL2", "TF_IDF")
)

In [13]:
# Start from the author table and append one BM25 score column per query
# (score_bm25_q0 ... score_bm25_q29), joined on `docno`.
bm25_res = author_df.copy()

for query_pos in range(30):
    score_col = f'score_bm25_q{query_pos}'
    # Run the retriever on a one-row frame holding just this query.
    single_query = queries.iloc[[query_pos]]
    scored = bm25(single_query).rename(columns={'score': score_col})
    # Authors the retriever did not return get a score of 0.
    # NOTE(review): fillna(0) runs on the whole merged frame, so missing
    # values in the author columns are replaced by 0 as well.
    bm25_res = bm25_res.merge(
        scored[['docno', score_col]], how='left', on='docno'
    ).fillna(0)

In [14]:
# Start from the author table and append one PL2 score column per query
# (score_pl2_q0 ... score_pl2_q29), joined on `docno`.
pl2_res = author_df.copy()

for query_pos in range(30):
    score_col = f'score_pl2_q{query_pos}'
    # Run the retriever on a one-row frame holding just this query.
    single_query = queries.iloc[[query_pos]]
    scored = pl2(single_query).rename(columns={'score': score_col})
    # Authors the retriever did not return get a score of 0.
    # NOTE(review): fillna(0) runs on the whole merged frame, so missing
    # values in the author columns are replaced by 0 as well.
    pl2_res = pl2_res.merge(
        scored[['docno', score_col]], how='left', on='docno'
    ).fillna(0)

In [15]:
# Put the BM25 and PL2 score tables side by side, matching rows on the
# author-identifying columns. Columns present in both frames but not used as
# join keys (e.g. docno) come out with _x / _y suffixes.
author_key_cols = ['author_id', 'name', 'handle', 'author_bio']
final_res = pd.merge(bm25_res, pl2_res, how="left", on=author_key_cols)
# Spot-check five random authors that BM25 matched for query q0.
final_res.loc[final_res['score_bm25_q0'] > 0].sample(5)

Unnamed: 0,docno_x,handle,author_id,author_bio,name,followers_count_x,following_count_x,score_bm25_q0,score_bm25_q1,score_bm25_q2,...,score_pl2_q20,score_pl2_q21,score_pl2_q22,score_pl2_q23,score_pl2_q24,score_pl2_q25,score_pl2_q26,score_pl2_q27,score_pl2_q28,score_pl2_q29
1345,1345,VAMPKILLER1997,1216963589187092480,Shean🦇🩸💉\n\n 24 • no pronouns (name only) • BP...,Shean but evil ⚰️✟,314,181.0,5.799203,0.0,0.0,...,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
6119,6119,Mielfia,3194783815,"Alt/Personal account, have the most gamer care...",Fia,226,325.0,5.091974,0.0,0.0,...,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
6724,6724,T0MFRAUDY,1334174043327631360,Professional Twitter User,Bre 🦃🍁,1187,952.0,9.210584,0.0,0.0,...,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
2247,2247,lorraine_ganley,1083830694,#HelloMyNameIs Lorraine. I'm Director of Nursi...,lorraine ganley,238,178.0,5.307739,0.0,0.0,...,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
3566,3566,cameroncraig,28447324,A #jobboard with 1000s of #jobs across the #US...,Cameron Craig Group,23841,24006.0,7.119683,0.0,0.0,...,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Write one CSV per query containing the author-identifying columns plus that
# query's score columns, for manual annotation.
output_dir = '../results_for_annotation/'
# Create the target directory up front so to_csv does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)
author_cols = ['author_id', 'name', 'handle', 'author_bio']

for i in range(0, 30):
    n_query = f'q{i}'
    # Match the exact "_q<i>" suffix. A plain substring test would let "q1"
    # also select the q10-q19 columns and "q2" the q20-q29 columns.
    score_suffix = f'_{n_query}'
    cols_to_pull = author_cols + [col for col in final_res.columns if col.endswith(score_suffix)]
    final_res[cols_to_pull].to_csv(output_dir + n_query + '_results.csv')