# Merge speaker attributes into quotes

In [1]:
import pandas as pd


For each quote, find the corresponding row in the speaker attributes dataset and store that row's index alongside the quote. Later lookups of a speaker's attributes are then O(1), which makes the downstream data analysis practical.

In [2]:
# Load the Trump and Clinton quote tables from their pickled DataFrames.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df_Trump = pd.read_pickle("../data/Trump_with_dates.pkl")
df_Clinton = pd.read_pickle("../data/Clinton_with_dates.pkl")

# NOTE(review): if both lines below live in the same cell, only the last bare
# expression (df_Clinton) is rendered as the cell output.
df_Trump
df_Clinton

Unnamed: 0,quoteID,quotation,speaker,qids,numOccurrences,probas,urls,phase,date
12,2018-07-16-000103,[ Ensuring ] the orchestrating and timing of M...,Corey Lewandowski,[Q20740735],2,"[[Corey Lewandowski, 0.7179], [None, 0.2754], ...",[http://www.theweek.co.uk/95082/donald-trump-s...,E,"[2018-07-17 06:00:00, 2018-07-16 14:05:34]"
66,2018-05-09-001003,300-plus years of them cold shoulders... Obama...,Charlamagne Tha God,[Q16203002],1,"[[Charlamagne Tha God, 0.4806], [None, 0.2924]...",[https://www.portlandmercury.com/music/2018/05...,E,[2018-05-09 11:00:00]
366,2018-01-07-002036,All I can say is it's not a hoax. The Russians...,Lindsey Graham,[Q22212],1,"[[Lindsey Graham, 0.5251], [None, 0.2936], [Ch...",[http://postandcourier.com/politics/would-lind...,E,[2018-01-07 15:40:00]
1024,2018-01-16-011608,being too nonchalant about Mr. Trump's rants.,Floyd Abrams,[Q3365171],3,"[[Floyd Abrams, 0.7512], [None, 0.2421], [Hill...",[http://www.washingtontimes.com/news/2018/jan/...,E,"[2018-01-16 20:06:13, 2018-01-17 05:01:43]"
1091,2018-09-24-011280,Brett Kavanaugh is poised to join Neil Gorsuch...,Raul Labrador,[Q555393],1,"[[Raul Labrador, 0.6471], [None, 0.2436], [Pre...",[http://www.spokesman.com/stories/2018/sep/25/...,E,[2018-09-24 23:21:10]
...,...,...,...,...,...,...,...,...,...
5243971,2020-04-05-029136,To say that I'm infuriated with the recent act...,Dwight Ball,[Q5318112],1,"[[Dwight Ball, 0.6293], [None, 0.3336], [Justi...",[https://www.cbc.ca/news/politics/trudeau-will...,E,[2020-04-05 23:11:52]
5243994,2020-02-05-103219,Trump offends and disrespects the Venezuelan p...,Jorge Arreaza,[Q6623799],11,"[[Jorge Arreaza, 0.9164], [None, 0.0726], [Pre...",[https://www.rawstory.com/2020/02/imwithfred-t...,E,"[2020-02-05 19:09:04, 2020-02-05 19:25:17, 202..."
5243996,2020-03-13-071475,"Trump tried to mitigate the issue, saying it i...",Hassan Nasrallah,[Q181182],1,"[[Hassan Nasrallah, 0.922], [None, 0.0741], [P...",[http://israelnationalnews.com/News/News.aspx/...,E,[2020-03-13 22:15:06]
5243997,2020-03-15-037086,Trump's do-over approach -- he unlocked $50 bi...,Newt Gingrich,[Q182788],40,"[[Newt Gingrich, 0.5146], [None, 0.3958], [Don...",[http://uspolitics.einnews.com/article/5120893...,E,"[2020-03-15 00:00:00, 2020-03-15 00:00:00, 202..."


In [3]:
# Load the speaker attribute table; its "id" column is what the quotes' QIDs
# are matched against further down in this notebook.
speaker_attributes_updated = pd.read_parquet("../data/speaker_attributes_updated.parquet")

# Bare expression: renders the table as the cell output.
speaker_attributes_updated

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Great Britain, United States of America]",[male],1395141751,,W000178,"[politician, military officer, farmer, cartogr...",[independent politician],,Q23,George Washington,"[1792 United States presidential election, 178...",item,[Episcopal Church]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[United Kingdom],[male],1395737157,[White British],,"[playwright, screenwriter, novelist, children'...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Belgium],[male],1380367296,,,"[writer, lawyer, librarian, information scient...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[United States of America],[male],1395142029,,,"[politician, motivational speaker, autobiograp...",[Republican Party],,Q207,George W. Bush,"[2000 United States presidential election, 200...",item,"[United Methodist Church, Episcopal Church, Me..."
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Spain],[male],1391704596,,,[painter],,,Q297,Diego Velázquez,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9055976,[Barker Howard],,[United States of America],[male],1397399351,,,[politician],,,Q106406560,Barker B. Howard,,item,
9055977,[Charles Macomber],,[United States of America],[male],1397399471,,,[politician],,,Q106406571,Charles H. Macomber,,item,
9055978,,[+1848-04-01T00:00:00Z],,[female],1397399751,,,,,,Q106406588,Dina David,,item,
9055979,,[+1899-03-18T00:00:00Z],,[female],1397399799,,,,,,Q106406593,Irma Dexinger,,item,


Add the index of the matching speaker attributes row to each quote (stored in the `authorId` column).

In [4]:
def search_author_index(QID, authors):
    """Return the index label of the first ``authors`` row whose "id" equals ``QID``.

    Parameters
    ----------
    QID : the identifier to look for in the "id" column.
    authors : DataFrame with an "id" column.

    Returns
    -------
    The index label of the first matching row, or -1 when no row matches.
    """
    is_match = (authors["id"] == QID).to_numpy()
    matching_labels = authors.index[is_match]
    return matching_labels[0] if len(matching_labels) > 0 else -1

In [5]:
def assign_value(chunk, index, key, QID, authors):
    """Store the ``authors`` row label found for ``QID`` in ``chunk`` at (``index``, ``key``).

    The value written is whatever ``search_author_index`` returns, i.e. -1
    when ``QID`` has no matching row. ``chunk`` is modified in place.
    """
    author_row_label = search_author_index(QID, authors)
    chunk.at[index, key] = author_row_label

In [6]:
def add_author_id(chunk, authors):
    """Attach to each quote the ``authors`` row label of its first listed speaker.

    For every row of ``chunk`` the first element of its ``qids`` entry is
    looked up in ``authors["id"]``; the index label of the first matching
    ``authors`` row is written to the ``authorId`` column (-1 when there is
    no match, or when ``qids`` is empty or not subscriptable).

    Parameters
    ----------
    chunk : DataFrame with a ``qids`` column holding sequences of identifiers.
        The ``authorId`` column is added to (or overwritten in) this frame
        in place, as in the previous implementation.
    authors : DataFrame with an ``id`` column.

    Returns
    -------
    The rows of ``chunk`` whose ``authorId`` is not -1.
    """
    # Build the id -> row label table once, so each quote costs one dict lookup
    # instead of a full scan of ``authors``. NaN ids can never equal a QID, and
    # for duplicated ids the first row wins (same as taking ``.index[0]``).
    unique_ids = authors["id"].dropna().drop_duplicates(keep="first")
    label_by_qid = dict(zip(unique_ids.to_numpy(), unique_ids.index))

    author_ids = []
    for qids in chunk["qids"]:
        try:
            first_qid = qids[0]
        except (IndexError, TypeError):
            # Empty or missing qids: the quote cannot be attributed.
            author_ids.append(-1)
            continue
        author_ids.append(label_by_qid.get(first_qid, -1))

    # Positional assignment: one value per row, robust to duplicate index labels.
    chunk["authorId"] = author_ids
    return chunk[chunk["authorId"] != -1]

Store the result of the merge

In [None]:
# Attach the speaker attribute row index to every quote; add_author_id returns
# only the quotes for which a matching speaker row was found.
clinton = add_author_id(df_Clinton, speaker_attributes_updated)
trump = add_author_id(df_Trump, speaker_attributes_updated)

# NOTE(review): if both lines below live in the same cell, only the last bare
# expression (trump) is rendered as the cell output.
clinton
trump

In [None]:
# Persist each merged table to its own file. The Trump table previously reused
# the Clinton path and overwrote the Clinton output.
clinton.to_pickle("../data/Clinton_with_attributes.pkl")
trump.to_pickle("../data/Trump_with_attributes.pkl")