# Map Dates

In [7]:
import pandas as pd

In [49]:
def filter_dataframe(year):
    """Build a DataFrame of the matching quotations found in one Quotebank dump.

    The file ``../data/quotebank-<year>.json.bz2`` is read in chunks of
    10000 lines and every chunk is handed to ``filter_chunk`` on a thread
    pool; the filtered chunks are then concatenated.

    Parameters
    ----------
    year : str or int
        Year used to build the input file name (e.g. ``"2020"``).

    Returns
    -------
    pandas.DataFrame
        The concatenation of every filtered chunk, ordered by the original
        row index. An empty DataFrame is returned when no chunk produced
        any row.

    Raises
    ------
    Exception
        Any exception raised by ``filter_chunk`` in a worker thread is
        re-raised here instead of being silently discarded.
    """
    # Kept local so this cell stays self-contained, as in the original.
    import concurrent.futures

    list_df = []
    path = "../data/quotebank-" + str(year) + ".json.bz2"
    with pd.read_json(path, lines=True, chunksize=10000, compression='bz2') as df_reader:
        # The context manager guarantees the pool is shut down (and all
        # submitted work awaited) even if reading a chunk raises.
        with concurrent.futures.ThreadPoolExecutor(30) as executor:
            # We read chunk by chunk instead of loading the whole file at once.
            futures = [
                executor.submit(filter_chunk, list_df, chunk)
                for chunk in df_reader
            ]

    # submit() stores worker exceptions on the Future; result() surfaces them,
    # otherwise a failed chunk would just be missing from the output.
    for future in futures:
        future.result()

    if not list_df:
        # pd.concat raises on an empty list; no match is a valid outcome.
        return pd.DataFrame()

    # Workers append in completion order, so restore the file order to make
    # the result reproducible from run to run.
    return pd.concat(list_df).sort_index()

In [50]:
def filter_chunk(list_df, chunk):
    """Filter one chunk and append the surviving rows to ``list_df``.

    The article-level columns that are not needed are dropped, each row's
    ``quotations`` value is replaced by the list returned by
    ``filtering_func``, and rows whose resulting list is empty are removed.

    Parameters
    ----------
    list_df : list
        Accumulator shared with the caller; the filtered frame is appended
        to it.
    chunk : pandas.DataFrame
        Chunk to filter. It must contain the dropped columns and a
        ``quotations`` column. The chunk itself is left unmodified, so the
        function can safely be called again on the same frame.

    Returns
    -------
    None
    """
    unused_columns = ["articleID", "phase", "title", "url", "articleLength", "names"]
    # drop()/assign() return new frames, so the caller's chunk is not mutated.
    trimmed = chunk.drop(columns=unused_columns)
    trimmed = trimmed.assign(quotations=trimmed["quotations"].map(filtering_func))
    # .str.len() gives the length of each list; keep rows with at least one ID.
    matching = trimmed[trimmed["quotations"].str.len() > 0]
    list_df.append(matching)  # we append it to the list of dataframes
            
        
def filtering_func(lst, keywords=("Trump", "Clinton")):
    """Return the IDs of the quotations that contain at least one keyword.

    Parameters
    ----------
    lst : iterable of dict
        Each item is indexed with the ``"quotation"`` and ``"quoteID"`` keys.
    keywords : str or iterable of str, optional
        Case-sensitive substrings to look for in ``el["quotation"]``.
        Defaults to ``("Trump", "Clinton")``, the original hard-coded search.

    Returns
    -------
    list
        The ``"quoteID"`` of every matching item, in input order.
    """
    if isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        keywords = (keywords,)
    else:
        # Materialise once so a generator is not exhausted after the first item.
        keywords = tuple(keywords)

    return [
        el["quoteID"]
        for el in lst
        if any(keyword in el["quotation"] for keyword in keywords)
    ]
    

In [None]:
# Run the chunked filter over the 2020 dump (reads ../data/quotebank-2020.json.bz2).
quoteID_date = filter_dataframe("2020")

In [38]:
# Display the frame returned by filter_dataframe.
quoteID_date

Unnamed: 0,quotations,date
1,[2019-03-25-054542],2020-02-17 00:00:00
9,[2020-02-18-082033],2020-02-18 00:11:22
35,[2020-02-13-037918],2020-02-18 00:00:00
38,"[2020-01-02-030693, 2020-01-02-060856, 2020-01...",2020-01-02 14:33:10
47,[2018-05-09-022250],2020-02-18 18:24:18
...,...,...
299838,"[2020-02-25-055636, 2020-02-25-024189, 2020-02...",2020-02-25 00:00:00
299840,"[2020-02-25-055636, 2020-02-25-024189, 2020-02...",2020-02-25 00:00:00
299856,[2020-02-25-062694],2020-02-25 21:11:30
299958,[2020-02-27-067393],2020-02-28 05:28:21


In [34]:
# Persist the filtered frame as a pickle file in the current working directory.
quoteID_date.to_pickle('2020_quoteID_date.pkl')

In [30]:
# Reload the pickled frame from disk and display it.
# NOTE(review): unpickling can execute arbitrary code — only load files this notebook produced.
quoteID_cleaned = pd.read_pickle('2020_quoteID_date.pkl')
quoteID_cleaned

Unnamed: 0,quotations,date
1,[2019-03-25-054542],2020-02-17 00:00:00
9,[2020-02-18-082033],2020-02-18 00:11:22
35,[2020-02-13-037918],2020-02-18 00:00:00
38,"[2020-01-02-030693, 2020-01-02-060856, 2020-01...",2020-01-02 14:33:10
47,[2018-05-09-022250],2020-02-18 18:24:18
...,...,...
9808,[2020-02-12-084875],2020-02-12 18:50:58
9863,[2020-04-07-043642],2020-04-07 19:39:16
9928,[2020-02-14-049463],2020-02-15 00:44:00
9962,"[2020-02-15-059489, 2020-02-15-032075, 2020-02...",2020-02-15 14:23:07


In [44]:
def update_quotes(quotes, quoteID_date):
    """Return a copy of ``quotes`` whose ``date`` is taken from ``quoteID_date``.

    The previous body could not run: it referenced an undefined name
    (``quote_ID_date``), iterated over the column labels of ``quotes``
    rather than its rows, and indexed the lookup frame with a scalar boolean.

    Parameters
    ----------
    quotes : pandas.DataFrame
        Must contain a ``quoteID`` column. Not modified.
    quoteID_date : pandas.DataFrame
        Lookup frame with a ``quotations`` column (list of quote IDs per row)
        and a ``date`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``quotes``. Rows whose ``quoteID`` appears in
        ``quoteID_date`` get that row's ``date``; all other rows are
        returned unchanged.

    Notes
    -----
    NOTE(review): a quote ID can appear in several lookup rows with different
    dates; the earliest one is used here — confirm this is the intended rule.
    """
    # One row per (quote ID, date) pair. Empty lists explode to NaN and are
    # dropped because they carry no quote ID.
    pairs = (
        quoteID_date[["quotations", "date"]]
        .explode("quotations")
        .dropna(subset=["quotations"])
    )
    earliest_date = pairs.groupby("quotations")["date"].min()

    new_quotes = quotes.copy()
    matched = new_quotes["quoteID"].isin(earliest_date.index)
    if matched.any():
        # Only touch matched rows so unmatched quotes keep their original date.
        new_quotes.loc[matched, "date"] = new_quotes.loc[matched, "quoteID"].map(earliest_date)
    return new_quotes

In [3]:
# Load ../data/df_Trump_cleaned.csv into a DataFrame and display it.
trump_cleaned = pd.read_csv('../data/df_Trump_cleaned.csv')
trump_cleaned

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2015-06-18-018819,How long do you think it'll take for Donald Tr...,Chris Matthews,"['Q15735939', 'Q25189328', 'Q5107375', 'Q51108...",2015-06-18 10:00:00,1,"[['Chris Matthews', '0.3869'], ['None', '0.319...",['http://talkingpointsmemo.com/dc/donald-trump...,E
1,2015-09-16-006359,And I'm just pointing out the absurd on both s...,Kathleen Madigan,['Q6376814'],2015-09-16 05:44:37,1,"[['Kathleen Madigan', '0.8025'], ['None', '0.1...",['http://northjersey.com/arts-and-entertainmen...,E
2,2015-05-11-003168,"And you had Michael Jackson roller skating, an...",Jim McMahon,"['Q1689159', 'Q17306267', 'Q6196886']",2015-05-11 02:03:27,1,"[['Jim McMahon', '0.8731'], ['None', '0.1232']...",['http://www.seattletimes.com/seattle-news/bel...,E
3,2015-10-20-044420,I like that he's not spinning things. He seems...,,[],2015-10-20 23:46:58,1,"[['None', '0.7832'], ['Ben Carson', '0.0755'],...",['http://detroitnews.com/story/news/politics/2...,E
4,2015-12-22-031341,"I promise, I won't talk about Trump again,",Jeb Bush,['Q221997'],2015-12-22 20:43:59,10,"[['Jeb Bush', '0.7816'], ['None', '0.1677'], [...",['http://www.politico.com/story/2015/12/jeb-bu...,E
...,...,...,...,...,...,...,...,...,...
935286,2020-02-05-103219,Trump offends and disrespects the Venezuelan p...,Jorge Arreaza,['Q6623799'],2020-02-05 00:00:00,11,"[['Jorge Arreaza', '0.9164'], ['None', '0.0726...",['https://www.rawstory.com/2020/02/imwithfred-...,E
935287,2020-02-05-103235,"Trump survived, but he is the most unpopular p...",,[],2020-02-05 23:11:42,3,"[['None', '0.8786'], ['Donald Trump', '0.1214']]",['https://www.wellsvilledaily.com/zz/news/2020...,E
935288,2020-03-13-071475,"Trump tried to mitigate the issue, saying it i...",Hassan Nasrallah,['Q181182'],2020-03-13 22:15:06,1,"[['Hassan Nasrallah', '0.922'], ['None', '0.07...",['http://israelnationalnews.com/News/News.aspx...,E
935289,2020-03-15-037086,Trump's do-over approach -- he unlocked $50 bi...,Newt Gingrich,['Q182788'],2020-03-15 00:00:00,40,"[['Newt Gingrich', '0.5146'], ['None', '0.3958...",['http://uspolitics.einnews.com/article/512089...,E


In [55]:
def filter_chunk_by_number_of_qids(initial_chunk, qid_number):
    """Keep the rows whose ``qids`` entry has length ``qid_number``.

    Parameters
    ----------
    initial_chunk : pandas.DataFrame
        Frame with a ``qids`` column.
    qid_number : int
        Required length of the ``qids`` entry. This was previously ignored
        and the comparison was hard-coded to 1.

    Returns
    -------
    pandas.DataFrame
        The rows of ``initial_chunk`` that satisfy the length condition.

    Notes
    -----
    ``.str.len()`` returns the number of items for list values but the number
    of characters for string values. If ``qids`` holds stringified lists
    (e.g. after a CSV round-trip), parse them first to count qids.
    """
    qid_counts = initial_chunk["qids"].str.len()
    return initial_chunk[qid_counts == qid_number]

In [56]:
# Print the result of filter_chunk_by_number_of_qids on trump_cleaned, called with 12.
print(filter_chunk_by_number_of_qids(trump_cleaned, 12))

# Display the unfiltered frame.
trump_cleaned

KeyError: 935291