In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# get the MIND data 
MIND_w_topics=pd.read_csv("~/efs/home/pcorona_content/MIND_Train_w_Topics.csv")
MIND_w_topics.sort_values(by=['Title', '_score_'], ascending=False, inplace=True)
MIND_w_topics.drop_duplicates(subset=['__uniqueid__'], keep='first', inplace=True)

In [None]:
MIND_w_topics.columns

## Defining Categories 

We need to have 4 categories statically, with bigrams developed, to be used in multidocument summarization. Groupings are as follows: 

- Level 1) 1 particular topic (manually generated)
- Level 2) 1 larger topic (manually generated)
- Level 3) 1 full sub-category (MIND pulled) 
- Level 4) 1 full category (MIND pulled)

Sports 

| Level | Corp Name | Doc Number |
|-------|-----------|------------|
| 1| UConn | 3 | 
| 1 | Kansas | 5 | 
| 2 | Basketball Games | 116 |  
| 2 | Zion | 145 | 
| 3 | Basketball All | 4880 | 
| 4 | Sports | 31106 | 

In [None]:
# LEVEL 1 Corp - between 3-5 articles 
# summary expected to include information about Crystel Daggerfield (the only senior) and their upcoming season 
uconn_corp = MIND_w_topics.loc[(MIND_w_topics['__uniqueid__'] == 115904) | 
                                (MIND_w_topics['__uniqueid__'] == 92703) | # this one is short and only about an upcoming scheduled game. Consider removing if having issues with length
                                (MIND_w_topics['__uniqueid__'] == 222206)].reset_index()

# summary expected to include information about players redshirting and upcoming rivalry between Missouri and Kansas
kansas_corp = MIND_w_topics.loc[(MIND_w_topics['__uniqueid__'] == 88709) | # about redshirting 
                                (MIND_w_topics['__uniqueid__'] == 186604) | # about redshirting 
                                (MIND_w_topics['__uniqueid__'] == 213915) | # about rivalry and upcoming games 
                                (MIND_w_topics['__uniqueid__'] == 552606) | # about rivalry and upcoming games             
                                (MIND_w_topics['__uniqueid__'] == 181004)].reset_index() # about great player 

# LEVEL 2 Corp
# about Zion Williamson (100ish articles)              
zion_corp = MIND_w_topics[MIND_w_topics['SubCategory'].str.contains('basketball') & MIND_w_topics['Text'].str.contains('Zion Williamson')].reset_index()

# about basketball games in general (35 articles)
basketballsmall_corp = MIND_w_topics.loc[(MIND_w_topics['Abstract'] != None) & 
                       (MIND_w_topics['Abstract'].str.contains("basketball")) & 
                       (MIND_w_topics['Abstract'].str.contains("game"))].reset_index()

# LEVEL 3 Corp - categorized using MIND SubCategory                
basketball_corp = MIND_w_topics[MIND_w_topics['SubCategory'].str.contains('basketball')].reset_index()

# LEVEL 4 Corp - categorized using MIND Category                
sports_category = MIND_w_topics[MIND_w_topics['Category'].str.contains('sports')].reset_index()


In [None]:
# biden_corp = MIND_w_topics.loc[(MIND_w_topics['Abstract'] != None) & 
#                        (MIND_w_topics['Abstract'].str.contains("Biden")) & 
#                        (MIND_w_topics['Abstract'].str.contains("debate")) & 
#                        (MIND_w_topics['__uniqueid__'] != 85212)] # removing one article not of relevance 


# politics = MIND_w_topics.loc[(MIND_w_topics['Abstract'] != None) & (
#                        (MIND_w_topics['Abstract'].str.contains("politic")) | 
#                        (MIND_w_topics['Abstract'].str.contains("government")))].sample(n = 50, random_state=9)



## Generating Bigrams 

For each set of documents: 
- bigrams need to generated for each document 
- file needs to be exported


In [None]:
from occams.summarize import SummaryUnits, TermFrequencyScheme, extract_summary, SummaryExtractor,IncidenceStructure, TermFrequencySummaryExtractor, Extract
from occams.nlp import DocumentProcessor, TermOrder 

docparser = DocumentProcessor(TermOrder.BIGRAMS, language='english', download=True).process

In [None]:
def generate_background(corpus): 
        # get the bigrams 
        documents = [docparser(text) for text in corpus['Text']]

        # generate the single document occams summary (short)
        scheme = TermFrequencyScheme.POSITIONAL_DENSE # utilizes area of document to weight relevance
        target_length=75
        extracts = [extract_summary(documents=[doc], budget=target_length, units=SummaryUnits.WORDS, scheme=scheme)
                for doc in documents]

        corpus['occams_sum75'] = [i.summary() for i in extracts]

        # generate the single document occams summary (long)
        target_length=150
        extracts = [extract_summary(documents=[doc], budget=target_length, units=SummaryUnits.WORDS, scheme=scheme)
                for doc in documents]

        corpus['occams_sum150'] = [i.summary() for i in extracts]

        # get the bigrams and store in counter 
        corpus['bigrams'] = [docparser(str(corpus.Text[i])).get_term_counts() for i in corpus.index] 
        
        extractor = [TermFrequencySummaryExtractor.from_documents([doc], units=SummaryUnits.WORDS, scheme=scheme) for doc in documents]
        corpus['term_weights'] = [e.term_weights_mapping() for e in extractor]

        return corpus

In [None]:
import multisum 

In [None]:
uconn_corp = generate_background(uconn_corp)
kansas_corp = generate_background(kansas_corp)
zion_corp = generate_background(zion_corp)
basketballsmall_corp = generate_background(basketballsmall_corp)
basketball_corp = generate_background(basketball_corp)
sports_category = generate_background(sports_category)

In [None]:
import pickle as pi 

uconn_corp.to_pickle("data/uconn_corp.pickle")
kansas_corp.to_pickle("data/kansas_corp.pickle")
zion_corp.to_pickle("data/zion_corp.pickle")
basketballsmall_corp.to_pickle("data/basketballsmall_corp.pickle")
basketball_corp.to_pickle("data/basketball_corp.pickle")
sports_category.to_pickle("data/sports_corp.pickle")


In [None]:
# generate the term_weights for each document 
sub = MIND_w_topics[MIND_w_topics['Category'] == 'news'].dropna(subset=['Text']).sample(n=100).reset_index()
df = generate_background(sub)

In [None]:
def find_max_term(i, j, key): 

    vali = None
    valj = None
    
    try: 
        vali = df['term_weights'][i][key]
    except: 
        # do nothing
        pass

    try:
        valj = df['term_weights'][j][key]
    except: 
        # do nothing
        pass

    if vali is not None and valj is not None: 
        return max(vali, valj)
    elif vali is None: 
        return valj 
    else:
        return vali


## Look into groupings by term weights
__see if certain groupings cause higher summary scores__


In [None]:
metrics = pd.DataFrame(columns = ['totalNumTerms', 'termsSum', 'summaryScore', 'summary', 'setOverlap', 'coverage'])
# identify which summaries have enough overlap by term count
for i in range(0, len(df) - 1):
    i_terms = set(df['term_weights'][i].keys())
    for j in range(i + 1, len(df)): 
        # checking the overlap of terms 
        j_terms = set(df['term_weights'][j].keys())
        term_overlap = len(i_terms.intersection(j_terms))
        keys = i_terms.union(j_terms)
        totalNumTerms = len(keys)
        allTerms = {k: find_max_term(i, j, k) for k in keys}
        termsSum = sum(allTerms.values())

        # generates summary for these two documents
        minicorp = df[i:j + 1].reset_index()
        documents, doc_incidences, load_time = multisum.load_corpus(minicorp, 'Text')  
        build_time, extractTime, sentences, doc_titles, extractor, sentence_weights = multisum.summarize_collect(minicorp, documents, doc_incidences, 'position', 150, True)
        summaryScore = extractor.extract(budget=150).weight()
        summary = "".join([s.text + "\n" for s in sentences])
   
        metrics = metrics.append({'totalNumTerms': totalNumTerms, 'termsSum' : termsSum, 'summaryScore': 
            summaryScore, 'summary' : summary, 'setOverlap' : term_overlap}, ignore_index=True)

metrics['coverage'] = metrics['summaryScore'] * 100 / metrics['termsSum']

In [None]:
metrics.to_csv("out/sumpairs.csv")

### Generate a back corpus of bigrams

This can be utilized accross the board.

In [None]:
# generate the bigrams for all the MIND documents
MIND_w_topics['bigrams'] = [docparser(str(MIND_w_topics.Text[i])).get_term_counts() for i in MIND_w_topics.index] 
MIND_w_topics.to_pickle("data/mind.pickle")

In [None]:
# take bigrams, aggregate and write an individual counter as well
from collections import Counter
import pickle as pi 

def aggregate_bg(bigrams): 
    bg = Counter() 
    for ele in bigrams: 
        bg.update(ele)
    return bg

bigrams = aggregate_bg(MIND_w_topics['bigrams'])

with open('data/backcorp_bigrams.pickle', 'wb') as outputfile:
    pi.dump(bigrams, outputfile)

### Here is the full pipeline to generate new groupings based on id

In [None]:
def pipeline(name, query, df):
    df[df['entity_']]
    corp = generate_background(corp)
    corp.to_pickle("data/" + name + ".pickle")

### Graph Set Overlap Trials

In [4]:
import plotly.express as px
cov = pd.read_csv("out/sumpairs.csv")

In [7]:
# fig = px.scatter(cov, x='coverage', y='summaryScore', trendline="ols", trendline_options=dict(log_x=True)) # add color= for a grouping var 

fig = px.scatter(cov, x='coverage', y='summaryScore', trendline="lowess") # add color= for a grouping var 
fig.update_traces(marker=dict(size=2))
fig.show()

In [8]:
fig = px.scatter(cov, x='setOverlap', y='summaryScore') # add color= for a grouping var 
fig.update_traces(marker=dict(size=2))
fig.show()

In [9]:
fig = px.scatter(cov, x='termsSum', y='summaryScore') # add color= for a grouping var 
fig.update_traces(marker=dict(size=2))
fig.show()

In [10]:
fig = px.scatter(cov, x='totalNumTerms', y='summaryScore') # add color= for a grouping var 
fig.update_traces(marker=dict(size=2))
fig.show()

In [11]:
cov_high = cov[cov['summaryScore'] > 275]
cov_low = cov[cov['summaryScore'] <= 275]

In [12]:
cov_high.describe()

Unnamed: 0.1,Unnamed: 0,totalNumTerms,termsSum,summaryScore,setOverlap,coverage
count,2276.0,2276.0,2276.0,2276.0,2276.0,2276.0
mean,2942.954745,695.191125,529.801332,378.72728,8.880053,96.327518
std,1588.154502,399.6003,302.426501,62.71525,9.691647,56.26107
min,0.0,138.0,106.823884,275.482602,0.0,15.947804
25%,1393.75,407.0,311.542864,327.624098,3.0,54.062282
50%,3689.5,585.0,444.443879,368.712095,6.0,84.458341
75%,4277.25,874.0,664.261002,428.111227,11.0,126.357089
max,4949.0,2748.0,2087.205855,600.092999,109.0,486.733165
