In [21]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import string
import heapq

In [2]:
def processWords(raw, removePunc=False, addStopwords=False, stem=False):
    """Normalise a piece of text: lower-case it and drop stop words.

    The input is converted with ``str()`` and lower-cased, split on
    whitespace, filtered against the stop-word set and joined back together
    with single spaces.

    Parameters
    ----------
    raw : object
        Text to clean; anything accepted by ``str()``.
    removePunc : bool, default False
        Delete every character found in ``string.punctuation`` before the
        text is split into words.
    addStopwords : bool, default False
        Also drop the words returned by ``additionalStopwords()``.
    stem : bool, default False
        Run ``PorterStemmer().stem`` on each remaining word.

    Returns
    -------
    str
        The remaining (optionally stemmed) words separated by single spaces.
    """
    text = str(raw).lower()
    if removePunc:
        # One C-level pass instead of a per-character list + join.
        text = text.translate(str.maketrans('', '', string.punctuation))

    # A set gives O(1) membership tests; the previous list made every word
    # pay a linear scan over all English + French stop words.
    stopwordSet = set(stopwords.words('english'))
    stopwordSet.update(stopwords.words('french'))
    if addStopwords:
        stopwordSet.update(additionalStopwords())

    # `text` is already lower-cased, so the words need no further folding.
    words = [w for w in text.split() if w not in stopwordSet]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    return ' '.join(words)

def additionalStopwords():
    """Read the extra stop words from ``additional_stopwordsList.txt``.

    The file is looked up relative to the current working directory and is
    read one entry per line.

    Returns
    -------
    list of str
        Each line of the file, lower-cased and with newline characters
        removed, in file order (blank lines yield empty strings).
    """
    # The context manager closes the handle even if reading fails; the
    # previous version opened the file and never closed it.
    with open('additional_stopwordsList.txt') as stopwordsFile:
        return [w.lower().replace('\n', '') for w in stopwordsFile]

In [3]:
def wordWeight(s):
    """Weight every word of ``s`` by its frequency relative to the top word.

    Parameters
    ----------
    s : str
        Text whose whitespace-separated words are counted.

    Returns
    -------
    dict
        Maps each distinct word to ``count / highest_count``, so the most
        frequent word gets 1.0. Empty when ``s`` contains no words.
    """
    fdist = nltk.FreqDist(s.split())
    if not fdist:
        # Nothing to weight; also avoids calling max() on an empty sequence.
        return {}
    # Computed once: the previous loop re-evaluated max() for every word,
    # which made the function quadratic in the vocabulary size.
    maxCount = max(fdist.values())
    return {word: count / maxCount for word, count in fdist.items()}

In [14]:
def sentenceScore(sentences:list, wordWeightDict:dict):
    """Score each sentence as the sum of the weights of its known tokens.

    Parameters
    ----------
    sentences : list
        Sentences to score; each is tokenised with ``nltk.word_tokenize``.
    wordWeightDict : dict
        Maps a token to its weight.

    Returns
    -------
    dict
        Maps a sentence to the summed weight of every token occurrence found
        in ``wordWeightDict``. Sentences without any weighted token are left
        out; a sentence that appears more than once accumulates into the
        same entry.
    """
    totals = {}
    for sent in sentences:
        for token in nltk.word_tokenize(sent):
            if token not in wordWeightDict:
                # Tokens without a weight contribute nothing.
                continue
            if sent in totals:
                totals[sent] += wordWeightDict[token]
            else:
                totals[sent] = wordWeightDict[token]
    return totals

In [5]:
# Load the dataset from data.json (resolved against the working directory).
df = pd.read_json('data.json')

In [6]:
# Drop rows whose 'descr' is missing. This mutates df in place and keeps the
# original index labels (the index is not reset).
df.dropna(subset = ['descr'], inplace=True)

In [7]:
# Cleaned description with punctuation stripped; input for the word weights.
df['descr_no_punc'] = df['descr'].apply(processWords, removePunc=True)
# Cleaned description with punctuation kept, then split into sentences.
df['descr_sent_token'] = df['descr'].apply(processWords).apply(nltk.sent_tokenize)
# Per-row word -> relative-frequency mapping from the punctuation-free text.
df['descr_word_wt'] = df['descr_no_punc'].apply(wordWeight)

In [18]:
# Score every row's sentences against that same row's word weights.
df['descr_sent_score'] = df.apply(
    lambda row: sentenceScore(row['descr_sent_token'], row['descr_word_wt']),
    axis=1,
)

In [33]:
# Summary = the three sentences with the highest score in each row's dict.
df['descr_summ'] = df['descr_sent_score'].apply(
    lambda scores: heapq.nlargest(3, scores, key=scores.get)
)

In [34]:
# Spot-check: display the summary sentences stored under key 14.
df['descr_summ'][14]

['role dimensions: role direct in-direct impact following financial measures: regulatory capital ; economic regulatory capital; counterparty credit risk exposure measurements; qualifications knowledge skills: must-have skills: 3+ years related work experience financial institution strong knowledge financial products, specifically possessing knowledge derivatives ( otc) well sft strong experience working basel iii osfi car guideline strong experience ccr, ccr capital ( cem, sa-ccr) must experience bcar advanced knowledge excel, vba, sql, database concepts advanced python programming experience participation mid-scale/large development projects, hands-on experience conducting testing strong capability writing business requirement test plans strong capability creating mock-up data required test pre-specified cases nice-to-have skills: broad understanding risk management methodologies experience using work tracking systems (e.g.',
 'cr eate test plan including test scenarios sample data su

In [19]:
# Sanity check: type of the sentence-score entry stored under key 1.
type(df['descr_sent_score'][1])

dict

In [26]:
# Five highest-scoring sentences of the entry under key 3, ranked by the
# score each sentence has in that entry's dict.
summ = heapq.nlargest(5,df['descr_sent_score'][3],key=df['descr_sent_score'][3].get)

In [28]:
# Sanity check: type of the value returned by heapq.nlargest.
type(summ)

list

In [29]:
# Print each selected sentence on its own line.
for s in summ:
    print(s)

financial derivatives pricing, var, counterparty credit risk, drc/irc, cva/xva, frtb ccar models) based industry best practices;be able learn work quantitative analytical areas credit modeling, forecasting stress testing, customer behavior modeling, new innovations machine learning artificial intelligence;carry various complex financial analyses including derivatives valuation independently;support contribute variety market risk areas projects clients delivering high quality value-added work timely base;work clients independently demonstrating capabilities accountabilities subject matter expertcontribute growth continued business practices within financial engineering analytics (fea) groupemploy structured approach project management ensure complete client satisfaction project profitability external posting qualifications solid academic background phd master degree mathematical finance, financial engineering relevant post graduate degree; frm, prm, cfa asset;solid knowledge skills mode