# Importing the libraries

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

In [2]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

## Importing sample article

In [3]:
article = open('sample2.txt','r').read()

In [4]:
article

'Democrats unveiled two articles of impeachment Tuesday against US President Donald Trump after weeks of arguing there is overwhelming evidence that the US leader abused his office and deserves to be removed.\nIf the charges -- abuse of power and obstruction of Congress -- are approved by the full House of Representatives in a vote expected next week, it would put Trump in the historic position of being the third US leader ever impeached and placed on trial in the Senate.\n"Our president holds the ultimate public trust," said House Judiciary Committee Jerry Nadler.\n"When he betrays that trust and puts himself before country, he endangers the constitution, he endangers our democracy and he endangers our national security."\nNadler, in a solemn and deeply serious moment for the nation, was joined by House Speaker Nancy Pelosi in the US Capitol to lay out the charges facing Trump.\nThe president is alleged to have wielded the power of the presidency for personal and political gain by pre

## Applying various operations such as Tokenize, Lemmatize and Stopwords

In [5]:
tokens = word_tokenize(article)

In [6]:
tokens

['Democrats',
 'unveiled',
 'two',
 'articles',
 'of',
 'impeachment',
 'Tuesday',
 'against',
 'US',
 'President',
 'Donald',
 'Trump',
 'after',
 'weeks',
 'of',
 'arguing',
 'there',
 'is',
 'overwhelming',
 'evidence',
 'that',
 'the',
 'US',
 'leader',
 'abused',
 'his',
 'office',
 'and',
 'deserves',
 'to',
 'be',
 'removed',
 '.',
 'If',
 'the',
 'charges',
 '--',
 'abuse',
 'of',
 'power',
 'and',
 'obstruction',
 'of',
 'Congress',
 '--',
 'are',
 'approved',
 'by',
 'the',
 'full',
 'House',
 'of',
 'Representatives',
 'in',
 'a',
 'vote',
 'expected',
 'next',
 'week',
 ',',
 'it',
 'would',
 'put',
 'Trump',
 'in',
 'the',
 'historic',
 'position',
 'of',
 'being',
 'the',
 'third',
 'US',
 'leader',
 'ever',
 'impeached',
 'and',
 'placed',
 'on',
 'trial',
 'in',
 'the',
 'Senate',
 '.',
 '``',
 'Our',
 'president',
 'holds',
 'the',
 'ultimate',
 'public',
 'trust',
 ',',
 "''",
 'said',
 'House',
 'Judiciary',
 'Committee',
 'Jerry',
 'Nadler',
 '.',
 '``',
 'When',
 '

In [7]:
tokens = [lm.lemmatize(words).lower() for words in tokens if not words in set(stopwords.words('english'))]

In [8]:
tokens

['democrats',
 'unveiled',
 'two',
 'article',
 'impeachment',
 'tuesday',
 'us',
 'president',
 'donald',
 'trump',
 'week',
 'arguing',
 'overwhelming',
 'evidence',
 'us',
 'leader',
 'abused',
 'office',
 'deserves',
 'removed',
 '.',
 'if',
 'charge',
 '--',
 'abuse',
 'power',
 'obstruction',
 'congress',
 '--',
 'approved',
 'full',
 'house',
 'representatives',
 'vote',
 'expected',
 'next',
 'week',
 ',',
 'would',
 'put',
 'trump',
 'historic',
 'position',
 'third',
 'us',
 'leader',
 'ever',
 'impeached',
 'placed',
 'trial',
 'senate',
 '.',
 '``',
 'our',
 'president',
 'hold',
 'ultimate',
 'public',
 'trust',
 ',',
 "''",
 'said',
 'house',
 'judiciary',
 'committee',
 'jerry',
 'nadler',
 '.',
 '``',
 'when',
 'betrays',
 'trust',
 'put',
 'country',
 ',',
 'endangers',
 'constitution',
 ',',
 'endangers',
 'democracy',
 'endangers',
 'national',
 'security',
 '.',
 "''",
 'nadler',
 ',',
 'solemn',
 'deeply',
 'serious',
 'moment',
 'nation',
 ',',
 'joined',
 'house'

In [9]:
tokens = [word for word in tokens if word.isalpha()]

In [10]:
tokens

['democrats',
 'unveiled',
 'two',
 'article',
 'impeachment',
 'tuesday',
 'us',
 'president',
 'donald',
 'trump',
 'week',
 'arguing',
 'overwhelming',
 'evidence',
 'us',
 'leader',
 'abused',
 'office',
 'deserves',
 'removed',
 'if',
 'charge',
 'abuse',
 'power',
 'obstruction',
 'congress',
 'approved',
 'full',
 'house',
 'representatives',
 'vote',
 'expected',
 'next',
 'week',
 'would',
 'put',
 'trump',
 'historic',
 'position',
 'third',
 'us',
 'leader',
 'ever',
 'impeached',
 'placed',
 'trial',
 'senate',
 'our',
 'president',
 'hold',
 'ultimate',
 'public',
 'trust',
 'said',
 'house',
 'judiciary',
 'committee',
 'jerry',
 'nadler',
 'when',
 'betrays',
 'trust',
 'put',
 'country',
 'endangers',
 'constitution',
 'endangers',
 'democracy',
 'endangers',
 'national',
 'security',
 'nadler',
 'solemn',
 'deeply',
 'serious',
 'moment',
 'nation',
 'joined',
 'house',
 'speaker',
 'nancy',
 'pelosi',
 'us',
 'capitol',
 'lay',
 'charge',
 'facing',
 'trump',
 'the',


## Making a dictionary with count of each token

In [11]:
d = {}
for word in tokens:
    c = tokens.count(word)
    key = {word:c}
    d.update(key)

In [12]:
d

{'democrats': 6,
 'unveiled': 1,
 'two': 1,
 'article': 2,
 'impeachment': 3,
 'tuesday': 2,
 'us': 8,
 'president': 7,
 'donald': 1,
 'trump': 12,
 'week': 3,
 'arguing': 1,
 'overwhelming': 2,
 'evidence': 2,
 'leader': 2,
 'abused': 1,
 'office': 3,
 'deserves': 1,
 'removed': 1,
 'if': 1,
 'charge': 4,
 'abuse': 2,
 'power': 2,
 'obstruction': 2,
 'congress': 2,
 'approved': 1,
 'full': 1,
 'house': 5,
 'representatives': 1,
 'vote': 2,
 'expected': 2,
 'next': 1,
 'would': 5,
 'put': 2,
 'historic': 1,
 'position': 1,
 'third': 2,
 'ever': 1,
 'impeached': 2,
 'placed': 1,
 'trial': 2,
 'senate': 2,
 'our': 1,
 'hold': 1,
 'ultimate': 1,
 'public': 3,
 'trust': 2,
 'said': 3,
 'judiciary': 1,
 'committee': 2,
 'jerry': 1,
 'nadler': 2,
 'when': 1,
 'betrays': 1,
 'country': 1,
 'endangers': 3,
 'constitution': 1,
 'democracy': 1,
 'national': 2,
 'security': 2,
 'solemn': 1,
 'deeply': 1,
 'serious': 1,
 'moment': 1,
 'nation': 1,
 'joined': 1,
 'speaker': 1,
 'nancy': 1,
 'pelosi

## Converting the Dictionary to DataFrame

In [13]:
df = pd.DataFrame.from_dict(d, orient='index', columns=['Occurences'])

In [14]:
df.sort_values(by='Occurences', ascending=False)

Unnamed: 0,Occurences
trump,12
us,8
president,7
democrats,6
house,5
...,...
announcing,1
investigate,1
democratic,1
former,1


In [15]:
sm=df['Occurences'].sum()

In [16]:
sm

295

In [17]:
mx = df['Occurences'].max()

## Assigning the weights of each token based on their number of occurrences

In [18]:
m={}
for i in range(1,mx+1):
    c = i/sm
    k = {i:c}
    m.update(k)

In [19]:
m

{1: 0.003389830508474576,
 2: 0.006779661016949152,
 3: 0.010169491525423728,
 4: 0.013559322033898305,
 5: 0.01694915254237288,
 6: 0.020338983050847456,
 7: 0.023728813559322035,
 8: 0.02711864406779661,
 9: 0.030508474576271188,
 10: 0.03389830508474576,
 11: 0.03728813559322034,
 12: 0.04067796610169491}

In [20]:
df['Weights'] = df['Occurences'].map(m)

In [21]:
df.sort_values(by='Occurences', ascending=False)

Unnamed: 0,Occurences,Weights
trump,12,0.040678
us,8,0.027119
president,7,0.023729
democrats,6,0.020339
house,5,0.016949
...,...,...
announcing,1,0.003390
investigate,1,0.003390
democratic,1,0.003390
former,1,0.003390


In [22]:
df.reset_index(level=0, inplace=True)

In [23]:
df

Unnamed: 0,index,Occurences,Weights
0,democrats,6,0.020339
1,unveiled,1,0.003390
2,two,1,0.003390
3,article,2,0.006780
4,impeachment,3,0.010169
...,...,...,...
199,chamber,1,0.003390
200,republicans,1,0.003390
201,yet,1,0.003390
202,signaled,1,0.003390


## Splitting the Article into sentences to calculate the weightage of each sentence

In [24]:
para = article.split(".")

In [25]:
para

['Democrats unveiled two articles of impeachment Tuesday against US President Donald Trump after weeks of arguing there is overwhelming evidence that the US leader abused his office and deserves to be removed',
 '\nIf the charges -- abuse of power and obstruction of Congress -- are approved by the full House of Representatives in a vote expected next week, it would put Trump in the historic position of being the third US leader ever impeached and placed on trial in the Senate',
 '\n"Our president holds the ultimate public trust," said House Judiciary Committee Jerry Nadler',
 '\n"When he betrays that trust and puts himself before country, he endangers the constitution, he endangers our democracy and he endangers our national security',
 '"\nNadler, in a solemn and deeply serious moment for the nation, was joined by House Speaker Nancy Pelosi in the US Capitol to lay out the charges facing Trump',
 '\nThe president is alleged to have wielded the power of the presidency for personal and 

In [26]:
corpus=[]
for i in range(0,len(para)):  
    para_tokens = word_tokenize(para[i])
    para_tokens = [lm.lemmatize(words).lower() for words in para_tokens if not words in set(stopwords.words('english'))]
    corpus.append([word for word in para_tokens if word.isalpha()])

In [27]:
corpus

[['democrats',
  'unveiled',
  'two',
  'article',
  'impeachment',
  'tuesday',
  'us',
  'president',
  'donald',
  'trump',
  'week',
  'arguing',
  'overwhelming',
  'evidence',
  'us',
  'leader',
  'abused',
  'office',
  'deserves',
  'removed'],
 ['if',
  'charge',
  'abuse',
  'power',
  'obstruction',
  'congress',
  'approved',
  'full',
  'house',
  'representatives',
  'vote',
  'expected',
  'next',
  'week',
  'would',
  'put',
  'trump',
  'historic',
  'position',
  'third',
  'us',
  'leader',
  'ever',
  'impeached',
  'placed',
  'trial',
  'senate'],
 ['our',
  'president',
  'hold',
  'ultimate',
  'public',
  'trust',
  'said',
  'house',
  'judiciary',
  'committee',
  'jerry',
  'nadler'],
 ['when',
  'betrays',
  'trust',
  'put',
  'country',
  'endangers',
  'constitution',
  'endangers',
  'democracy',
  'endangers',
  'national',
  'security'],
 ['nadler',
  'solemn',
  'deeply',
  'serious',
  'moment',
  'nation',
  'joined',
  'house',
  'speaker',
  'n

In [28]:
len(corpus[1])

27

In [29]:
df2 = df.copy()

In [30]:
df2.drop('Occurences', axis=1, inplace=True)

In [31]:
info = df2.T.to_dict('list')

In [32]:
info

{0: ['democrats', 0.020338983050847456],
 1: ['unveiled', 0.003389830508474576],
 2: ['two', 0.003389830508474576],
 3: ['article', 0.006779661016949152],
 4: ['impeachment', 0.010169491525423728],
 5: ['tuesday', 0.006779661016949152],
 6: ['us', 0.02711864406779661],
 7: ['president', 0.023728813559322035],
 8: ['donald', 0.003389830508474576],
 9: ['trump', 0.04067796610169491],
 10: ['week', 0.010169491525423728],
 11: ['arguing', 0.003389830508474576],
 12: ['overwhelming', 0.006779661016949152],
 13: ['evidence', 0.006779661016949152],
 14: ['leader', 0.006779661016949152],
 15: ['abused', 0.003389830508474576],
 16: ['office', 0.010169491525423728],
 17: ['deserves', 0.003389830508474576],
 18: ['removed', 0.003389830508474576],
 19: ['if', 0.003389830508474576],
 20: ['charge', 0.013559322033898305],
 21: ['abuse', 0.006779661016949152],
 22: ['power', 0.006779661016949152],
 23: ['obstruction', 0.006779661016949152],
 24: ['congress', 0.006779661016949152],
 25: ['approved', 0

## Calculating the weightage of the 9 sentences in the article

In [33]:
sum = 0
total = []
for i in range(0,len(corpus)):
    sum = 0
    for j in range(0,len(corpus[i])):
        for k in range(0,203):
            if corpus[i][j] == info[k][0]:
                sum = sum + info[k][1]
            else:
                sum = sum + 0
    total.append(sum)

In [34]:
total

[0.2271186440677966,
 0.23728813559322035,
 0.09830508474576272,
 0.07457627118644068,
 0.14576271186440679,
 0.12203389830508474,
 0.17966101694915257,
 0.0983050847457627,
 0.15593220338983055,
 0.2440677966101695,
 0.18305084745762715,
 0.25423728813559326,
 0.2338983050847458,
 0.11864406779661019,
 0.10847457627118644,
 0]

## Therefore 2nd sentence has the highest weightage

In [35]:
para[total.index(max(total))]

'" Trump, who has long assailed the\nDemocrats for pursuing impeachment, maintained his fighting posture early Tuesday, tweeting that the effort to oust him as "sheer Political Madness!" Democrats on Monday laid out their case for ouster with a nearly 10-hour public hearing in which they declared Trump a "clear and present danger" to national security'

# Therefore the context can be derived from the 12th sentence and the words occurring maximum number of times in the article