In [2]:
# Importing the libraries
import nltk
import pandas as pd
import numpy as np

In [426]:
# Using some sample text data
txt='The Opposition parties will run to a blank wall. Modi will give then a run for their money'

In [429]:
# Tokenize
wordsList = nltk.word_tokenize(txt)
print(wordsList)
# Now we need to identify what is the part of speech for each tokenised part


['The', 'Opposition', 'parties', 'will', 'run', 'to', 'a', 'blank', 'wall', '.', 'Modi', 'will', 'give', 'then', 'a', 'run', 'for', 'their', 'money']


In [420]:
# extracting the pos tags
tagged = nltk.pos_tag(wordsList)
print(tagged)


[('The', 'DT'), ('Opposition', 'NNP'), ('parties', 'NNS'), ('will', 'MD'), ('run', 'VB'), ('to', 'TO'), ('a', 'DT'), ('blank', 'NN'), ('wall', 'NN'), ('.', '.'), ('Modi', 'NNP'), ('will', 'MD'), ('give', 'VB'), ('then', 'RB'), ('a', 'DT'), ('run', 'NN'), ('for', 'IN'), ('their', 'PRP$'), ('money', 'NN')]


In [431]:
tagged_ls = nltk.pos_tag(wordsList)
print(tagged_ls)

[('The', 'DT'), ('Opposition', 'NNP'), ('parties', 'NNS'), ('will', 'MD'), ('run', 'VB'), ('to', 'TO'), ('a', 'DT'), ('blank', 'NN'), ('wall', 'NN'), ('.', '.'), ('Modi', 'NNP'), ('will', 'MD'), ('give', 'VB'), ('then', 'RB'), ('a', 'DT'), ('run', 'NN'), ('for', 'IN'), ('their', 'PRP$'), ('money', 'NN')]


In [None]:
# we can see that the word 'run' appears twice
# 1: As a VB (Verb)
# 2: As a NN (Noun)
# Such pair of words are known as homographs

In [424]:
# Extracting the key entity (Basically the Noun)
new_list = [i[0] for i in tagged_ls if ((i[1]=='NNP') | (i[1]=='NNPS' ) | (i[1]=='NN') | (i[1]=='NNS')) ]
new_list

# NN: Noun (singular)
# NNS: Noun (plural)
# NNP: Noun (Proper Noun)
# NNPS: Proper Noun (plural)

['dashing', 'personality']

In [432]:
# Extracting Proper Nouns
new_list = [i[0] for i in tagged_ls if i[1]=='NNP']
new_list

# Opposition and Modi

['Opposition', 'Modi']

****There are text corpus in NLTK library which are pre-tagged. 
We will try and analyze one such corpora****

In [60]:
# Reading tagged Corpora
# brown bag corpora in nltk is tagged
# Taken from : https://www.nltk.org/book/ch05.html
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [61]:
nltk.corpus.brown.tagged_words(tagset='universal')
# Tagset argument has been provided a value of 'universal'
# This causes pos tags to not apper in the abbreviated form
# For instance, in the output we see that 'The' is tagged as 'DET'(Determiner)
# and 'Fulton' is tagged as 'NOUN'

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [62]:
# Let us look at the Frequency Distributon of brown bag corpora
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

# We can see that most of the words are Noun followed by Verb
# X represents category that is exclusive of the ones mentioned below

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

## Let us now try and analyze a text corpus which is related to Indian Prime Minister Narendra Modi .It is a sort of short summary about his background and political journey
<br>


In [3]:
# The text data about Mr Modi taken from the following link
# Link: https://www.britannica.com/biography/Narendra-Modi
# I have dumped the text data in the csv file by the name 'text_file.csv'
# The file will be available for download along with the blog
# We will try to draw some summary in the light of POS tags and NER

dr="C:\\NLP in Python\\POS Tags"
path=dr+"\\text_file.csv"

df=pd.read_csv(dr+"\\text_file.csv",encoding='windows-1252')
df.head()

Unnamed: 0,Text
0,"Narendra Modi, in full Narendra Damodardas Mod..."
1,Modi was raised in a small town in northern Gu...
2,"Modi joined the BJP in 1987, and a year later ..."
3,In 1995 Modi was made the secretary of the BJP...
4,Modi’s political career thereafter remained a ...


In [None]:
# Let us now try and perform the following tasks
# Task 1: Identify key entities from the text
# Task 2: For the key entities identify the adjectives used in the text


# Task 1: Identify key entities from the text
<br>


In [4]:
df.shape
# the data set has 10 records

(10, 1)

In [5]:
# Lets get the pos tags for the first row
# Tokenize

l1=[]

for i in df['Text']:
    l1.append(str(i))


l2=[]

for j in l1:
    l2.append(nltk.word_tokenize(j))
    

print(l2[0])# gives the tokenized form for the first row of the data set


['Narendra', 'Modi', ',', 'in', 'full', 'Narendra', 'Damodardas', 'Modi', ',', '(', 'born', 'September', '17', ',', '1950', ',', 'Vadnagar', ',', 'India', ')', ',', 'Indian', 'politician', 'and', 'government', 'official', 'who', 'rose', 'to', 'become', 'a', 'senior', 'leader', 'of', 'the', 'Bharatiya', 'Janata', 'Party', '(', 'BJP', ')', '.', 'In', '2014', 'he', 'led', 'his', 'party', 'to', 'victory', 'in', 'elections', 'to', 'the', 'Lok', 'Sabha', '(', 'lower', 'chamber', 'of', 'the', 'Indian', 'parliament', ')', ',', 'after', 'which', 'he', 'was', 'sworn', 'in', 'as', 'prime', 'minister', 'of', 'India', '.', 'Prior', 'to', 'that', 'he', 'had', 'served', '(', '2001–14', ')', 'as', 'chief', 'minister', '(', 'head', 'of', 'government', ')', 'of', 'Gujarat', 'state', 'in', 'western', 'India', '.']


In [6]:
# Now we need to identify what is the part of speech for each tokenised part

l3=[nltk.pos_tag(i) for i in l2]
#tagged = nltk.pos_tag(l2)
l3[0][0:5]


[('Narendra', 'NNP'),
 ('Modi', 'NNP'),
 (',', ','),
 ('in', 'IN'),
 ('full', 'JJ')]

In [7]:
# Elements in l3 are tokenized words from each paragraph
# Lets look at NNP words from each paragraph
key_entities=[]

for i in l3:
    l1=[]
    for j in i:
        if j[1]=='NNP':
            l1.append(j[0])
        
    key_entities.append(l1)

print(key_entities)

[['Narendra', 'Modi', 'Narendra', 'Damodardas', 'Modi', 'September', 'Vadnagar', 'India', 'Bharatiya', 'Janata', 'Party', 'BJP', 'Lok', 'Sabha', 'Indian', 'India', 'Prior', 'Gujarat', 'India'], ['Modi', 'Gujarat', 'M.A', 'Gujarat', 'University', 'Ahmadabad', 'Rashtriya', 'Swayamsevak', 'Sangh', 'RSS', 'RSS', '’', 'Akhil', 'Bharatiya', 'Vidyarthi', 'Parishad', 'Modi', 'RSS'], ['Modi', 'BJP', 'Gujarat', '’', 'Modi', 'BJP', 'BJP', 'March', 'India', 'BJP', '’', 'September'], ['Modi', 'BJP', '’', 'New', 'Delhi', 'October', 'Gujarat', 'BJP', 'Keshubhai', 'Patel', 'Patel', '’', 'Bhuj', 'Gujarat', 'Modi', 'February', 'Gujarat'], ['Modi', '’', 'Gujarat', 'Muslims', 'Hindu', 'Godhra', 'United', 'United', 'Kingdom', 'Modi', 'Modi', '’', 'Lashkar-e-Taiba', 'Mumbai', 'Modi'], ['Modi', '’', 'Gujarat', 'BJP', 'BJP', 'December', 'Modi', 'Gujarat', 'BJP', 'Modi'], ['Gujarat', 'Modi', '’', '’', 'Modi', '’', 'India', 'June', 'Modi', 'BJP', '’', 'Lok', 'Sabha'], ['Modi', 'India', '’', 'BJP', 'Modi', 'May'

In [9]:
# Lets look at the Frequency Distribution
# Storing the key entities

key_entities_freq=[nltk.FreqDist(i) for i in key_entities]

# Storing the keys
keys0=[]

for j in key_entities_freq:
    keys1=[]
    for k in j:
        keys0.append(k)
        
    
    
len(keys0) # 113
print(keys0)

['Narendra', 'Modi', 'Damodardas', 'September', 'Vadnagar', 'India', 'Bharatiya', 'Janata', 'Party', 'BJP', 'Lok', 'Sabha', 'Indian', 'Prior', 'Gujarat', 'Modi', 'Gujarat', 'M.A', 'University', 'Ahmadabad', 'Rashtriya', 'Swayamsevak', 'Sangh', 'RSS', '’', 'Akhil', 'Bharatiya', 'Vidyarthi', 'Parishad', 'Modi', 'BJP', 'Gujarat', '’', 'March', 'India', 'September', 'Modi', 'BJP', '’', 'New', 'Delhi', 'October', 'Gujarat', 'Keshubhai', 'Patel', 'Bhuj', 'February', 'Modi', '’', 'Gujarat', 'Muslims', 'Hindu', 'Godhra', 'United', 'Kingdom', 'Lashkar-e-Taiba', 'Mumbai', 'Modi', '’', 'Gujarat', 'BJP', 'December', 'Gujarat', 'Modi', '’', 'India', 'June', 'BJP', 'Lok', 'Sabha', 'Modi', 'India', '’', 'BJP', 'May', 'Chinese', 'President', 'Xi', 'Jinping', 'U.S.', 'New', 'York', 'City', 'Pres', 'Barack', 'Obama', 'Modi', 'Hindu', 'Hindus', '“', 'Goods', 'Tax', 'GST', 'GDP', '’', 'BJP', 'Madhya', 'Pradesh', 'Rajasthan', 'Chhattisgarh', 'National', 'Congress', 'Party', 'Modi', '’', 'Jammu', 'Kashmir',

In [10]:
# Storing the Frequency values
val0=[]
key_entities_freq

val0=[list(i.values()) for i in key_entities_freq]
val1=[]

for i in val0:
    for j in i:
        val1.append(j)
    
len(val1) # 113
print(val1)

[2, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 4, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 1, 2, 1, 1, 4, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 3, 1, 1, 3, 4, 1, 1, 1, 1, 1, 4, 3, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 3, 2, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1]


In [12]:
# Storing the paragraph number
indx=[]
for i in val0:
    for j in i:
        indx.append(val0.index(i)+1)
len(indx)  # 113
print(indx)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


In [240]:
# Combining indx, keys0 and val1

df_indx=pd.DataFrame(indx,columns=['Paragraph'])
df_indx['Keywords']=keys0
df_indx['Freq']=val1
df_indx

# remving the punctuations from 'Keywords'
pos=[i not in "’" for i in df_indx['Keywords']]
df_indx_new=df_indx[pos]
df_indx_new.head()

Unnamed: 0,Paragraph,Keywords,Freq
0,1,Narendra,2
1,1,Modi,2
2,1,Damodardas,1
3,1,September,1
4,1,Vadnagar,1


In [413]:
df_indx_new.sort_values(['Paragraph','Freq'],ascending=[True,False]).groupby('Paragraph').head(2)

Unnamed: 0,Paragraph,Keywords,Freq
5,1,India,3
0,1,Narendra,2
23,2,RSS,3
15,2,Modi,2
30,3,BJP,4
29,3,Modi,2
42,4,Gujarat,3
36,4,Modi,2
47,5,Modi,4
53,5,United,2


In [411]:
# Lets look at the words that are common across all the paragraphs
keywrds=df_indx_new.sort_values(['Paragraph','Freq'],ascending=[True,False]).groupby('Paragraph').head(4)
keywrds_df=keywrds.groupby('Keywords').size().to_frame('count').reset_index().sort_values('count',ascending=False)
keywrds_df.head()

# So we can see that Modi word appears in all paragraphs
# It is followed by Gujarat and BJP
# So we can clearly see that the key entity here is Modi

Unnamed: 0,Keywords,count
11,Modi,10
4,Gujarat,6
0,BJP,5
7,India,3
18,United,1


****Key Entities identified are the following <br> - Modi<br> - BJP<br> - Gujarat<br> - India<br>****

# Task 2: For the key entities identify the adjectives used in the text <br>

In [14]:
# Since the key entities have been identified, lets now try to find adjectives for them
# Lets now start with the adjectives used for Modi
import string
from string import punctuation
from itertools import compress



In [15]:
# Lets clean all of l3 from any punctuations
l5=[]
for i in l3:
    for j in i:
        if j[0] in punctuation:
            temp=1
        else:
            l5.append(j)

# l5 contains all the pos tags without any punctuation
print(l5)

[('Narendra', 'NNP'), ('Modi', 'NNP'), ('in', 'IN'), ('full', 'JJ'), ('Narendra', 'NNP'), ('Damodardas', 'NNP'), ('Modi', 'NNP'), ('born', 'JJ'), ('September', 'NNP'), ('17', 'CD'), ('1950', 'CD'), ('Vadnagar', 'NNP'), ('India', 'NNP'), ('Indian', 'JJ'), ('politician', 'NN'), ('and', 'CC'), ('government', 'NN'), ('official', 'NN'), ('who', 'WP'), ('rose', 'VBD'), ('to', 'TO'), ('become', 'VB'), ('a', 'DT'), ('senior', 'JJ'), ('leader', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Bharatiya', 'NNP'), ('Janata', 'NNP'), ('Party', 'NNP'), ('BJP', 'NNP'), ('In', 'IN'), ('2014', 'CD'), ('he', 'PRP'), ('led', 'VBD'), ('his', 'PRP$'), ('party', 'NN'), ('to', 'TO'), ('victory', 'NN'), ('in', 'IN'), ('elections', 'NNS'), ('to', 'TO'), ('the', 'DT'), ('Lok', 'NNP'), ('Sabha', 'NNP'), ('lower', 'JJR'), ('chamber', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Indian', 'NNP'), ('parliament', 'NN'), ('after', 'IN'), ('which', 'WDT'), ('he', 'PRP'), ('was', 'VBD'), ('sworn', 'VBN'), ('in', 'IN'), ('as', 'IN'), ('p

In [16]:
len(l5)

1262

In [438]:
# Storing l5 in tagged
tagged=l5


In [439]:
adjective_tags = ["JJ", "JJR", "JJS"]

#tagged.index(i)
#(i[0]=="Modi") and (tagged.index(i)>=2)

tagged_first=[i[0] for i in tagged]
tagged_first

def duplicates(lst, item):
       return [j for j, x in enumerate(lst) if x == item]
    
indx_pos=duplicates(tagged_first, 'Modi')
print(indx_pos)

#tagged[indx_pos[0]]
# pos contains the positional indices where the word 'Modi' has appeared in tagged

[1, 6, 81, 139, 158, 200, 263, 346, 367, 480, 512, 582, 583, 639, 680, 698, 734, 760, 784, 817, 859, 905, 927, 1159, 1177, 1208, 1242]


In [449]:
# Finding the adjective from Modi-3 to Modi+4 places 
adjective_tags = ["JJ", "JJR", "JJS"]
k=[]
# Combining the words that are [Modi -3,Modi + 4]
for j in indx_pos:
    pos=j
    
    if pos==0:
        l=[(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k.append(l)
        
    elif pos==1:
        l=[(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k.append(l)
     
    elif pos==2:
        l=[(tagged[pos-2][0],tagged[pos-2][1]),(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k.append(l)

    else:
        l=[(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos-2][0],tagged[pos-2][1]),(tagged[pos-3][0],tagged[pos-3][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k.append(l)
    
# Examining first few records
print(k[0:10])    

[[('Narendra', 'NNP'), ('in', 'IN'), ('full', 'JJ'), ('Narendra', 'NNP'), ('Damodardas', 'NNP')], [('Damodardas', 'NNP'), ('Narendra', 'NNP'), ('full', 'JJ'), ('born', 'JJ'), ('September', 'NNP'), ('17', 'CD'), ('1950', 'CD')], [('India', 'NNP'), ('western', 'JJ'), ('in', 'IN'), ('was', 'VBD'), ('raised', 'VBN'), ('in', 'IN'), ('a', 'DT')], [('area', 'NN'), ('his', 'PRP$'), ('in', 'IN'), ('rose', 'VBD'), ('steadily', 'RB'), ('in', 'IN'), ('the', 'DT')], [('career', 'NN'), ('political', 'JJ'), ('subsequent', 'JJ'), ('joined', 'VBD'), ('the', 'DT'), ('BJP', 'NNP'), ('in', 'IN')], [('1990', 'CD'), ('In', 'IN'), ('years', 'NNS'), ('was', 'VBD'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT')], [('1995', 'CD'), ('In', 'IN'), ('1996', 'CD'), ('was', 'VBD'), ('made', 'VBN'), ('the', 'DT'), ('secretary', 'NN')], [('people', 'NNS'), ('20,000', 'CD'), ('than', 'IN'), ('entered', 'VBD'), ('his', 'PRP$'), ('first-ever', 'JJ'), ('electoral', 'JJ')], [('assembly', 'NN'), ('state', 'NN'), ('Gujarat', 'NN

In [450]:
# Extracting the adjectives
adj_jj=[]    
for l in k:
    for m in l:
        if(m[1] in adjective_tags):
            adj_jj.append(m[0])
        


adj_jj_ls=set(adj_jj)
print(adj_jj_ls)

{'first-ever', 'electoral', 'vigorous', 's', 'diplomatic', 'bad', 'political', 'western', 'formidable', 'born', 'significant', 'prime', 'successful', 'subsequent', 'lengthy', 'full'}


In [369]:
prep=[]
for i in tagged:
    if i[1] in ['PRP','PRP$']:
        m=i[0]
        prep.append(m)
set(prep)

# Most probably the prepositions used for Modi are he, His, he, him, himself, his

{'He', 'His', 'he', 'him', 'himself', 'his', 'it', 'its', 'their'}

In [445]:
# Lets try to find out the adjectives related to the pronouns
adjective_tags = ["JJ", "JJR", "JJS"]

tagged_first=[i[0] for i in tagged]
tagged_first

def duplicates(lst, item):
       return [j for j, x in enumerate(lst) if x in item]
    
indx1_pos=duplicates(tagged_first, ['He', 'His', 'he', 'him', 'himself', 'his'])
indx1_pos[0:10]


[33, 35, 53, 65, 92, 105, 137, 147, 154, 168]

In [453]:
# Finding the adjective for preposition-3 to preposition+4 places 
adjective_tags = ["JJ", "JJR", "JJS"]
k1=[]
for j in indx1_pos:
    pos=j
    
    if pos==0:
        l=[(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k1.append(l)
        
    elif pos==1:
        l=[(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k1.append(l)
     
    elif pos==2:
        l=[(tagged[pos-2][0],tagged[pos-2][1]),(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k1.append(l)

    else:
        l=[(tagged[pos-1][0],tagged[pos-1][1]),(tagged[pos-2][0],tagged[pos-2][1]),(tagged[pos-3][0],tagged[pos-3][1]),(tagged[pos+1][0],tagged[pos+1][1]),(tagged[pos+2][0],tagged[pos+2][1]),
      (tagged[pos+3][0],tagged[pos+3][1]),(tagged[pos+4][0],tagged[pos+4][1])]
        k1.append(l)
    
# Examining first few records
print(k1[0:10])

[[('2014', 'CD'), ('In', 'IN'), ('BJP', 'NNP'), ('led', 'VBD'), ('his', 'PRP$'), ('party', 'NN'), ('to', 'TO')], [('led', 'VBD'), ('he', 'PRP'), ('2014', 'CD'), ('party', 'NN'), ('to', 'TO'), ('victory', 'NN'), ('in', 'IN')], [('which', 'WDT'), ('after', 'IN'), ('parliament', 'NN'), ('was', 'VBD'), ('sworn', 'VBN'), ('in', 'IN'), ('as', 'IN')], [('that', 'IN'), ('to', 'TO'), ('Prior', 'NNP'), ('had', 'VBD'), ('served', 'VBN'), ('2001–14', 'CD'), ('as', 'IN')], [('and', 'CC'), ('Gujarat', 'NNP'), ('northern', 'JJ'), ('completed', 'VBD'), ('an', 'DT'), ('M.A', 'NNP'), ('degree', 'NN')], [('Ahmadabad', 'NNP'), ('in', 'IN'), ('University', 'NNP'), ('joined', 'VBD'), ('the', 'DT'), ('pro-Hindu', 'JJ'), ('Rashtriya', 'NNP')], [('in', 'IN'), ('Parishad', 'NNP'), ('Vidyarthi', 'NNP'), ('area', 'NN'), ('Modi', 'NNP'), ('rose', 'VBD'), ('steadily', 'RB')], [('and', 'CC'), ('hierarchy', 'NN'), ('RSS', 'NNP'), ('association', 'NN'), ('with', 'IN'), ('the', 'DT'), ('organization', 'NN')], [('benefi

In [455]:
# Extracting the adjectives
adj_jj1=[]    
for l in k1:
    for m in l:
        if(m[1] in adjective_tags):
            adj_jj1.append(m[0])
        


adj_jj1_ls=set(adj_jj1)
print(adj_jj1_ls)

{'self-promoted', 'able', 'northern', 'political', 'incumbent', 'chief', 'electoral', 'several', 'general', 'instrumental', 'investigative', 'diplomatic', 'responsible', 'first-ever', 'pro-Hindu', 'indispensable', 'subsequent', 'close', 'pragmatic'}


In [456]:
# Combining adj_jj and adj_jj1

final_adj=[list(adj_jj_ls),list(adj_jj1_ls)]
print(final_adj)


[['first-ever', 'electoral', 'vigorous', 's', 'diplomatic', 'bad', 'political', 'western', 'formidable', 'born', 'significant', 'prime', 'successful', 'subsequent', 'lengthy', 'full'], ['self-promoted', 'able', 'northern', 'political', 'incumbent', 'chief', 'electoral', 'several', 'general', 'instrumental', 'investigative', 'diplomatic', 'responsible', 'first-ever', 'pro-Hindu', 'indispensable', 'subsequent', 'close', 'pragmatic']]


***The following task can be tried  as part of the learning***

In [None]:

# Task 3: What are different prepositions used for key entities. What can be inferred from the them
# Task 4: Conditional freq distribution of key entities depending upon a condition