# EXTRACTIVE SUMMARIZATION
# Sentence Scoring

In [1]:
from nltk.corpus import stopwords
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt

In [2]:
## Stopwords removal using the stopwords corpus from nltk
## Counting the occurrences of each word in the text with a dictionary
## Tokens come from splitting every line on single spaces and stripping the
## surrounding whitespace; each token that is not an English stop word is counted
stop_words = set(stopwords.words('english'))    ## Loaded once: the corpus lookup does not change per word
dic = dict()
count = 0    ## Number of words (excluding stop words) in the text
with open("cnn","r") as text:
    for line in text:
        for word in line.split(' '):
            word = word.strip()
            if word not in stop_words:
                count = count + 1
                dic[word] = dic.get(word, 0) + 1
## Term frequency of each word is its frequency divided by the total number of counted words
## The raw counts in dic are replaced in place by these term frequencies
## Printing each word in the text and its term frequency
for key in dic:
    dic[key] = float(dic[key])/count    ## Term frequency
    print(key + " : " + str(dic[key]))

soon : 0.00617283950617
violent : 0.00617283950617
behind : 0.00617283950617
staff : 0.00617283950617
police : 0.00617283950617
workers : 0.037037037037
pump : 0.00617283950617
thursday : 0.00617283950617
labor : 0.00617283950617
clashes : 0.00617283950617
parts : 0.00617283950617
fuel : 0.00617283950617
local : 0.00617283950617
easier : 0.00617283950617
hope : 0.00617283950617
stocked : 0.00617283950617
government : 0.00617283950617
familiar : 0.00617283950617
stop : 0.00617283950617
refineries : 0.00617283950617
words : 0.00617283950617
world : 0.00617283950617
governments : 0.0123456790123
prime : 0.00617283950617
tear : 0.00617283950617
companies : 0.0123456790123
worried : 0.00617283950617
leave : 0.00617283950617
intensification : 0.00617283950617
withdrawal : 0.00617283950617
difficult : 0.00617283950617
people : 0.0123456790123
proposals : 0.00617283950617
back : 0.00617283950617
tires : 0.00617283950617
culture : 0.00617283950617
best : 0.00617283950617
might : 0.0061728395061

In [3]:
## Calculating the term-frequency score of each sentence (one sentence per line of the file)
## tf is a list: tf[i] is the sum of the term frequencies (taken from dic) of the
## non-stop-word tokens of the i-th line
stop_words = set(stopwords.words('english'))    ## Loaded once instead of once per word
tf = list()
with open("cnn","r") as text:
    for line in text:
        score = 0
        for word in line.split(' '):
            word = word.strip()
            if word not in stop_words:
                score = score + dic[word]
        ## Drop the trailing newline so the score is printed on the same line as its sentence
        print(line.rstrip("\n") + " : " + str(score))
        tf.append(score)
            

burning tires tear gas and clashes with riot police the ugly scenes that come with workers strikes are all too familiar in france a country constantly trying to balance its culture of workers rights with a more efficient economy
 : 0.234567901235
such scenes are being played out across the country friday as unions have called for workers to step up protests that have for the past week crippled parts of france
 : 0.185185185185
employees of oil refineries nuclear power plants and some public transportation have left one in three gas stations dry forcing vehicles to search for well stocked stations and causing long lines at the pump
 : 0.179012345679
people are now hoarding gas worried that it may be some time until supply levels are back to normal
 : 0.0864197530864
the workers are protesting a labor reform bill put forward by the government that will make it easier for companies to hire and fire employees
 : 0.148148148148
the governments argument is that the strict laws that make fren

# Sentence Clustering

In [4]:
## STATISTICAL SIMILARITY
def statistical(si,sj):
    """Return the TF-IDF cosine similarity of two sentences.

    Each sentence is lower-cased, stripped of punctuation, tokenized with
    nltk and Porter-stemmed; English stop words are removed by the
    vectorizer. The two sentences form a two-document TF-IDF matrix, which
    is multiplied by its own transpose; entry [0, 1] of that product is the
    similarity between the first and the second sentence.

    si, sj -- the two sentences to compare (text strings)

    Any error raised by the vectorizer (for example when no token of either
    sentence survives normalization) is propagated unchanged.
    """
    ## Stemming the strings so that they do not contain any verb forms
    stemmer = nltk.stem.porter.PorterStemmer()
    ## Maps the code point of every punctuation character to None (i.e. delete it)
    remove_punctuation_map = dict((ord(char),None) for char in string.punctuation)
    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]
    ## Normalizing the text i.e. removing both the verb forms and also the punctuation if any
    def normalize(text):
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
    ## Vectorizing the string inputs to be used to calculate the cosine similarity
    vectorizer = TfidfVectorizer(tokenizer = normalize,stop_words = 'english')
    tfidf = vectorizer.fit_transform([si,sj])
    ## Calculating the cosine similarity
    ## .toarray() is used instead of the .A shorthand, which newer SciPy releases no longer provide
    return (tfidf * tfidf.T).toarray()[0,1]
statistical("Hello my name is Najeeb","Hello my name is Yasir")

0.3360969272762574

In [5]:
## SEMANTIC SIMILARITY
## Calculating the semantic similarity between two sentences using WordNet
def semantic_score(word1, word2):
    """Return the WordNet path similarity of two words taken as nouns.

    Each word is looked up as its first noun synset ('<word>.n.01') and the
    path similarity between the two synsets is returned.

    word1, word2 -- the words to compare

    Returns 0 when either lookup or the comparison fails, or when WordNet
    reports no similarity value (None) for the pair.
    """
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        similarity = w1.path_similarity(w2)
    except Exception:
        ## Best effort: a word without a usable noun synset simply scores 0.
        ## Exception (not a bare except) so Ctrl-C / SystemExit still get through.
        return 0
    ## A None result would break numeric accumulation in the caller
    if similarity is None:
        return 0
    return similarity
## Calculate the semantic score of two sentences
## By default the score is the plain sum over all noun pairs; pass normalize=True
## to divide that sum by the total number of comparisons
def sentence_semantic(string1, string2, normalize=False):
    """Return the summed WordNet similarity between the nouns of two sentences.

    Both sentences are tokenized and part-of-speech tagged with nltk. Every
    token tagged 'NN' in string1 is compared with every token tagged 'NN' in
    string2 through semantic_score, and the pair scores are added up.

    string1, string2 -- the sentences to compare
    normalize        -- when True, divide the sum by the number of noun
                        pairs compared; the default (False) returns the
                        raw sum

    Returns 0 when no pair of 'NN' tokens exists, whatever normalize is.
    """
    tagged1 = nltk.pos_tag(nltk.word_tokenize(string1))
    ## Tagged once up front rather than once per token of string1
    tagged2 = nltk.pos_tag(nltk.word_tokenize(string2))
    nouns1 = [word for word, tag in tagged1 if tag == 'NN']
    nouns2 = [word for word, tag in tagged2 if tag == 'NN']
    score = 0
    for noun1 in nouns1:
        for noun2 in nouns2:
            score += semantic_score(noun1, noun2)
    count = len(nouns1) * len(nouns2)    ## Number of comparisons made
    if normalize and count:
        return float(score)/count
    return score
print(sentence_semantic('this is a gorilla eating a fruit', 'this is a cat drinking milk'))

0.427457264957


In [6]:
## COREFERENCE RESOLUTION
##TODO

In [7]:
## DISCOURSE RELATIONS
##TODO

In [9]:
## GRAPH IMPLEMENTATION OF THE TEXT
## Using the NetworkX library: every non-blank line of the document becomes a node,
## and an edge joins two nodes when their statistical similarity or their
## semantic similarity exceeds the matching threshold below
## NOTE(review): statistical() returns a cosine similarity, which does not exceed 1,
## so a threshold of 1 means the statistical test adds no edges -- confirm the intended cut-off
STATISTICAL_THRESHOLD = 1
SEMANTIC_THRESHOLD = 0.8
G = nx.Graph()
filename = raw_input("Enter the name of the document: ")
with open(filename,"r") as text:
    for line in text:
        sentence = line.strip()    ## Node labels without the trailing newline
        if sentence:               ## Blank lines are not sentences
            G.add_node(sentence)
## The graph is undirected, so each unordered pair of sentences is evaluated once
## (presumably both measures are order-independent; the original evaluated both orders)
sentences = list(G.nodes())
for i, nodei in enumerate(sentences):
    for nodej in sentences[i + 1:]:
        if (statistical(nodei,nodej) > STATISTICAL_THRESHOLD
                or sentence_semantic(nodei,nodej) > SEMANTIC_THRESHOLD):
            G.add_edge(nodei,nodej)

## len() returns an int, so it has to be converted before string concatenation
print("Number of edges in the graph" + " : " + str(len(G.edges())))
nx.draw_circular(G)
plt.show()

Enter the name of the document: cnn
44
