# Extractive Summarisation

In [1]:
# import libraries
from collections import Counter
from itertools import combinations
from math import sqrt
import matplotlib.pyplot as plt
import networkx as nx
from nltk import word_tokenize, sent_tokenize, FreqDist,pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import RegexpTokenizer
from operator import itemgetter
import re
%matplotlib inline

In [2]:
# Convergence threshold is the maximum error in score convergence of TextRank
CONVERGENCE_THRESHOLD = 0.0001

In [3]:
# set of all nouns
NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

In [4]:
class Document():
    '''
    The master class for our Document Summerization module.
    Incorporates all features related to Document
    '''
    
    def __init__(self, document):
        self.document = document
        #self.sents = sent_tokenize(self.document)
        self.sents = self.document.split('\n')
        self.word_freq = FreqDist(clean(self.document))
        self.graph = None
        self.params = { 'thresh': 0.0 }
                
    def __str__(self):
        return self.document
    
    
    def statistical_sim(self, sent1, sent2):
        '''
        Statistical similarity between sentences
        based on the cosine method
        Returns: float (the cosine similarity b/w sent1 and sent2)
        '''
        sent_token1 = Counter(sent1)
        sent_token2 = Counter(sent2)
        
        intxn = set(sent_token1) & set(sent_token2)
        numerator = sum([sent_token1[x] * sent_token2[x] for x in intxn])
        
        mod1 = sum([sent_token1[x]**2 for x in sent_token1.keys()])
        mod2 = sum([sent_token2[x]**2 for x in sent_token2.keys()])
        denominator = sqrt(mod1)*sqrt(mod2)
        
        if not denominator:
            return 0.0

        return float(numerator)/denominator
    
    
    def semantic_sim(self, sent1, sent2):
        '''
        A semantic similarity score between two sentences
        based on WordNet
        Returns: float (the semantic similarity measure)
        '''
        score = 0
        sent1 = [word for word in sent1 if word in NOUNS]
        sent2 = [word for word in sent2 if word in NOUNS]
        for t1 in sent1:
            for t2 in sent2:
                score += semantic_score(t1,t2)
        try:
            return score/(len(sent1 + sent2))  
        except:
            return 10000
    
    
    def construct_graph(self):
        '''
        Constructs the word similarity graph
        '''
        connected = []
        for pair in combinations(self.sents, 2):
            cpair = clean(pair[0]), clean(pair[1])
            weight = self.statistical_sim(*cpair) + \
                     self.semantic_sim(*cpair)
            connected.append((pair[0], pair[1], weight))
        self.graph = draw_graph(connected, self.params['thresh'])    

In [5]:
# Utility functions
def clean(sent):
    '''
    A utility function that returns a a list of words in a sentence
    after cleaning it. Gets rid off uppper-case, punctuations, 
    stop words, etc.
    Returns: list (a list of cleaned words in sentence)
    '''
    words =  sent.lower() 
    words = re.findall(r'\w+', words,flags = re.UNICODE | re.LOCALE) 
    imp_words = filter(lambda x: x not in stopwords.words('english'), words)
    return imp_words
        
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet
    Returns: float (the semantic score between word1 and word2)
    '''
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        return wn.path_similarity(w1,w2,simulate_root = False)
    except:
        return 0
    
def draw_graph(connected, thresh):
    '''
    Draws graph as per weights and puts edges if 
    weight exceed the given thresh
    Returns: networkx Graph (nodes are sentences and edges
             are statistical and semantic relationships)
    '''
    nodes = set([n1 for n1, n2, n3 in connected] + \
                [n2 for n1, n2, n3 in connected])
    G=nx.Graph()
    for node in nodes:
        G.add_node(node)
    for edge in connected:
        if edge[2] > thresh:
            G.add_edge(edge[0], edge[1],weight = edge[2])
    #plt.figure(figsize=(8,8))
    #pos = nx.spring_layout(G)
    #nx.draw(G,node_color='#A0CBE2', edge_color='orange',width=1,with_labels=False)
    #plt.show()
    return G
    
def textrank_weighted(graph, initial_value=None, damping=0.85):
    '''
    Calculates PageRank for an undirected graph
    Returns: A list of tuples representing sentences and respective
    scores in descending order
    '''
    if initial_value == None:
        initial_value = 1.0 / len(graph.nodes())
    scores = dict.fromkeys(graph.nodes(), initial_value)

    iteration_quantity = 0
    for iteration_number in xrange(100):
        iteration_quantity += 1
        convergence_achieved = 0
        for i in graph.nodes():
            rank = 1 - damping
            for j in graph.neighbors(i):
                neighbors_sum = sum([graph.get_edge_data(j, k)['weight'] for k in graph.neighbors(j)])
                rank += damping * scores[j] * graph.get_edge_data(j, i)['weight'] / neighbors_sum

            if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
                convergence_achieved += 1

            scores[i] = rank

        if convergence_achieved == len(graph.nodes()):
            break
    return sorted(scores.items(), key=itemgetter(1), reverse=True)

# Twitter OAuth

In [6]:
import tweepy
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener
import re
import csv as csv
from time import strftime
import pickle

In [7]:
def tweet_cleaner(tweet):
    ''' Cleans the tweet by removing hyperlinks
        and other unnecessary strings
    '''
    return ' '.join(re.sub("(RT @[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|@[A-Za-z0-9]+"," ",tweet).split())

In [8]:
def find_id(tweets,tag,tweet):
    ''' Finds a tweet corresponding to
        a given id. The id is obtained
        while extracting the tweets '''
    for t in tweets[tag]:
        if tweet == (t[0]):
            return t[1]
    return None

In [9]:
def get_trends(trend):
    ''' Returns a list of top 50 trending
        hashtags from US region '''
    return [(i['name']) for i in trend[0]['trends']]

In [10]:
def get_tweets(hashtags = [],nb_tweets = 10):
    ''' Get nb_tweets from the list hashtags.
        The tweets are returned as a dictionary
        of list where each list corresponds to
        a list of tweets of value hashtag '''
    tweets = {}
    for tag in hashtags[0:10]:
        Tweets = tweepy.Cursor(api.search, q=tag).items(10)
        text = [(tweet_cleaner((tweet.text).encode('ascii','ignore')) + '.',tweet.id) for tweet in Tweets]
        tweets[tag] = text
    return tweets

In [11]:
def get_top_tweets(tweets = {},top = 3):
    ''' Takes in a dictionary of lists of tweets.
        Applies extractive summarisation over
        each list of tweets and finds the top 
        tweets. The top tweets are returned as
        a dictionary of lists. '''
    extract_tweets = {}
    for tag in tweets:
        temp = [tweet[0] for tweet in tweets[tag]]
        string = '\n'.join(temp)
        a = Document(string)
        a.construct_graph()
        x = textrank_weighted(a.graph)
        extract_tweets[tag] = [(i[0],find_id(tweets,tag,i[0])) for i in x[:top]]
    return extract_tweets

In [34]:
def save_csv(extract = {},final = {}):
    filename = strftime("%m-%d-%Y.csv")
    print "Saving the data to filename: "+str(filename)
    with open(filename,'wb') as fp:
        p = csv.writer(fp)
        p.writerow(['hashtag','tweets','main tweet'])
        for tag in extract:
            p.writerow([tag,new_extract[tag],final[tag]])

In [39]:
def convert_pickel(filename):
    ''' Converts the csv file generated above to
        the pickel format. 
        Returns None '''
    news = []
    headlines = []
    with open(filename+".csv","rb") as fp:
            reader = csv.DictReader(fp)
            for line in reader:
                text = ''
                l = line['tweets'][1:-1].split(',')
                for i in range(len(l)):
                    l[i] = l[i].strip()
                    l[i] = l[i].strip('()\'')
                for i in range(0,len(l),2):
                    text+=l[i]
                news.append(text)
                l = line['main tweet'][1:-1].split(',')
                for i in range(len(l)):
                    l[i] = l[i].strip()
                    l[i] = l[i].strip('()\'')
                headlines.append(l[0])
    print "Saving to the pickel file named: "+str(filename)+".pkl"
    pickle.dump((headlines,news),open(filename+".pkl","wb"))

In [14]:
def display(tweets = {}):
    ''' Utility function to display the 
        tweets present in a dictionary
        of list tweets '''
    for tag in tweets:
        print tag
        print '----------------------------------------------------------'
        for tweet in tweets[tag]:
            print tweet
        print '==========================================================='

In [15]:
# Najeeb Khan Credentials
consumer_key = '3cFJ5hLMswDJOwXfaJGS4eNyS'
consumer_secret = 'GRn1itnKbrCdAvHhbpjFgzWXMHePNuhDYz5scjnyhD8fC3Bnqg'
access_token = '277893116-tcAWnc62SVdSBLUIqF5R5h92qm2Y0epfQUQJNa4l'
access_secret = 'S47f26iaI7L0Z5AyT2NQqtsmitrMn70nZG2H5n5dyqM4C'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

In [16]:
api = tweepy.API(auth)
trend = api.trends_place(23424977)

In [17]:
hashtags = get_trends(trend)

In [18]:
tweets = get_tweets(hashtags,10)

In [19]:
display(tweets)

#growingupthick
----------------------------------------------------------
('growingupthick when you sleep over a family members house amp no one can let you borrow jeans cause your considered fat.', 749190903546413056)
('growingupthick when all your friends are skinny but you re there like a fat potato.', 749190901415587840)
('growingupthick when you re cold putting you hands between your thighs because it s warm af.', 749190899335213056)
('growingupthick not being able to wear shorts cause it be so hot you get a rash between your thighs from rubbing together so.', 749190899264028672)
('growingupthick when your thigh coming out your ripped jeans.', 749190899192573952)
('growingupthick not being able to wear shorts cause it be so hot you get a rash between your thighs from rubbing together so.', 749190895929405440)
('chaan growingupthick getting thigh rashes burns when wearing shorts dresses etc bc your thighs rub together ALL THE TIME.', 749190893056368640)
('growingupthick when you h

In [20]:
extract = get_top_tweets(tweets,3)

In [21]:
display(extract)

#growingupthick
----------------------------------------------------------
('growingupthick not being able to wear shorts cause it be so hot you get a rash between your thighs from rubbing together so.', 749190899264028672)
('growingupthick when you sleep over a family members house amp no one can let you borrow jeans cause your considered fat.', 749190903546413056)
('chaan growingupthick getting thigh rashes burns when wearing shorts dresses etc bc your thighs rub together ALL THE TIME.', 749190893056368640)
Joe Johnson
----------------------------------------------------------
('Joe Johnson is heading to the Jazz on a 2 year 22 million deal.', 749190926808059904)
('According to our daldridgetnt Joe Johnson has agreed to a 2 year 22M deal with the Utah Jazz.', 749190686767886336)
('Joe Johnson nuevo jugador de Utah Jazz firma por 2 aos y 22M.', 749190831802818560)
Germany vs Italy
----------------------------------------------------------
('Betfred betting specials for Euro 2016 conti

In [22]:
final = get_top_tweets(extract,1)

In [23]:
display(final)

#growingupthick
----------------------------------------------------------
('growingupthick not being able to wear shorts cause it be so hot you get a rash between your thighs from rubbing together so.', 749190899264028672)
Joe Johnson
----------------------------------------------------------
('According to our daldridgetnt Joe Johnson has agreed to a 2 year 22M deal with the Utah Jazz.', 749190686767886336)
Germany vs Italy
----------------------------------------------------------
('This tag can be defined as BBC defence vs German mid fielders Hoping Germany to make a win today Go Germany go Italy or Germany.', 749190980302041089)
All 20
----------------------------------------------------------
('20 I M A MORNING PERSON PPL I m not an anything person Y do u feel the need to wake people up to do morning ppl t.', 749190913335955456)
#ausvotes
----------------------------------------------------------
('Yes it is as we thought folks looks like LNP will again blame ALP if they lose the

In [24]:
''' Removing the main tweet from the summarised tweet
    so as to facilitate the learning of the neural
    network. Otherwise the network may learn to pick
    up the main tweet as it is '''
new_extract = extract.copy()
for tag in extract:
    new_extract[tag].remove(final[tag][0])

In [36]:
save_csv(new_extract,final)

Saving the data to filename: 07-02-2016.csv


In [41]:
filename = strftime("%m-%d-%Y")
convert_pickel(filename)

Saving to the pickel file named: 07-02-2016.pkl


In [42]:
X,y = pickle.load(open(filename+".pkl","rb"))

In [43]:
for i in range(len(X)):
    print "H: "+str(X[i])
    print "T: "+str(y[i])
    print "-----------------------------------------------------"

H: growingupthick not being able to wear shorts cause it be so hot you get a rash between your thighs from rubbing together so.
T: growingupthick when you sleep over a family members house amp no one can let you borrow jeans cause your considered fat.chaan growingupthick getting thigh rashes burns when wearing shorts dresses etc bc your thighs rub together ALL THE TIME.
-----------------------------------------------------
H: According to our daldridgetnt Joe Johnson has agreed to a 2 year 22M deal with the Utah Jazz.
T: Joe Johnson is heading to the Jazz on a 2 year 22 million deal.Joe Johnson nuevo jugador de Utah Jazz firma por 2 aos y 22M.
-----------------------------------------------------
H: This tag can be defined as BBC defence vs German mid fielders Hoping Germany to make a win today Go Germany go Italy or Germany.
T: Betfred betting specials for Euro 2016 continue tonight with Germany vs Italy Details.EURO16onSonyLIV So to whom u are going to cheer in today s Italy Vs Germa