In [1]:
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from textacy.vsm import Vectorizer

import pprint
pp = pprint.PrettyPrinter()

Loading data

In [2]:
# Read data from file
tweets = pd.read_csv('tweets.txt', sep='\t',
                        names=['id', 'topic', 'sentiment', 'body'])  # Change here to your own column name
tweets.shape

(80, 4)

Describing data

In [3]:
tweets.head()

Unnamed: 0,id,topic,sentiment,body
0,628949369883000832,@microsoft,negative,dear @Microsoft the newOoffice for Mac is grea...
1,628976607420645377,@microsoft,negative,@Microsoft how about you make a system that do...
2,629023169169518592,@microsoft,negative,I may be ignorant on this issue but... should ...
3,629179223232479232,@microsoft,negative,"Thanks to @microsoft, I just may be switching ..."
4,629226490152914944,@microsoft,positive,"Microsoft, I may not prefer your gaming branch..."


## Preprocessing

At beginning, sentence level tokenization should be done since our task objective is sentence extraction.

The second step is cleaning data. Intuitively, those string like url, '@...', hashtag and punctuation seldom contribute to the importance of sentence. In addition, stopwords are usually regarded as noises in most of the NLP tasks. These things should be removed.

Finally, a filtered word list following the original sentence sequence is created for tf-idf value calculation.

In [4]:
def preprocess(data):
    '''
    Clean input data and tokenize them for tf-idf calculation.

    :param data: input tweets
    :return: sents, a tokenized sentences list.
            filtered_words, a tokenized sentences list whose element is cleaned word list(e.g. [['microsoft', 'vista'], ...]）
    '''
    sents = [sent for tweet in data.body.tolist() for sent in nltk.sent_tokenize(tweet)]  # Tokenize sentence
    cleaned_sents = sents.copy()  # Prepare a copy for cleaning
    cleaned_sents = list(map(lambda x: re.sub('http\S+', '', x), cleaned_sents))  # Removing URLS
    cleaned_sents = list(map(lambda x: re.sub('(\s)@\w+', '', x), cleaned_sents))  # Removing @...
    cleaned_sents = list(map(lambda x: re.sub('#', '', x), cleaned_sents))  # Removing hashtags
    cleaned_sents = list(map(lambda x: x.translate(str.maketrans('', '', string.punctuation)), cleaned_sents))  # Removing puntuation
    # Tokenization and Removing stopwords
    tokenizer = TweetTokenizer()
    filtered_words = [[word.lower() for word in tokenizer.tokenize(sent) if word.lower() not in stopwords.words('english')]
        for sent in cleaned_sents]

    return sents, filtered_words

In [5]:
sents, filtered_words = preprocess(tweets.copy())
print('Number of sentences:' + str(len(sents)))
pp.pprint(sents[:5])
print('------------------------------------------------------------------------------------')
print('Number of unique words after filtering:' + str(len(set([token for sent in filtered_words for token in sent]))))
pp.pprint(filtered_words[:5])

Number of sentences:158
['dear @Microsoft the newOoffice for Mac is great and all, but no Lync update?',
 "C'mon.",
 "@Microsoft how about you make a system that doesn't eat my friggin discs.",
 'This is the 2nd time this has happened and I am so sick of it!',
 "I may be ignorant on this issue but... should we celebrate @Microsoft's "
 'parental leave changes?']
------------------------------------------------------------------------------------
Number of unique words after filtering:591
[['dear', 'newooffice', 'mac', 'great', 'lync', 'update'],
 ['cmon'],
 ['microsoft', 'make', 'system', 'doesnt', 'eat', 'friggin', 'discs'],
 ['2nd', 'time', 'happened', 'sick'],
 ['may', 'ignorant', 'issue', 'celebrates', 'parental', 'leave', 'changes']]


## Caculating tf-idf value

I use [textacy](https://textacy.readthedocs.io/en/latest/index.html), a higher-level NLP built on spaCy, to calculate  tf-idf score. Since I only care about the meaningful token of each sentence, I apply this technique on the filtered word list created from previous step.

In [6]:
def tfidf(data_tokenized):
    '''
    Caculate tf-idf matrix.

    :param data_tokenized: A sequence of tokenized documents, where each document is a sequence of (str) terms.
    :return: vectorizer, instance of textacy.vsm.Vectorizer.
            calculate , tf-idf matrix whose row is document, column is term
    '''
    vectorizer = Vectorizer(weighting='tfidf')
    term_matrix = vectorizer.fit_transform(data_tokenized).todense()  # dense matrix means most of the elements are nonzero

    return vectorizer, term_matrix

In [7]:
vectorizer, term_matrix = tfidf(filtered_words)
term_matrix.shape

(158, 591)

As I mention in the code comment, the return term_matrix is a term‐document matrix which is also called "bag‐of‐words". At this case, the term_matrix contains 158 documents and 591 terms which is corresponding to the number of sentences and number of unique words after filtering we create in the Preprocessing step.

## Extract most representative sentences as summarization

Since tweet is short, some widely-used techniques like position weights and biased heading weights are not consider suitable for the task. At current stage, the sum score of each word in sentence is used to rank sentences.

In [8]:
def rank_sentences(sents, filtered_words, vectorizer, term_matrix, top_n=3):
    '''
    Select top n important sentence.

    :param sents: a list containing sentences.
    :param filtered_words: a tokenized sentences list whose element is word list
    :param vectorizer: instance of textacy.vsm.Vectorizer
    :param term_matrix: tf-idf matrix whose row is document, column is term
    :param top_n: the selecting number
    :return: a list containing top n important sentences
    '''
    tfidf_sent = [[term_matrix[index, vectorizer.vocabulary[token]] for token in sent] for index, sent in
                  enumerate(filtered_words)]  # Get tfidf value for noun word in each sentence
    sent_values = [sum(sent) for sent in tfidf_sent]  # Caculate whole tfidf weights for each sentence
    ranked_sent = sorted(zip(sents, sent_values), key=lambda x: x[1], reverse=True)  # Sort sentence at descending order

    return [sent[0] for sent in ranked_sent[:top_n]]

In [9]:
top_n = 3  # Select top n importance sentences
best_sents = rank_sentences(sents, filtered_words, vectorizer, term_matrix, top_n)

In [10]:
pp.pprint(best_sents)

["@eyesonfoxorg @Microsoft I'm still using Vista on one &amp; Win-7 on "
 'another, Vista is a dinosaur, unfortunately I may use a free 10 with limits',
 'W/ all the $$$ and drones U have working 4 U, maybe U guys could get it '
 'right the 1st time?',
 "@Lumia #Lumia @Microsoft 2nd, you guys haven't released a lumia that has a "
 'QHD screen, or takes video in 2k resolution yet.']
