# Article spinner

In [1]:
from bs4 import BeautifulSoup
import numpy as np

We'll use positive electronics reviews as our dataset. Each review is parsed into trigrams, from which we compute the probability of the middle word given the previous and the next word in the review.

In [2]:
# Read the raw review file inside a context manager so the handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed (and warns about it); pin one explicitly once the intended parser
# is confirmed.
with open('../data/sorted_data_acl/electronics/positive.review') as review_file:
    soup = BeautifulSoup(review_file.read())

# Keep only the text of every <review_text> element.
reviews = [tag.text for tag in soup.find_all('review_text')]
len(reviews)

1000

A very lightweight cleanup step that removes characters that are not helpful for our model.

In [3]:
import re

def clean_text(t):
    """Normalise a raw review into lower-cased, space-separated tokens.

    The text is split on "; ", ", ", "*", newline, "/" and "_". Each piece
    then has any leading/trailing ",.:-_" characters stripped and is
    lower-cased. Pieces that end up empty are dropped and the remainder is
    joined with a single space.

    Args:
        t: Raw review text.

    Returns:
        The cleaned text as a single string ("" if nothing survives).
    """
    # Raw string: "\*" inside a normal literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions). The regex engine itself
    # interprets "\n" as a newline, so the pattern matches the same text.
    pieces = re.split(r'; |, |\*|\n|/|_', t)
    stripped = (piece.strip(',.:-_').lower() for piece in pieces)
    return ' '.join(piece for piece in stripped if piece)

# Clean every review and tokenise it on single spaces, so each review becomes
# a list of words.
# NOTE: this rebinds `reviews` from strings to token lists; re-running the cell
# without re-running the loading cell fails, because clean_text expects a string.
reviews = [clean_text(raw_review).split(' ') for raw_review in reviews]

First, we count the occurrences of each word given the previous and the next word in the sentence. Then we convert those counts into conditional probabilities.

In [4]:
def get_count_trigrams(reviews):
    """Count middle words for every (previous word, next word) context.

    Args:
        reviews: Iterable of tokenised reviews, each a list of words.

    Returns:
        A dict mapping ``(previous, next)`` tuples to a dict of
        ``{middle_word: count}``. Reviews with fewer than three tokens
        contribute nothing.
    """
    counts = {}
    for tokens in reviews:
        # Slide a three-word window over the review.
        for previous, middle, following in zip(tokens, tokens[1:], tokens[2:]):
            middle_counts = counts.setdefault((previous, following), {})
            middle_counts[middle] = middle_counts.get(middle, 0) + 1
    return counts

def get_probability_trigrams(reviews):
    """Turn trigram counts into conditional probabilities.

    Args:
        reviews: Iterable of tokenised reviews, passed straight to
            ``get_count_trigrams``.

    Returns:
        A dict mapping ``(previous, next)`` tuples to a dict of
        ``{middle_word: probability}``, where each count is divided by the
        total count observed for that context.
    """
    probabilities = {}
    for context, middle_counts in get_count_trigrams(reviews).items():
        total = sum(middle_counts.values())
        probabilities[context] = {
            word: count / total for word, count in middle_counts.items()
        }
    return probabilities

The `pick_word` function returns a word sampled according to the given probabilities.

In [5]:
# Build the table of P(middle word | previous word, next word) from the
# tokenised reviews.
trigrams = get_probability_trigrams(reviews)

# Inspect the probabilities stored for the context "i ___ this".
trigrams['i', 'this'].values()

dict_values([0.12269938650306748, 0.3067484662576687, 0.006134969325153374, 0.012269938650306749, 0.018404907975460124, 0.006134969325153374, 0.05521472392638037, 0.006134969325153374, 0.05521472392638037, 0.03680981595092025, 0.012269938650306749, 0.006134969325153374, 0.006134969325153374, 0.024539877300613498, 0.006134969325153374, 0.024539877300613498, 0.006134969325153374, 0.012269938650306749, 0.05521472392638037, 0.018404907975460124, 0.024539877300613498, 0.006134969325153374, 0.024539877300613498, 0.006134969325153374, 0.012269938650306749, 0.006134969325153374, 0.012269938650306749, 0.006134969325153374, 0.012269938650306749, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374, 0.006134969325153374])

In [6]:
def pick_word(word_probs):
    """Draw one word at random, weighted by its probability.

    Args:
        word_probs: Dict mapping candidate words to their probabilities;
            the values are handed to ``np.random.choice`` as ``p``.

    Returns:
        The sampled word, as returned by ``np.random.choice``.
    """
    candidates = list(word_probs)
    weights = [word_probs[word] for word in candidates]
    sampled = np.random.choice(candidates, size=1, p=weights)
    return sampled[0]

In [7]:
# Sanity check: the probabilities for the context "i ___ this" should sum to
# roughly 1 (up to floating-point error); also draw one sample for that context.
sum(trigrams['i', 'this'].values()), pick_word(trigrams['i', 'this'])

(0.9999999999999993, 'purchased')

We can now 'spin' an article. Words will be replaced randomly with respect to the probability of their occurrence given the previous and the next word.

In [8]:
def spin(article, probabilities=None, replace_prob=0.5):
    """Randomly replace interior words of an article using trigram probabilities.

    The article is lower-cased and split on single spaces. Every word that has
    both a left and a right neighbour is considered in order; with probability
    ``replace_prob`` it is swapped for a word sampled by ``pick_word`` from the
    distribution stored for its ``(previous, next)`` context. Contexts missing
    from the table leave the word untouched. Replacements are applied in
    place, so a replaced word becomes the "previous" word of the next position.

    Args:
        article: Text to spin.
        probabilities: Dict mapping ``(previous, next)`` tuples to
            ``{word: probability}`` dicts. Defaults to the notebook-level
            ``trigrams`` table.
        replace_prob: Chance of attempting a replacement at each position
            (default 0.5, the value previously hard-coded).

    Returns:
        The spun article as a single space-joined string.
    """
    if probabilities is None:
        # Fall back to the module-level table, as the original version did.
        probabilities = trigrams

    words = article.lower().split(' ')
    # Candidate positions are 1 .. len(words) - 2 inclusive, matching the
    # positions counted in get_count_trigrams. The previous upper bound of
    # len(words) - 2 silently skipped the second-to-last word.
    for i in range(1, len(words) - 1):
        if np.random.random() < replace_prob:
            context = (words[i - 1], words[i + 1])
            if context in probabilities:
                words[i] = pick_word(probabilities[context])

    return ' '.join(words)

In [9]:
# Pick one tokenised review at random by index. np.random.choice(reviews)
# would first try to convert the ragged list of token lists into an array,
# which recent NumPy versions reject with a ValueError.
article = ' '.join(reviews[np.random.randint(len(reviews))])
new_review = spin(article)

# Show the original review next to its spun version.
article, new_review

