### Parse sentences as `<role>, <action>, <benefits>`

In [1]:
import re
import pprint

class US:
  """A parsed user story: the raw sentence plus its role, action and benefit."""

  def __init__(self, raw, role, action, benefit):
    # Assigned in this order so the dict-based repr lists the fields as
    # raw, role, action, benefit.
    self.raw, self.role = raw, role
    self.action, self.benefit = action, benefit

  def __repr__(self):
    """Return the instance attributes rendered as a dict string."""
    return str(vars(self))

# Pattern for "As a <role>, I want to <action>, so that <benefit>."
# Groups: 2 = role, 3 = action, 5 = benefit.
# The benefit group is lazy and followed by an optional literal period and
# trailing whitespace anchored at end of line, so the final period is stripped
# when present and no character is lost when the sentence lacks one.
US_PATTERN = re.compile(
  r'As (a|an) (.*), I want to (.*), '
  r'(So that I|so I|So that|so that I|so that) (.*?)\.?\s*$'
)

# Read the raw file; the context manager guarantees the handle is closed.
with open("us.txt", "r") as us_file:
  f = us_file.read().splitlines()

# Keep only non-empty lines.
lines = [x for x in f if x]

user_stories = []

for line in lines:
  result = US_PATTERN.search(line)
  if not result:
    # Report sentences that do not follow the expected template.
    print('Couldn\'t parse:\n', line)
  else:
    user_stories.append(US(line, result.group(2), result.group(3), result.group(5)))

Couldn't parse:
 As a Publisher, I want to create a data package in the UI so that it is available and published.
Couldn't parse:
 As a Publisher, I want to see real examples of published packages so that I can understand how useful and simple the datapackage format and the registry is.
Couldn't parse:
 As a Data Analyst I want to download a data package, so that that I can study it and wrangle with it to infer new data or generate new insights.


### Derive `concepts`

In [2]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
# Configure the root logger: INFO level and above, each record rendered as
# "<LEVEL> - <HH:MM:SS>: <message>".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
# Load one user story per row into column 0, skipping blank lines.
# The file is read line by line rather than with pd.read_csv(sep="\n"):
# recent pandas releases raise a ValueError when the line terminator is
# passed as the separator.
# NOTE(review): unlike read_csv, this does no quote handling or type
# inference; every row is kept as the raw line text.
with open('us.txt', 'r') as stories_file:
  df = pd.DataFrame({0: [story for story in stories_file.read().splitlines() if story.strip()]})

In [4]:
# Load the 'en' spaCy pipeline with the 'ner' and 'parser' components disabled.
nlp = spacy.load('en', disable=['ner', 'parser'])
# Mark 'want' as a stop word in both places: the module-level English
# stop-word set and the entry for 'want' in this pipeline's vocabulary.
spacy.lang.en.stop_words.STOP_WORDS.add('want')
nlp.vocab['want'].is_stop = True

def cleaning(doc):
  """Lemmatize `doc` and drop its stop words.

  Returns the remaining lemmas joined by single spaces, or None when two or
  fewer lemmas remain.  As a diagnostic, every token whose text is exactly
  'data' is printed together with its lemma and the whole document.
  """
  # Diagnostic trace of how the token 'data' gets lemmatized.
  for tok in doc:
    if str(tok) == 'data':
      print(tok, tok.lemma_, doc)

  kept = []
  for tok in doc:
    if tok.is_stop:
      continue
    kept.append(tok.lemma_)

  # Documents with two or fewer remaining lemmas are discarded.
  if len(kept) <= 2:
    return None
  return ' '.join(kept)

In [5]:
# Collapse every run of characters other than ASCII letters and apostrophes
# into a single space, then lowercase each row.
# Built as a list rather than a generator: a generator is exhausted after one
# pass, so re-running the cell that consumes it would silently yield nothing.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df[0]]

In [6]:
t = time()

# Run the pre-cleaned rows through the spaCy pipeline in batches and clean
# each resulting document; entries are None where cleaning() rejected a row.
txt = []
for doc in nlp.pipe(brief_cleaning, batch_size=10, n_threads=1):
  txt.append(cleaning(doc))

elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

data datum as a publisher i want to sign up for an account so that that i can publish my data package to the registry and to have a publisher account to publish my data package under 
data data as a publisher i want to sign up for an account so that that i can publish my data package to the registry and to have a publisher account to publish my data package under 
data datum as an admin i want to invite someone to join the platform so that that they can start contributing or using data 
data data as a publisher i want to import my data package into the registry so that my data has a permanent online home to access 
data datum as a publisher i want to import my data package into the registry so that my data has a permanent online home to access 
data datum as a publisher i want to configure my client so that i can start publishing data packages 
data data as a publisher i want to use a publish command to update a data package that is already in the registry so that it appears there 
dat

In [7]:
# Wrap the cleaned strings in a one-column frame, then discard the rows where
# cleaning() returned None and the rows that repeat an earlier string.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

(67, 1)

In [8]:
from gensim.models.phrases import Phrases, Phraser

INFO - 17:52:38: 'pattern' package not found; tag filters are not available for English


In [9]:
# Tokenize each cleaned sentence on whitespace.
sent = []
for row in df_clean['clean']:
  sent.append(row.split())

In [10]:
# Collect unigram and bigram counts over the tokenized sentences.
# NOTE(review): min_count / threshold presumably set the minimum count and the
# score cut-off for a word pair to be accepted as a phrase; confirm against
# the gensim Phrases documentation.
phrases = Phrases(sent, min_count = 2, threshold = 3)

INFO - 17:52:38: collecting all words and their counts
INFO - 17:52:38: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:52:38: collected 743 word types from a corpus of 652 words (unigram + bigrams) and 67 sentences
INFO - 17:52:38: using 743 counts as vocab in Phrases<0 vocab, min_count=2, threshold=3, max_vocab_size=40000000>


In [11]:
# Build a Phraser from the collected phrase statistics.
bigram = Phraser(phrases)

INFO - 17:52:38: source_vocab length 743
INFO - 17:52:38: Phraser built with 11 phrasegrams


In [12]:
# Apply the bigram model to the tokenized sentences; detected pairs show up as
# a single underscore-joined token (e.g. 'data_package').
sentences = bigram[sent]

In [13]:
# Count how often each token (single word or joined bigram) occurs across all
# sentences, then show the number of distinct tokens.
word_freq = defaultdict(int)
for tokens in sentences:
  for token in tokens:
    word_freq[token] = word_freq[token] + 1
len(word_freq)

257

In [14]:
# The ten most frequent tokens, most frequent first.
sorted(word_freq.keys(), key=lambda word: word_freq[word], reverse=True)[:10]

['publisher',
 'datum_package',
 'consumer',
 'datum',
 'publish',
 'package',
 'download',
 'datapackage',
 'data_package',
 'owner']