In [6]:
from py2neo import Graph
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, RegexpTokenizer, word_tokenize

In [7]:
df = pd.read_csv('../data/vaping_testimonials.csv')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,url
0,0,I quit a 20 year habit of smoking tobacco ciga...,20+ yr Smoker Now Ex Smoker since 2008 quit be...,http://vaping.info/news/2015/20-yr-smoker-now-...
1,1,"Started when I was 13 years old, now 55. Tried...","42 year smoker, 3 week vaper..",http://vaping.info/news/2015/42-year-smoker-3-...
2,2,"25 years smoking, being diagnosed with COPD an...",Finally Smoke Free thanks to VAPING,http://vaping.info/news/2015/finally-smoke-fre...
3,3,"I smoked for over 16 year, one pack a day if n...","Vaping saved my life, which in turned saved 4+...",http://vaping.info/news/2015/vaping-saved-life...
4,4,First let me say that No one under the legal s...,. I was a smoker for 27 years. I made many att...,http://vaping.info/news/2015/smoker-27-years-m...


In [9]:
df.text[0].split('.')[0]

'I quit a 20 year habit of smoking tobacco cigarettes the day my first quality starter kit arrived'

## Graph database

In [10]:
graph = Graph()

In [26]:
# Build a word adjacency graph for a comment string
INSERT_QUERY = '''
WITH split(tolower({sentence}), " ") AS text
UNWIND range(0,size(text)-2) AS i
MERGE (w1:Word {name: text[i]})
ON CREATE SET w1.count = 1 ON MATCH SET w1.count = w1.count + 1
MERGE (w2:Word {name: text[i+1]})
ON CREATE SET w2.count = 1 ON MATCH SET w2.count = w2.count +  (CASE WHEN i = size(text)-2 then 1 else 0 end)
MERGE (w1)-[r:NEXT]->(w2)
  ON CREATE SET r.count = 1
  ON MATCH SET r.count = r.count + 1;
'''

INSERT_QUERY2 = '''
WITH {tid} AS tid, {sid} AS sid, split(tolower({sentence}), " ") AS text
UNWIND range(0,size(text)-2) AS i
MERGE (w1:Word {name: text[i]})
ON CREATE SET w1.count = 1 ON MATCH SET w1.count = w1.count + 1
MERGE (w2:Word {name: text[i+1]})
ON CREATE SET w2.count = 1 ON MATCH SET w2.count = w2.count +  (CASE WHEN i = size(text)-2 then 1 else 0 end)
MERGE (w1)-[r:NEXT]->(w2)
  ON CREATE SET r.count = 1
  ON MATCH SET r.count = r.count + 1
MERGE (t1:Text {tid: tid})
MERGE (s1:Sentence {sid: sid, tid: tid})
MERGE (s1)-[r2:HAS_WORD {num: i}]->(w1)
  ON CREATE SET r2.sid = sid 
MERGE (s1)-[r3:HAS_WORD {num: i+1}]->(w2)
  ON CREATE SET r3.sid=sid
MERGE (t1)-[:HAS_SENTENCE]->(s1)
'''



In [20]:
sentence = df.text[0].split('.')[0].strip().lower()

graph.evaluate(INSERT_QUERY, parameters={'sentence': sentence})


In [12]:
def process_text(row):
    txt = row.title + '. ' + row.text
    sents=[' '.join([ w.lower() for w in word_tokenize(s) if w not in ',.!']) for s in sent_tokenize(txt) ]
    return sents

In [24]:
def add_to_db(tid, sents):
    for sid,sentence in enumerate(sents):
        print(sentence)
        graph.evaluate(INSERT_QUERY2, parameters={'sentence': sentence, 'sid': sid, 'tid': tid})


In [28]:
add_to_db('t2',process_text(df.iloc[1]))

42 year smoker 3 week vaper ...
started when i was 13 years old now 55
tried to quit many times nicotine patches gum cold turkey never managed more then a few days
it’s only been 3 weeks since i started vaping but i have not had a cigarette since
i started out a low nicotine level 6 % and it seems to be the right level for me
3 weeks for me is amazing i have no cravings
vaping is working for me and quite a number of people at my place of employment
physically i haven’t noticed a difference early days yet
mentally i feel very good about myself this is a great achievement for me


In [22]:
df.rename(columns=)

TypeError: Index does not support mutable operations

In [45]:
sents=[' '.join([ w.lower() for w in word_tokenize(s) if w not in ',.!']) for s in sent_tokenize(df.title[0] + '. ' + df.text[0]) ]

In [46]:
sents

['20+ yr smoker now ex smoker since 2008 quit because of vaping',
 'i quit a 20 year habit of smoking tobacco cigarettes the day my first quality starter kit arrived',
 'unlike the many failed attempts at quitting and wasting tons of money in the past using the usual over the counter quit smoking products such as the patch and gum that are in my opinion just a way to keep people hooked on smoking vaping worked for me immediately',
 'not only did i quit smoking immediately it made it very easy for me to do as well',
 'i was already feeling better within the first week and things have just kept getting better and better for me',
 'i also got off using a bronchial inhaler because i could breathe so much better and no longer do i get feelings of having asthma attacks regularly',
 'it has been over 6 years now since i started vaping and because vaping allowed me to quit smoking cigarettes it has made my life so much better in so many ways in which i am very grateful',
 'it really enrages me

In [85]:
add_to_db(process_text(df.iloc[1]))

42 year smoker 3 week vaper ...
started when i was 13 years old now 55
tried to quit many times nicotine patches gum cold turkey never managed more then a few days
it’s only been 3 weeks since i started vaping but i have not had a cigarette since
i started out a low nicotine level 6 % and it seems to be the right level for me
3 weeks for me is amazing i have no cravings
vaping is working for me and quite a number of people at my place of employment
physically i haven’t noticed a difference early days yet
mentally i feel very good about myself this is a great achievement for me


In [57]:
df.iloc[1]

Unnamed: 0                                                    1
text          Started when I was 13 years old, now 55. Tried...
title                            42 year smoker, 3 week vaper..
url           http://vaping.info/news/2015/42-year-smoker-3-...
Name: 1, dtype: object

In [86]:
for row in df.itertuples():
    add_to_db(process_text(row))

20+ yr smoker now ex smoker since 2008 quit because of vaping
i quit a 20 year habit of smoking tobacco cigarettes the day my first quality starter kit arrived
unlike the many failed attempts at quitting and wasting tons of money in the past using the usual over the counter quit smoking products such as the patch and gum that are in my opinion just a way to keep people hooked on smoking vaping worked for me immediately
not only did i quit smoking immediately it made it very easy for me to do as well
i was already feeling better within the first week and things have just kept getting better and better for me
i also got off using a bronchial inhaler because i could breathe so much better and no longer do i get feelings of having asthma attacks regularly
it has been over 6 years now since i started vaping and because vaping allowed me to quit smoking cigarettes it has made my life so much better in so many ways in which i am very grateful
it really enrages me to think there are groups out

In [89]:
len([r for r in df.text.str.findall('since i \w+', flags=re.I) if len(r)>0])

43

In [72]:
[process_text(r) for r in df.itertuples()]

[['20+ yr smoker now ex smoker since 2008 quit because of vaping',
  'i quit a 20 year habit of smoking tobacco cigarettes the day my first quality starter kit arrived',
  'unlike the many failed attempts at quitting and wasting tons of money in the past using the usual over the counter quit smoking products such as the patch and gum that are in my opinion just a way to keep people hooked on smoking vaping worked for me immediately',
  'not only did i quit smoking immediately it made it very easy for me to do as well',
  'i was already feeling better within the first week and things have just kept getting better and better for me',
  'i also got off using a bronchial inhaler because i could breathe so much better and no longer do i get feelings of having asthma attacks regularly',
  'it has been over 6 years now since i started vaping and because vaping allowed me to quit smoking cigarettes it has made my life so much better in so many ways in which i am very grateful',
  'it really en