In [1]:
# Start from a clean namespace so the notebook does not depend on leftover kernel state.
# The explicit `%` prefix keeps this working when automagic is disabled or when the
# cell contains more than one line (such as this comment).
# Flags per the IPython %reset documentation: -f skips the confirmation prompt,
# -s performs a "soft" reset.
%reset -fs

In [2]:
import string

import gensim
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction import stop_words

In [3]:
# Load the raw App Store reviews, then preview the first three rows.
reviews = pd.read_csv("data/appstore_all_reviews.csv")
reviews.head(3)

Unnamed: 0.1,Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date,review_id
0,1,Microsoft HealthVault,546835834,Lab Corp blood results,liver transplant patient,https://itunes.apple.com/us/reviews/id782157250,0,0,1,Lab Corp had my weekly blood work results on t...,2018-08-08 05:03:25,https://itunes.apple.com/us/reviews/id78215725...
1,2,Microsoft HealthVault,546835834,What happened,Bistline,https://itunes.apple.com/us/reviews/id335994415,0,0,3,This app used to be my favorite. It would sync...,2018-06-27 14:07:45,https://itunes.apple.com/us/reviews/id33599441...
2,3,Microsoft HealthVault,546835834,Great idea,Gdb&&@,https://itunes.apple.com/us/reviews/id216415940,0,0,5,I have many yrs worth of data now stored. It i...,2018-06-27 10:26:12,https://itunes.apple.com/us/reviews/id21641594...


## Tokenizing data

In [4]:
# Token separators: every ASCII punctuation character and every digit becomes a space.
_TOKEN_SEPARATORS = str.maketrans(dict.fromkeys(string.punctuation + string.digits, ' '))
# Characters allowed to survive cleaning; built once rather than once per character.
_PRINTABLE_CHARS = frozenset(string.printable)


def tokenize(text):
    """Normalize a raw review into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. turn ASCII punctuation and digits into spaces;
      3. collapse every run of whitespace (tabs and newlines included) to one space;
      4. delete any character that is not in ``string.printable``;
      5. drop words of two characters or fewer and English stop words.

    Characters removed in step 4 are deleted, not replaced by a space, so the
    letters on either side are joined ("great\u2014love" becomes "greatlove").

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN that pandas
        uses for a missing cell) is treated as an empty review.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``''`` when nothing is left.
    """
    # Missing review bodies arrive as float NaN (or None); there is nothing to clean.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = text.translate(_TOKEN_SEPARATORS)
    # split() with no argument also splits on non-ASCII whitespace, so this must run
    # before the non-printable filter or such characters would glue two words together.
    text = ' '.join(text.split())
    text = ''.join(ch for ch in text if ch in _PRINTABLE_CHARS)

    # NOTE(review): `stop_words` is the sklearn.feature_extraction.stop_words module
    # imported at the top of the notebook; newer scikit-learn releases appear to expose
    # ENGLISH_STOP_WORDS via sklearn.feature_extraction.text instead -- confirm the
    # import still resolves in the target environment.
    words = [
        word for word in text.split()
        if len(word) > 2 and word not in stop_words.ENGLISH_STOP_WORDS
    ]
    return ' '.join(words)

In [5]:
# Clean every review body into a new column, then preview two rows as a sanity check.
reviews['text_tokenized'] = reviews['text'].map(tokenize)
reviews.head(2)

Unnamed: 0.1,Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date,review_id,text_tokenized
0,1,Microsoft HealthVault,546835834,Lab Corp blood results,liver transplant patient,https://itunes.apple.com/us/reviews/id782157250,0,0,1,Lab Corp had my weekly blood work results on t...,2018-08-08 05:03:25,https://itunes.apple.com/us/reviews/id78215725...,lab corp weekly blood work results app months ...
1,2,Microsoft HealthVault,546835834,What happened,Bistline,https://itunes.apple.com/us/reviews/id335994415,0,0,3,This app used to be my favorite. It would sync...,2018-06-27 14:07:45,https://itunes.apple.com/us/reviews/id33599441...,app used favorite sync apps pull medical info ...


## Shortening each word to its lemma

In [6]:
def lemmatize(text):
    """Replace each whitespace-separated word of ``text`` with its lemma.

    Words are looked up one at a time with ``WordNetLemmatizer`` using its default
    settings, and the results are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Space-separated words, e.g. the output of the tokenizing step.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    # Per the original author's note, PorterStemmer() is a faster alternative.
    to_lemma = WordNetLemmatizer().lemmatize
    lemmas = map(to_lemma, text.split())
    return ' '.join(lemmas)

In [7]:
# Lemmatize the cleaned text into its own column, then preview the first row.
reviews['text_tokenized_lemmatized'] = reviews['text_tokenized'].map(lemmatize)
reviews.head(1)

Unnamed: 0.1,Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date,review_id,text_tokenized,text_tokenized_lemmatized
0,1,Microsoft HealthVault,546835834,Lab Corp blood results,liver transplant patient,https://itunes.apple.com/us/reviews/id782157250,0,0,1,Lab Corp had my weekly blood work results on t...,2018-08-08 05:03:25,https://itunes.apple.com/us/reviews/id78215725...,lab corp weekly blood work results app months ...,lab corp weekly blood work result app month ag...


In [None]:
# Persist the frame, including the two derived text columns, without the row index.
reviews.to_csv(
    "data/appstore_all_reviews_clean_tomo.csv",
    index=False,
)