In [40]:
import re
import string

import nltk
import pandas as pd
import sklearn

In [41]:
# Data input: load the annotated app reviews from anno.csv.
# Despite its name, `boost` holds reviews for several apps (the preview below shows Boost, TNG and Maybank).
boost=pd.read_csv("anno.csv")

In [42]:
# Preview the first rows of the loaded reviews.
boost.head()

Unnamed: 0,Applications,Review,Sentiment,Emotions
0,Boost,Poor design for movie booking. Clicking back g...,Negative,Anger
1,TNG,"lousy app, do not use this e wallet. my credit...",Negative,Anger
2,Maybank,great apps..make us easier,Positive,Joy
3,Boost,toped up with this app... until this app crash...,Negative,Anger
4,TNG,this nice app deserves a better UI. The curren...,Negative,Anticipation


In [46]:
def strip_non_ascii(text):
    """Return ``text`` with every non-ASCII character (e.g. emoji) removed.

    Missing values (None/NaN) become an empty string; ``str(nan)`` would
    otherwise inject the literal word "nan" into the review text.
    """
    if pd.isna(text):
        return ''
    return str(text).encode('ascii', 'ignore').decode('ascii')


# Map over the column so the result keeps boost's own index. Wrapping a list in
# pd.Series() creates a fresh 0..n-1 index, which silently misaligns rows when
# boost's index is anything else.
boost['cleaned'] = boost['Review'].map(strip_non_ascii)
cleaned = boost['cleaned'].tolist()  # plain-list copy, as the original cell left behind

In [47]:
# Delete ASCII punctuation (string.punctuation) from the non-ASCII-stripped text.
# NOTE(review): punctuation is deleted, not replaced by a space, so words typed
# without a separating space fuse together (e.g. "apps..make" -> "appsmake").
# Confirm this is intended.
punct_table = str.maketrans('', '', string.punctuation)  # built once, reused for every row

# Map over the column (instead of pd.Series(list)) so the result keeps boost's index.
boost['nonpunc'] = boost['cleaned'].map(lambda text: str(text).translate(punct_table))
nonpunc = boost['nonpunc'].tolist()  # plain-list copy, as the original cell left behind

In [48]:
# Strip all digits.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern
# as a literal string by default, which would leave every digit in place.
# The raw string avoids the invalid "\d" escape in a normal string literal.
boost['no_digits'] = boost['nonpunc'].str.replace(r'\d+', '', regex=True)

In [50]:
def reduce_lengthening(text):
    """Collapse any run of three or more identical characters down to two.

    Example: "sooooo goooood" -> "soo good". Runs of one or two characters are
    left alone, and newlines are never collapsed because "." does not match them.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

In [52]:
# Shorten exaggerated character runs in every digit-free review.
boost['reduce'] = boost['no_digits'].map(reduce_lengthening)

In [53]:
#spell check
#autocorrect spell requires high computation power
import autocorrect
from autocorrect import spell

In [54]:
# Spell-correct every word of every review.
# Each distinct word is corrected once and the result reused; calling spell()
# on every occurrence repeats the same costly lookup for common words.
# Assumes spell() returns the same correction for the same word — TODO confirm.
unique_words = {word for review in boost['reduce'] for word in review.split()}
corrections = {word: spell(word) for word in unique_words}
boost["spell"] = [
    ' '.join(corrections[word] for word in review.split())
    for review in boost['reduce']
]

In [55]:
# Drop non-English words: keep a token only if it appears in NLTK's English word
# list, or if it is not purely alphabetic.
# NOTE(review): inflected forms missing from the word list are dropped as well
# (the final table shows "shops" disappearing). Confirm that is acceptable.
nltk.download('words')  # the word-list corpus must be present on a fresh environment
words = set(nltk.corpus.words.words())


def keep_english(text):
    """Return ``text`` re-joined from only the tokens that pass the English filter."""
    return " ".join(
        token for token in nltk.wordpunct_tokenize(text)
        if token.lower() in words or not token.isalpha()
    )


# Map over the column (instead of pd.Series(list)) so the result keeps boost's index.
boost['noneng'] = boost['spell'].map(keep_english)
noneng = boost['noneng'].tolist()  # plain-list copy, as the original cell left behind

In [56]:
# Lemmatization setup: fetch the WordNet data (the log below shows it is a no-op
# when the package is already up to date) and build the lemmatizer instance.
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loel7003\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [57]:
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them.

    WordNetLemmatizer.lemmatize() expects a single word. Handing it a whole
    review looks the entire sentence up as one term and returns it unchanged,
    so the tokens must be lemmatized one by one.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


boost['lemma'] = boost['noneng'].apply(lemmatize_text)

In [58]:
# TF-IDF: TfidfVectorizer tokenizes the documents itself, so no separate tokenization step is needed beforehand.
# It learns the vocabulary and the inverse-document-frequency weights from the corpus, and can then encode new documents.
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
# x is the fitted document-term matrix for the lemmatized reviews (converted with .toarray() in the next cell).
x = v.fit_transform(boost['lemma'])

In [59]:
# TF-IDF matrix -> dense array -> DataFrame.
# Reuse boost's index so each row stays aligned with its review when the two
# frames are concatenated column-wise later on (pd.concat aligns on the index).
tdidf = pd.DataFrame(x.toarray(), index=boost.index)

In [60]:
# Label each TF-IDF column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(v, "get_feature_names_out"):
    tdidf.columns = v.get_feature_names_out()
else:
    tdidf.columns = v.get_feature_names()

In [61]:
# Preview the first rows of the TF-IDF feature table (one column per vocabulary term).
tdidf.head()

Unnamed: 0,aa,abandon,ability,able,about,above,absolute,absolutely,absurd,accent,...,yet,yew,yo,you,young,your,yourself,youve,yr,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.160757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Persist the TF-IDF features and the cleaned review table as separate CSV files.
csv_options = {"sep": ',', "encoding": 'utf-8'}
tdidf.to_csv("tdidffinal.csv", **csv_options)
boost.to_csv("cleanfinal.csv", **csv_options)

In [63]:
# Put the review columns and the TF-IDF features side by side (rows matched on index).
# NOTE(review): a vocabulary term that equals an existing column name
# (e.g. "reduce" or "spell") would produce duplicate column labels. Verify.
finalfinal = pd.concat([boost, tdidf], axis="columns")

In [64]:
# Inspect the last rows of the combined table (every preprocessing stage plus the TF-IDF columns).
finalfinal.tail()

Unnamed: 0,Applications,Review,Sentiment,Emotions,cleaned,nonpunc,no_digits,reduce,spell,noneng,...,yet,yew,yo,you,young,your,yourself,youve,yr,zero
2995,Boost,good.. more saving.. i hope can give more saving,Positive,Anticipation,good.. more saving.. i hope can give more saving,good more saving i hope can give more saving,good more saving i hope can give more saving,good more saving i hope can give more saving,good more saving i hope can give more saving,good more saving i hope can give more saving,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,Boost,good service and hope more convenient in future,Positive,Anticipation,good service and hope more convenient in future,good service and hope more convenient in future,good service and hope more convenient in future,good service and hope more convenient in future,good service and hope more convenient in future,good service and hope more convenient in future,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,TNG,"very slow balance update, not good to use",Negative,Sadness,"very slow balance update, not good to use",very slow balance update not good to use,very slow balance update not good to use,very slow balance update not good to use,very slow balance update not good to use,very slow balance update not good to use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,Boost,good job hope can get in more shops..,Positive,Anticipation,good job hope can get in more shops..,good job hope can get in more shops,good job hope can get in more shops,good job hope can get in more shops,good job hope can get in more shops,good job hope can get in more,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2999,Boost,So lag after upgrade. No good to use,Negative,Sadness,So lag after upgrade. No good to use,So lag after upgrade No good to use,So lag after upgrade No good to use,So lag after upgrade No good to use,So lag after upgrade No good to use,So lag after upgrade No good to use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Write the combined table (reviews + TF-IDF features) to disk as the final model input.
finalfinal.to_csv(
    "finalinput.csv",
    sep=',',
    encoding='utf-8',
)