# cleanreviews.py
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
# load the reviews data
reviews = pd.read_csv('7282_1.csv')
# extract only the review text
comments = reviews['reviews.text']
comments = comments.astype(str)
# function to remove non-ASCII characters
def _removeNonAscii(s):
    return "".join(ch for ch in s if ord(ch) < 128)
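# (an equivalent one-liner: s.encode('ascii', errors='ignore').decode())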
# remove non-ASCII characters from every comment
comments = comments.map(_removeNonAscii)
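# NOTE: the stop-word lists below need the NLTK 'stopwords' corpus;
# if it is missing, run nltk.download('stopwords') once beforehand.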
# build a stop-word set for every language NLTK ships
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}
# detect language by counting how many stop words of each language appear in the text
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()),
               key=lambda x: x[1])[0]
    return lang == 'english'
# keep only the English comments
eng_comments = comments[comments.apply(get_language)]
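# e.g. a review made up mostly of French function words ('le', 'la', 'et', ...)
# overlaps more with the French stop-word list and is dropped here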
# drop duplicate reviews
eng_comments = eng_comments.drop_duplicates()
# load spaCy's English model, skipping the parser and NER for speed
# (the bare 'en' shortcut was removed in spaCy v3; use the full model name)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
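# (install the model once with: python -m spacy download en_core_web_sm)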
# function to clean and lemmatize a comment
def clean_comments(text):
    # replace punctuation and whitespace control characters with spaces
    regex = re.compile('[' + re.escape(string.punctuation) + '\r\t\n]')
    nopunct = regex.sub(" ", str(text))
    # use spaCy to lemmatize the comment
    doc = nlp(nopunct)
    lemma = [token.lemma_ for token in doc]
    return lemma
# clean and lemmatize every comment
lemmatized = eng_comments.map(clean_comments)
# lowercase every token
lemmatized = lemmatized.map(lambda tokens: [word.lower() for word in tokens])
# flatten all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]
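# a minimal sketch of one possible sanity check (not part of the original
# pipeline): inspect the most frequent tokens in the flattened list
from collections import Counter
token_counts = Counter(unlist_comments)
print(token_counts.most_common(20))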