Load data and import libraries

In [19]:
import pandas as pd
from helpers.data_preprocessing import DataProcesser

data = pd.read_csv("./../../data/train_set_0520.csv", usecols=['comment', 'non-information'])

comments = data['comment'].apply(str)

### Preprocess data:
1. remove all special characters and accents
2. turn all characters into lower case
3. stemming --> put the word into its most basic form

In [27]:
dp = DataProcesser()

comments = dp.preprocess(comments)
comments = pd.Series(comments)

comments.head()



0    implnot taken from link comsunjavafxscenecontr...
1             iconsettooltiptextprintedviewmodelgetloc
2    synchron chang of the underli date valu with t...
3    ask if the user realli want to close the given...
4                                           css inform
dtype: object

Split words and put them into a list

In [28]:
split_comments = " ".join(comments)
split_comments = split_comments.split()
split_comments[0:15]

['implnot',
 'taken',
 'from',
 'link',
 'comsunjavafxscenecontrolbehaviortextareabehaviorcontextmenurequestedjavafxsceneinputcontextmenuev',
 'iconsettooltiptextprintedviewmodelgetloc',
 'synchron',
 'chang',
 'of',
 'the',
 'underli',
 'date',
 'valu',
 'with',
 'the']

count word frequency

In [29]:
from collections import Counter
counted_comments = Counter(split_comments)
{k: counted_comments[k] for k in list(counted_comments)[:15]}

{'implnot': 2,
 'taken': 10,
 'from': 105,
 'link': 161,
 'comsunjavafxscenecontrolbehaviortextareabehaviorcontextmenurequestedjavafxsceneinputcontextmenuev': 1,
 'iconsettooltiptextprintedviewmodelgetloc': 1,
 'synchron': 1,
 'chang': 99,
 'of': 330,
 'the': 1461,
 'underli': 3,
 'date': 18,
 'valu': 60,
 'with': 105,
 'temporalaccessorvalu': 1}

15 most common words

In [30]:
most_occur = counted_comments.most_common(15)
most_occur

[('the', 1461),
 ('to', 587),
 ('a', 455),
 ('is', 353),
 ('of', 330),
 ('thi', 311),
 ('in', 279),
 ('and', 272),
 ('if', 263),
 ('for', 258),
 ('entri', 215),
 ('it', 207),
 ('file', 201),
 ('be', 185),
 ('link', 161)]

remove stopwords

In [31]:
comments_cleaned = list(map(DataProcesser.remove_stopwords, comments))
comments_cleaned[0:15]

TypeError: remove_stopwords() missing 1 required positional argument: 'text'

most common words after removing stopwords

In [None]:
split_comments = " ".join(comments_cleaned)
split_comments = split_comments.split()
counted_comments = Counter(split_comments)
most_occur = counted_comments.most_common(15)
most_occur

Word Cloud

In [None]:
# Generate a word cloud image
comments_cleaned = " ".join(comments_cleaned)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
wordcloud1 = WordCloud(width=1600, height=800).generate(comments_cleaned)
import matplotlib.pyplot as plt

# Display the generated image:
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
plt.savefig('./comment_wordcloud.png', facecolor='k', bbox_inches='tight')

Non-information 'yes' comments

In [None]:
data.head()
bad_comments = data[data['non-information'] == 'yes']
bad_comments = bad_comments['comment'].apply(str)

#data cleaning
#remove special characters
bad_comments = bad_comments.map(DataProcesser.remove_special_characters)
#remove accented chars
bad_comments = bad_comments.map(DataProcesser.remove_accented_chars)
#to lower case
bad_comments = bad_comments.map(lambda com : com.lower())

#stemming
ps = PorterStemmer()
bad_comments = [(" ".join(list(map(ps.stem, comment.split())))) for comment in bad_comments]

#remove stopwords
bad_comments = list(map(DataProcesser.remove_stopwords, bad_comments))
bad_comments[:15]

What words are most common among the non-information 'yes' comments?

In [None]:
#split into words
split_comments = " ".join(bad_comments)
split_comments = split_comments.split()
counted_comments = Counter(split_comments)
{k: counted_comments[k] for k in list(counted_comments)[:15]}
most_occur = counted_comments.most_common(15)
most_occur

Word Cloud of non-information 'yes' comments

In [None]:
# Generate a word cloud image
comments_cleaned = " ".join(bad_comments)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
wordcloud1 = WordCloud(width=1600, height=800).generate(comments_cleaned)
import matplotlib.pyplot as plt

# Display the generated image:
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


Non-information 'no' comments

In [None]:
data.head()
good_comments = data[data['non-information'] == 'no']
good_comments = good_comments['comment'].apply(str)

#data cleaning
#remove special characters
good_comments = good_comments.map(DataProcesser.remove_special_characters)
#remove accented chars
good_comments = good_comments.map(DataProcesser.remove_accented_chars)
#to lower case
good_comments = good_comments.map(lambda com : com.lower())

#stemming
ps = PorterStemmer()
good_comments = [(" ".join(list(map(ps.stem, comment.split())))) for comment in good_comments]

#remove stopwords
good_comments = list(map(DataProcesser.remove_stopwords, good_comments))
good_comments[:15]

What words are most common among the non-information 'no' comments?

In [None]:
#split into words
split_comments = " ".join(good_comments)
split_comments = split_comments.split()
counted_comments = Counter(split_comments)
{k: counted_comments[k] for k in list(counted_comments)[:15]}
most_occur = counted_comments.most_common(15)
most_occur

Non-information 'no' comments

In [None]:
# Generate a word cloud image
comments_cleaned = " ".join(good_comments)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
wordcloud1 = WordCloud(width=1600, height=800).generate(comments_cleaned)
import matplotlib.pyplot as plt

# Display the generated image:
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()