# Importing the data

As a first step, I read the original data set (in the folder ``input/raw``):

In [1]:
import pandas as pd


# Location of the raw export, relative to the notebook's working directory.
raw_csv_path = "../input/raw/stackexchange_812k.csv"
df = pd.read_csv(raw_csv_path)
# Show (rows, columns) as a quick sanity check of the load.
print(df.shape)

(812132, 5)


In [2]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis op...,title
3,4,,,Assessing the significance of differences in d...,title
4,6,,,The Two Cultures: statistics vs. machine learn...,title


# Basic cleaning

Taking the first text as an example:

In [3]:
# Row labelled 0 of the "text" column.
df["text"][0]

'Eliciting priors from experts'

Converting to lower case:

In [4]:
# Same entry as above, lower-cased.
df["text"][0].lower()

'eliciting priors from experts'

Converting all strings to lower case:

In [5]:
# Lower-case every entry of the "text" column, then preview the first 40.
texts = [entry.lower() for entry in df["text"]]
print(texts[:40])

['eliciting priors from experts', 'what is normality?', 'what are some valuable statistical analysis open source projects?', 'assessing the significance of differences in distributions', 'the two cultures: statistics vs. machine learning?', 'locating freely available data samples', 'so how many staticians *does* it take to screw in a lightbulb?', 'under what conditions should likert scales be used as ordinal or interval data?', 'multivariate interpolation approaches', 'forecasting demographic census', 'bayesian and frequentist reasoning in plain english', 'finding the pdf given the cdf', 'tools for modeling financial time series', 'what is a standard deviation?', 'testing random variate generation algorithms', 'what is the meaning of p values and t values in statistical tests?', 'r packages for seasonality analysis', 'examples for teaching: correlation does not mean causation', 'pseudo-random number generation algorithms', 'explain data visualization', 'clustering of large, heavy-taile

The assignment hints at the presence of tags and mathematical equations that need to be cleaned. For example:

In [6]:
# Keep only the entries containing a "<" (candidate tags or inequalities),
# lower-cased, and preview the first 10.
tags = [entry.lower() for entry in df["text"] if "<" in entry]
print(tags[:10])

['time series for count data, with counts < 20', 'why is it possible to get significant f statistic (p<.001) but non-significant regressor t-tests?', 'symmetric fat-tailed distributions where $\\mathbb{e} e^x < \\infty$', 'formulate hypotheses when $\\mu_a < \\mu_b$ is different from $\\mu_a > \\mu_b$', 'winbugs error with zero values in binomial distribution: value of order of binomial <expr> must be greater than zero', 'mahalanobis distance via pca when $n<p$', 'do you reject the null hypothesis when $p < \\alpha$ or $p \\leq \\alpha$? ', 'small dimensional classification (< 20 features), one (or two) dominant predictors', 'sample size to tell if more than x% of the population can do <thing>', "chi-squared vs fisher's exact test w/ 5x6 contingency table & some cells <5?"]


In [7]:
import re

# Raw string: "\$" is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning. The regex itself is unchanged:
# a literal "$", the shortest possible run of characters, a literal "$".
print([re.sub(r'\$(.*?)\$', '', s) for s in tags[0:10]])

['time series for count data, with counts < 20', 'why is it possible to get significant f statistic (p<.001) but non-significant regressor t-tests?', 'symmetric fat-tailed distributions where ', 'formulate hypotheses when  is different from ', 'winbugs error with zero values in binomial distribution: value of order of binomial <expr> must be greater than zero', 'mahalanobis distance via pca when ', 'do you reject the null hypothesis when  or ? ', 'small dimensional classification (< 20 features), one (or two) dominant predictors', 'sample size to tell if more than x% of the population can do <thing>', "chi-squared vs fisher's exact test w/ 5x6 contingency table & some cells <5?"]


In [8]:
# Strip inline "$...$" spans from the sample. The pattern is a raw string
# because "\$" is not a valid Python string escape.
remove_equations = [re.sub(r'\$.*?\$', '', s) for s in tags[0:10]]
print(remove_equations)

['time series for count data, with counts < 20', 'why is it possible to get significant f statistic (p<.001) but non-significant regressor t-tests?', 'symmetric fat-tailed distributions where ', 'formulate hypotheses when  is different from ', 'winbugs error with zero values in binomial distribution: value of order of binomial <expr> must be greater than zero', 'mahalanobis distance via pca when ', 'do you reject the null hypothesis when  or ? ', 'small dimensional classification (< 20 features), one (or two) dominant predictors', 'sample size to tell if more than x% of the population can do <thing>', "chi-squared vs fisher's exact test w/ 5x6 contingency table & some cells <5?"]


In [9]:
# Drop everything from a "<" up to the next ">" (non-greedy).
# NOTE: "." does not match newlines here, and plain inequalities such as
# "a < b ... c > d" in one text are removed by this pattern as well.
tag_pattern = '<.*?>'
remove_tags = [re.sub(tag_pattern, '', text) for text in remove_equations]
print(remove_tags)

['time series for count data, with counts < 20', 'why is it possible to get significant f statistic (p<.001) but non-significant regressor t-tests?', 'symmetric fat-tailed distributions where ', 'formulate hypotheses when  is different from ', 'winbugs error with zero values in binomial distribution: value of order of binomial  must be greater than zero', 'mahalanobis distance via pca when ', 'do you reject the null hypothesis when  or ? ', 'small dimensional classification (< 20 features), one (or two) dominant predictors', 'sample size to tell if more than x% of the population can do ', "chi-squared vs fisher's exact test w/ 5x6 contingency table & some cells <5?"]


I would also remove the percentage sign:

In [10]:
# Delete every "%" character; a plain string replacement is enough
# because the pattern is a single literal character.
remove_percentages = [text.replace('%', '') for text in remove_tags]
print(remove_percentages)

['time series for count data, with counts < 20', 'why is it possible to get significant f statistic (p<.001) but non-significant regressor t-tests?', 'symmetric fat-tailed distributions where ', 'formulate hypotheses when  is different from ', 'winbugs error with zero values in binomial distribution: value of order of binomial  must be greater than zero', 'mahalanobis distance via pca when ', 'do you reject the null hypothesis when  or ? ', 'small dimensional classification (< 20 features), one (or two) dominant predictors', 'sample size to tell if more than x of the population can do ', "chi-squared vs fisher's exact test w/ 5x6 contingency table & some cells <5?"]


Thus, for the full dataset:

In [11]:
# Full-dataset version of the steps demonstrated above, applied in order:
# lower-case -> strip "$...$" equations -> strip "<...>" tags -> drop "%".
texts = [i.lower() for i in df.text]
# Raw string: "\$" is not a valid Python string escape.
remove_equations = [re.sub(r'\$.*?\$', '', s) for s in texts]
remove_tags = [re.sub(r'<.*?>', '', s) for s in remove_equations]
remove_percentages = [re.sub(r'%', '', s) for s in remove_tags]

# Calculating word frequency

In order to find typos or formulas that survived the filters, I joined all the texts into a single string:

In [12]:
# Concatenate the cleaned texts into one corpus string, separated by single spaces.
separator = " "
all_texts = separator.join(remove_percentages)

In [13]:
# Show the first 2000 characters of the joined corpus.
print(all_texts[:2000])

eliciting priors from experts what is normality? what are some valuable statistical analysis open source projects? assessing the significance of differences in distributions the two cultures: statistics vs. machine learning? locating freely available data samples so how many staticians *does* it take to screw in a lightbulb? under what conditions should likert scales be used as ordinal or interval data? multivariate interpolation approaches forecasting demographic census bayesian and frequentist reasoning in plain english finding the pdf given the cdf tools for modeling financial time series what is a standard deviation? testing random variate generation algorithms what is the meaning of p values and t values in statistical tests? r packages for seasonality analysis examples for teaching: correlation does not mean causation pseudo-random number generation algorithms explain data visualization clustering of large, heavy-tailed dataset pca on correlation or covariance? why do us and uk s

Then tokenizing using ``nltk``:

In [14]:
import nltk
from nltk.tokenize import word_tokenize


# Split the whole corpus string into a flat list of tokens.
words_list = nltk.tokenize.word_tokenize(all_texts)

Using ``Counter`` to count the word frequency:

In [15]:
from collections import Counter

# Tally how many times each token occurs in the corpus.
c = Counter()
c.update(words_list)

In [16]:
 # The 19 least frequent tokens, rarest first.
 c.most_common()[::-1][:19]

[('fxns', 1),
 ('crash-course', 1),
 ('montone', 1),
 ('regresser', 1),
 ('//stats.stackexchange.com/a/301933/28500', 1),
 ('erxcerpt', 1),
 ('build/get', 1),
 ('//stats.stackexchange.com/questions/279918', 1),
 ('vocalizes', 1),
 ('cadgas', 1),
 ('outnumbering', 1),
 ('multivariate20regression', 1),
 ('q=multiple20regression', 1),
 ('a-lambda', 1),
 ('pc1+pc3', 1),
 ('//stats.stackexchange.com/questions/4220/', 1),
 ('night-sky', 1),
 ('//stats.stackexchange.com/questions/199207/why-do-t-test-assuming-equal-population-variance-and-t-test-not-assuming-equal-v',
  1),
 ('rian', 1)]

## Removing links

The links should probably be removed; for the moment, I simply replace each of them with 'http'. Thus the procedure becomes:

In [17]:
# Same pipeline as before, plus a final step that collapses links:
# lower-case -> strip "$...$" -> strip "<...>" -> drop "%" -> links to "http".
texts = [i.lower() for i in df.text]
# Raw string: "\$" is not a valid Python string escape.
remove_equations = [re.sub(r'\$.*?\$', '', s) for s in texts]
remove_tags = [re.sub(r'<.*?>', '', s) for s in remove_equations]
remove_percentages = [re.sub(r'%', '', s) for s in remove_tags]
# "http" followed by any run of non-whitespace is replaced by the bare token "http".
remove_links = [re.sub(r"http\S+", "http", s) for s in remove_percentages]

In [18]:
# Rebuild the corpus from the link-free texts and recount the tokens.
all_texts = " ".join(remove_links)
words_list = word_tokenize(all_texts)
c = Counter(words_list)
# The 49 least frequent tokens, rarest first.
rarest_first = c.most_common()[::-1]
rarest_first[:49]

[('fxns', 1),
 ('crash-course', 1),
 ('montone', 1),
 ('regresser', 1),
 ('erxcerpt', 1),
 ('build/get', 1),
 ('vocalizes', 1),
 ('cadgas', 1),
 ('outnumbering', 1),
 ('a-lambda', 1),
 ('pc1+pc3', 1),
 ('night-sky', 1),
 ('rian', 1),
 ('detoxication', 1),
 ('sf36', 1),
 ('q3-5', 1),
 ('score=7', 1),
 ('score=8', 1),
 ('t2w', 1),
 ('t1w', 1),
 ('23+10+12=t1i+t1w+t2w', 1),
 ('self-limiting', 1),
 ('wait-listing', 1),
 ('risk/calving', 1),
 ('deviations/variance', 1),
 ('matox', 1),
 ('varianc', 1),
 ('loan-status', 1),
 ('co-applicant', 1),
 ('1-fitted.vales', 1),
 ('fitted.values/', 1),
 ('physicsforums.com', 1),
 ('discrete/continuous/neither', 1),
 ('exp.wald', 1),
 ('ncp=exp.wald', 1),
 ('q=qnorm', 1),
 ('itcannot', 1),
 ('li.ear', 1),
 ('fundemental', 1),
 ('just-burned-out', 1),
 ('ms_caalis', 1),
 ('eifenvalue', 1),
 ('dony', 1),
 ('up-weeks', 1),
 ('up-week', 1),
 ("method='neuralnet", 1),
 ('data=train_hypep', 1),
 ('turtles~twine+mesh+black+blue+green+red+orange+yellow+syntheti

## Removing formulas

One important filter removes model formulas written in R syntax:

In [38]:
formula = 'turtles ~twine +mesh+black+blue+green+red+orange+yellow+synthetic+braided+mono+multi+x1+x2+x3+x4+x5+ x16'
# R-style model formula: "<lhs> ~ term + term + ... + term", where each name is
# lower-case letters optionally followed by digits.
# Raw string: "\~" and "\+" are not valid Python string escapes.
formula_pattern = r'[a-z]*[0-9]* *\~([a-z]*[0-9]* *\+)* *[a-z]*[0-9]*'
re.sub(formula_pattern, '', formula)

''

## Removing parameter setting

Another relevant filter targets parameter assignments inside a function call, such as:

In [45]:
parameters = ["method='neuralnet", 'data=train_hypep']
# "name=value" fragments: an optional name, "=", optional opening quotes, then
# a run of lower-case letters, digits or underscores.
# The quote class is ['"] with no "|": inside a character class "|" is a
# literal pipe, not an alternation, so the old class also swallowed pipes.
parameter_pattern = r"""[a-z_]*=['"]*[0-9a-z_]*"""
[re.sub(parameter_pattern, '', s) for s in parameters]

['', '']

Thus the procedure:

In [47]:
# Complete cleaning pipeline, applied in order:
# lower-case -> strip "$...$" -> strip "<...>" -> drop "%" -> links to "http"
# -> strip R formulas -> strip "name=value" fragments.
# All patterns are raw strings: "\$", "\~" and "\+" are not valid Python
# string escapes.
texts = [i.lower() for i in df.text]
# NOTE(review): "." does not match newlines, and "$$...$$" is consumed as two
# empty "$$" pairs, so multi-line and display-math content survives this step.
remove_equations = [re.sub(r'\$.*?\$', '', s) for s in texts]
remove_tags = [re.sub(r'<.*?>', '', s) for s in remove_equations]
remove_percentages = [re.sub(r'%', '', s) for s in remove_tags]
remove_links = [re.sub(r"http\S+", "http", s) for s in remove_percentages]
remove_formulas = [re.sub(r'[a-z]*[0-9]* *\~([a-z]*[0-9]* *\+)* *[a-z]*[0-9]*', '', s) for s in remove_links]
# Quote class is ['"] with no "|": inside a character class "|" is a literal
# pipe, so the previous class also removed pipes that followed "=".
remove_parameters = [re.sub(r"""[a-z_]*=['"]*[0-9a-z_]*""", '', s) for s in remove_formulas]

In [48]:
# Rebuild the corpus from the fully cleaned texts and recount the tokens.
all_texts = " ".join(remove_parameters)
words_list = word_tokenize(all_texts)
c = Counter(words_list)
# The 49 least frequent tokens, rarest first.
rarest_first = c.most_common()[::-1]
rarest_first[:49]

[('fxns', 1),
 ('crash-course', 1),
 ('montone', 1),
 ('regresser', 1),
 ('erxcerpt', 1),
 ('build/get', 1),
 ('vocalizes', 1),
 ('cadgas', 1),
 ('outnumbering', 1),
 ('a-lambda', 1),
 ('pc1+pc3', 1),
 ('night-sky', 1),
 ('rian', 1),
 ('detoxication', 1),
 ('sf36', 1),
 ('q3-5', 1),
 ('t2w', 1),
 ('t1w', 1),
 ('23+10+12+t1w+t2w', 1),
 ('self-limiting', 1),
 ('wait-listing', 1),
 ('risk/calving', 1),
 ('deviations/variance', 1),
 ('matox', 1),
 ('varianc', 1),
 ('loan-status', 1),
 ('co-applicant', 1),
 ('1-fitted.vales', 1),
 ('fitted.values/', 1),
 ('physicsforums.com', 1),
 ('discrete/continuous/neither', 1),
 ('exp.wald', 1),
 ('.wald', 1),
 ('itcannot', 1),
 ('li.ear', 1),
 ('fundemental', 1),
 ('just-burned-out', 1),
 ('ms_caalis', 1),
 ('eifenvalue', 1),
 ('dony', 1),
 ('up-weeks', 1),
 ('up-week', 1),
 ('layer3:5', 1),
 ('layer2:5', 1),
 ('layer1:5', 1),
 ('parametergrid', 1),
 ('controlparameters', 1),
 ('stimulating/motivational', 1),
 ('-convex', 1)]

In [51]:
# Tokenize each cleaned text on its own and store the tokens re-joined with
# single spaces, one string per row, in a new column.
tokenized_texts = []
for cleaned_text in remove_parameters:
    tokenized_texts.append(" ".join(word_tokenize(cleaned_text)))
df["text_tokenized"] = tokenized_texts
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category,text_tokenized
0,1,,,Eliciting priors from experts,title,eliciting priors from experts
1,2,,,What is normality?,title,what is normality ?
2,3,,,What are some valuable Statistical Analysis op...,title,what are some valuable statistical analysis op...
3,4,,,Assessing the significance of differences in d...,title,assessing the significance of differences in d...
4,6,,,The Two Cultures: statistics vs. machine learn...,title,the two cultures : statistics vs. machine lear...


In [52]:
import os

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout may not have it yet).
output_path = "../input/preprocessed/tokenized.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Default to_csv options are kept, so the row index is still written as the
# first column, as before.
df.to_csv(output_path)