# Loading and Preparing the Dataset

In [9]:
import pandas as pd
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

## Load the dataset into a pandas dataframe

In [10]:
df = pd.read_csv('~/data/stackexchange_812k.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812132 entries, 0 to 812131
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   post_id     812132 non-null  int64  
 1   parent_id   75535 non-null   float64
 2   comment_id  553076 non-null  float64
 3   text        812132 non-null  object 
 4   category    812132 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 31.0+ MB


In [12]:
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis op...,title
3,4,,,Assessing the significance of differences in d...,title
4,6,,,The Two Cultures: statistics vs. machine learn...,title


## Use regular expressions to remove elements that are not words such as HTML tags, LaTeX expressions, URLs, digits, line returns, and so on.

In [13]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
LATEX_RE = re.compile('\$|\$$|\\\\')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = LATEX_RE.sub('', text)
    # text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

In [14]:
clean_text('Formulate hypotheses when $\mu_A < \mu_B$')

'formulate hypotheses when mu_a  mu_b'

In [38]:
df[df.text.str.contains('<[^<]+?>')].text

2530      Formulate hypotheses when $\mu_A < \mu_B$ is d...
4791      WinBUGS error with zero values in binomial dis...
6959      Sample size to tell if more than X% of the pop...
18051     $\sum_{n=1}^{\infty} p_n < \infty \to \prod_{n...
24545     If $F_X(z) > F_Y (z)$ for all $z\in \mathbb{R}...
                                ...                        
809195    @GeoMatt22 You're right. There are some concer...
809468    One other note: [tag:change-point] detection w...
809819    The curly braces might be taken to imply an un...
810231    @StephanKolassa, I'm happy to vote to reopen t...
810639    Would you mind adding `set.seed(<some number>)...
Name: text, Length: 168532, dtype: object

In [39]:
df[df.text.str.contains('http')].text

91756     <p>The R-project</p>\n\n<p><a href="http://www...
91757     <p>Last year, I read a blog post from <a href=...
91760     <p><a href="http://incanter.org/">Incanter</a>...
91762     <p>Is there a good, modern treatment covering ...
91763     <p>See my response to <a href="https://stackov...
                                ...                        
812044    Suddenly I am confused, I think the statistic ...
812064    Since a likelihood is typically constructed fr...
812073    (+1)  And the fact that Google Trends (https:/...
812091    Hint: regress any data you like against the (C...
812115    [This answer](https://stats.stackexchange.com/...
Name: text, Length: 120559, dtype: object

In [50]:
df[df.text.str.contains('[/(){}\[\]\|@,;]')].text[91756]

'<p>The R-project</p>\n\n<p><a href="http://www.r-project.org/">http://www.r-project.org/</a></p>\n\n<p>R is valuable and significant because it was the first widely-accepted Open-Source alternative to big-box packages.  It\'s mature, well supported, and a standard within many scientific communities.</p>\n\n<ul>\n<li><a href="http://www.inside-r.org/why-use-r">Some reasons why it is useful and valuable</a> </li>\n<li>There are some nice tutorials <a href="http://gettinggeneticsdone.blogspot.com/search/label/ggplot2">here</a>.</li>\n</ul>\n'

In [76]:
re.sub('^\$|\$$|\\\\', '', '$\\10 11\\$')

'10 11'

In [160]:
df[df.text.str.contains('=\s+\w+\s+=')]

Unnamed: 0,post_id,parent_id,comment_id,text,category
6090,28146,,,is it true that p -z leq z leq z = alpha = 2 p...,title
8259,37720,,,how do i reject or fail to reject the null hyp...,title
20893,185024,,,if mathbb e x = k and text var x = 0 is prleft...,title
25261,406447,,,why does mathrm e e^ -x = 0 imply mathrm p x =...,title
25492,407376,,,in exact matching in causal inference why is i...,title
...,...,...,...,...,...
804847,273557,,524594.0,"in a ""plain vanilla"" mixture of regressions pr...",comment
808042,276349,,530078.0,thanks for pointing that out alex. allow me to...,comment
808049,276349,,654080.0,did - if y is a continuous variable then does ...,comment
808925,277122,,531381.0,no? i don't understand what you're doing there...,comment


In [51]:
df.text.apply(lambda x: re.sub('[/(){}\[\]\|@,;]', '', x))[2530]

'Formulate hypotheses when $\\mu_A < \\mu_B$ is different from $\\mu_A > \\mu_B$'

In [15]:
df.text = df.text.apply(clean_text)

In [16]:
df

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,eliciting priors from experts,title
1,2,,,what is normality,title
2,3,,,what are some valuable statistical analysis op...,title
3,4,,,assessing the significance of differences in d...,title
4,6,,,the two cultures statistics vs machine learning,title
...,...,...,...,...,...
812127,279994,,536471.0,it does run and gives very valid looking esti...,comment
812128,279998,,536439.0,it seems to me that you are correct the doubl...,comment
812129,279998,,536514.0,it wouldnt be the first time a grader has miss...,comment
812130,279999,,536802.0,the basic idea is to compare the clustering co...,comment


## Remove missing values for texts

In [17]:
df = df[df.text.str.len() > 0]

In [18]:
df[df.text.str.len() == 0]

Unnamed: 0,post_id,parent_id,comment_id,text,category


In [19]:
# remove multiple spaces
df['text'] = df.text.apply(lambda t : re.sub("\s\s+",' ', t) )

# remove trailing spaces with strip()
df['text'] = df.text.apply(lambda t : t.strip() )

## Remove texts that are extremely large or too short to bring any information to the model. 

We want to keep paragraphs that contain at least a few words and remove the paragraphs that are composed of large numerical tables.

In [98]:
df[df.text.str.len() > 100].text.str.split().map(lambda x: len(x))

27        15
40        17
64        20
89        17
103       17
          ..
812127    39
812128    67
812129    23
812130    79
812131    21
Name: text, Length: 596452, dtype: int64

In [21]:
df = df[df.text.str.split().map(lambda x: len(x) > 4 & len(x) < 5000)]

In [None]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
df['tokens'] = df.text.apply(lambda t : tokenizer.tokenize(t.lower())) 

df = df[(df.n_tokens > 4) & (df.n_tokens < 5000)].reset_index(drop = True)
df.shape

## Use a tokenizer to create a version of the original text that is a string of space-separated lowercase tokens. For instance,

- Thank you!, This equation y = ax + by=ax+b, is very helpful.
  would be transformed as:

  thank you ! this equation , is very helpful .
  

- “retrieve a distance matrix” is a matter of coding. 
  It also might be irrelevant: one can imagine creative answers.

  becomes, if you choose to remove double quotes from the original text:

  retrieve a distance matrix is a matter of coding . it also might be irrelevant : one can imagine creative answers .
  
Note that punctuation signs (, . : !) are also represented as tokens

In [22]:
from nltk import word_tokenize
from nltk import Text

In [23]:
def space_separated_lower(text):
    tokens = word_tokenize(text.lower())
    return " ".join(list(filter(lambda x: x not in ['“', "”"], tokens)))

In [148]:
text = '“retrieve a distance matrix” is a matter of coding. It also might be irrelevant: one can imagine creative answers.'
space_separated_lower(text)

'retrieve a distance matrix is a matter of coding . it also might be irrelevant : one can imagine creative answers .'

In [151]:
text = "Thank you!, This equation y = ax + by=ax+b, is very helpful"
space_separated_lower(text)

'thank you ! , this equation y = ax + by=ax+b , is very helpful'

In [153]:
re.sub('(\\d+)\\s*([+\\-*\\/])\\s*(\\d+)', '', text)

'Thank you!, This equation y = ax + by=ax+b, is very helpful'

In [32]:
df['tokens'] = df.text.apply(space_separated_lower)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 801673 entries, 1 to 812131
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   post_id     801673 non-null  int64  
 1   parent_id   75518 non-null   float64
 2   comment_id  548180 non-null  float64
 3   text        801673 non-null  object 
 4   category    801673 non-null  object 
 5   tokens      801673 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 42.8+ MB


In [27]:
df.sample(1000)

Unnamed: 0,post_id,parent_id,comment_id,text,category
450430,423338,,790157.0,yearly the survey is conducted once every year,comment
594006,317935,,603835.0,i edited the question hope i make sense now ju...,comment
799386,269025,,514540.0,matthewgunn on the other hand these are questi...,comment
544880,163206,,310305.0,can you suggest an example where an observatio...,comment
208043,114875,114860.0,,say you have categories a b c d it turns out t...,post
...,...,...,...,...,...
381639,76717,,150014.0,alan my earlier version omitted an important t...,comment
601919,83611,,163947.0,can you expand on the statement that you were ...,comment
400271,180742,,349837.0,lol anyway saz admitted hes wrong thanks,comment
433831,409668,,765634.0,that clarifies things for me thank you,comment


In [34]:
import csv
df.to_csv("../data/stackexchange_cleaned.csv", quoting = csv.QUOTE_ALL, index = False)