In [6]:
%config IPCompleter.greedy=True

In [7]:
import pandas as pd

In [8]:
# columns: "post_id","parent_id","comment_id","text","category"
# lowercase, space-separated tokens
# remove rows when no text
# cleanup with regex
# - html
# - latex
# LaTex expressions are delimited by the $ sign: For instance $y = ax + b$
# You can use any word tokenizer you want. NLTK’s WordPunctTokenizer is a good choice. 

# make sure to make a cut on an index that ends a record
# head -n 100007 stackexchange_812k.csv > first100000rows.csv
df_stackexchange = pd.read_csv("first100000rows.csv", index_col = "post_id", engine="python", encoding="utf-8", error_bad_lines=True)

In [9]:
# strip endlines and convert to lowercase
df_stackexchange["text_cleanup"] = df_stackexchange["text"].str.replace('\n', '', regex=True)
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.lower()

In [10]:
# strip digits
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('\d+', '', regex=True)

In [11]:
# strip html
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('(<[^>]*>)', '', regex=True)

In [12]:
# strip latex
# df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('(\$[^\&]*\$)', '', regex=True)
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('(\$[^\&?]*\$)', '', regex=True)

In [13]:
# strip formulas / latex
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('(\\\\begin{equation}[^>]*\\\\end{equation})', '', regex=True)

In [14]:
# strip punctuation but keep -,.!? (do this after latex-strip!)
import string
# string.punctuation
df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('(["#$%&\'()*+/:;<=>@[\\]^_`{|}~])', '', regex=True)
# df_stackexchange["text_cleanup"] = df_stackexchange["text_cleanup"].str.replace('([{}])'.format(string.punctuation), '', regex=True)

In [15]:
# remove rows with no text (should be the last!)
df_stackexchange = df_stackexchange.drop(df_stackexchange[df_stackexchange["text"].isna()].index)

In [16]:
df_stackexchange["text_cleanup"].values[-1]

'when carrying out ols multiple linear regression, rather than plot the residuals against fitted values,  i plot the internal studentized residuals against fitted values ditto for covariates. these residuals are defined aswhere  are the diagonal elements of the hat matrix. to get these studentized residuals in r, you can use the rstandard command. what type of residuals do people routinely use in this context? for example, do you just stick with  or do you use jackknife residuals, or something else entirely.note im not that interested in papers that define a new type of residual that no-one ever uses.'

In [17]:
df_stackexchange["text"].values[-1]

"<p>When carrying out OLS multiple linear regression, rather than plot the residuals against fitted values,  I plot the (internal) Studentized residuals against fitted values (ditto for covariates). These residuals are defined as:</p>\n\n<p>\\begin{equation}\ne^*_i = \\frac{e_i}{\\sqrt{s^2 (1-h_{ii})}}\n\\end{equation}</p>\n\n<p>where $e_i$ is the residual and $h_{ii}$ are the diagonal elements of the hat matrix. To get these studentized residuals in R, you can use the <code>rstandard</code> command. </p>\n\n<p>What type of residuals do people routinely use in this context? For example, do you just stick with $e_i$ or do you use jackknife residuals, or something else entirely.</p>\n\n<p>Note: I'm not that interested in papers that define a new type of residual that no-one ever uses.</p>\n"

In [18]:
# epport to csv
df_stackexchange.to_csv('output.csv')

In [20]:
char_set = set()

all_strings = list(df_stackexchange['text_cleanup'])

for text in all_strings:
    for word in text:
        for char in word:
            char_set.add(char)
        
print(char_set)

{'r', 'ﬂ', '“', 's', 'é', ',', '∪', '⊂', '？', 'ñ', 'û', 'h', '𝔼', '−', 'o', '（', 'à', '̄', 'ﬁ', 'μ', '²', '³', 'ó', 'θ', 'φ', 'x', 'z', 'σ', 'n', 'γ', '：', '≈', 'ω', '₂', '\u200f', 'i', '\\', '≠', '̂', 'j', '‘', 'ϵ', '\xa0', 'm', 'e', '∩', '±', '°', '£', '-', 'ß', '.', 'α', 'ᵢ', '´', 'w', '»', '×', 'λ', 'u', 'ε', 'q', '!', 'ρ', 'β', 'ö', 'g', 'd', '⇒', '—', '\xad', '„', '∈', 'a', 'ï', '¿', '₁', 't', 'ū', '𝑛', '）', '̶', '\r', 'b', 'χ', '̇', 'k', 'π', 'c', 'ø', '‐', 'l', 'p', 'ν', 'y', '”', 'τ', 'ı', '→', 'ő', 'á', '’', '«', ' ', '√', 'ﬀ', 'δ', 'í', 'ɛ', '𝑝', '∞', '§', '₃', '–', '…', '\u200b', '\u2009', '?', 'ā', '≤', 'с', 'f', 'v', '‖', 'ü'}
