## Stopword removal & Tokenization

In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/mamieo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mamieo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mamieo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
%store -r all_categories

#### Example tokenization and remove stopword 

In [3]:
all_categories['stress']['Selftext'][0]

'**I have a 5775 in Math right now I have less than 2 weeks to bring up my math grade from an F to a D (3 points) before I need to go to summer school and re-take it next year, and maybe not graduate I am a Sophomore in high-school Math has never been something I am good at I have 2 more tests, including one tomorrow My parents are yelling at me and stressing me out and making my self harming issue worse, and they wont get me a tutor Ive been having dizzy spells and fainting spells because im so stressed that I forget to take care of myself and life is shit WHAT do I do???**'

In [4]:
def token(submission):
    token_words = word_tokenize(submission)
    return token_words
    
terms = token(all_categories['stress']['Selftext'][0])

print(terms)

['*', '*', 'I', 'have', 'a', '5775', 'in', 'Math', 'right', 'now', 'I', 'have', 'less', 'than', '2', 'weeks', 'to', 'bring', 'up', 'my', 'math', 'grade', 'from', 'an', 'F', 'to', 'a', 'D', '(', '3', 'points', ')', 'before', 'I', 'need', 'to', 'go', 'to', 'summer', 'school', 'and', 're-take', 'it', 'next', 'year', ',', 'and', 'maybe', 'not', 'graduate', 'I', 'am', 'a', 'Sophomore', 'in', 'high-school', 'Math', 'has', 'never', 'been', 'something', 'I', 'am', 'good', 'at', 'I', 'have', '2', 'more', 'tests', ',', 'including', 'one', 'tomorrow', 'My', 'parents', 'are', 'yelling', 'at', 'me', 'and', 'stressing', 'me', 'out', 'and', 'making', 'my', 'self', 'harming', 'issue', 'worse', ',', 'and', 'they', 'wont', 'get', 'me', 'a', 'tutor', 'Ive', 'been', 'having', 'dizzy', 'spells', 'and', 'fainting', 'spells', 'because', 'im', 'so', 'stressed', 'that', 'I', 'forget', 'to', 'take', 'care', 'of', 'myself', 'and', 'life', 'is', 'shit', 'WHAT', 'do', 'I', 'do', '?', '?', '?', '*', '*']


In [5]:
def stem_tokens(terms):
    porter = PorterStemmer()
    stem_terms = []
    for term in terms:
        stem_terms.append(porter.stem(term))
    return stem_terms;

def lemma_tokens(terms):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_terms = []

    for term in terms:
        lemma_terms.append(wordnet_lemmatizer.lemmatize(term))
    return lemma_terms;

def remove_stopword(terms):
    stop_words = set(stopwords.words('english'))
    stop_words = ENGLISH_STOP_WORDS

    tokens_without_sw = [word for word in terms if not word in stop_words]
    return tokens_without_sw
    
lemma_terms = lemma_tokens(terms)
stem_terms = stem_tokens(lemma_terms)
terms = remove_stopword(terms)

print(stem_terms)

['*', '*', 'i', 'have', 'a', '5775', 'in', 'math', 'right', 'now', 'i', 'have', 'le', 'than', '2', 'week', 'to', 'bring', 'up', 'my', 'math', 'grade', 'from', 'an', 'f', 'to', 'a', 'd', '(', '3', 'point', ')', 'befor', 'i', 'need', 'to', 'go', 'to', 'summer', 'school', 'and', 're-tak', 'it', 'next', 'year', ',', 'and', 'mayb', 'not', 'graduat', 'i', 'am', 'a', 'sophomor', 'in', 'high-school', 'math', 'ha', 'never', 'been', 'someth', 'i', 'am', 'good', 'at', 'i', 'have', '2', 'more', 'test', ',', 'includ', 'one', 'tomorrow', 'my', 'parent', 'are', 'yell', 'at', 'me', 'and', 'stress', 'me', 'out', 'and', 'make', 'my', 'self', 'harm', 'issu', 'wors', ',', 'and', 'they', 'wont', 'get', 'me', 'a', 'tutor', 'ive', 'been', 'have', 'dizzi', 'spell', 'and', 'faint', 'spell', 'becaus', 'im', 'so', 'stress', 'that', 'i', 'forget', 'to', 'take', 'care', 'of', 'myself', 'and', 'life', 'is', 'shit', 'what', 'do', 'i', 'do', '?', '?', '?', '*', '*']


#### def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = convert_numbers(data)
    data = remove_stop_words(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)

1. No need to convert lower case because stemming and lemma will do it
2. Punctuation is the set of unnecessary symbols that are in our corpus documents.
3. Note that there is no ‘ apostrophe in the punctuation symbols. Because when we remove punctuation first it will convert don’t to dont, and it is a stop word that won't be removed.
4. Single characters are not much useful in knowing the importance of the document and few final single characters might be irrelevant symbols
5. Stemming, playing and played are the same type of words that basically indicate an action play.
6. Lemmatisation is a way to reduce the word to the root synonym of a word.

### Thus, 
#### if the word is very common and appears in many documents, this number will approach 0. Otherwise, it will approach 1.

- The most significant word for **document A** is man and walk
- The most significant word for **document B** is around, children, fire, and sat

# Preprocessing - Real Data

In [6]:
submission1 = all_categories['stress']['Selftext'][0]
submission2 = all_categories['stress']['Selftext'][1]
submission3 = all_categories['stress']['Selftext'][2]
submission4 = all_categories['stress']['Selftext'][3]
submission5 = all_categories['stress']['Selftext'][4]
submission6 = all_categories['stress']['Selftext'][5]
submission7 = all_categories['stress']['Selftext'][6]
submission8 = all_categories['stress']['Selftext'][7]
submission9 = all_categories['stress']['Selftext'][8]
submission10 = all_categories['stress']['Selftext'][9]

submissions = [submission1, submission2, submission3, submission4, submission5,
              submission6, submission7, submission8, submission9, submission10]

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    lemma_terms = lemma_tokens(tokens)
    stem_terms = stem_tokens(lemma_terms)
    terms = remove_stopword(stem_terms)
    return terms

vectorizer = TfidfVectorizer(tokenizer=tokenize)
submission_vectors = vectorizer.fit_transform(submissions)

dense = submission_vectors.todense()
submission_list = dense.tolist()

df = pd.DataFrame(submission_list, columns=vectorizer.get_feature_names())
df



Unnamed: 0,!,%,'m,(,),*,",",120,15,2,...,winter,wish,wont,work,worri,wors,year,yell,…,…i
0,0.0,0.0,0.0,0.096469,0.096469,0.453923,0.165951,0.0,0.0,0.226961,...,0.0,0.0,0.113481,0.0,0.0,0.096469,0.113481,0.113481,0.0,0.0
1,0.488729,0.0,0.0,0.0,0.0,0.0,0.357353,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.144016,0.0,0.122427,0.0,0.0,0.0,0.0
3,0.0,0.092972,0.0,0.079034,0.079034,0.0,0.317239,0.092972,0.092972,0.0,...,0.0,0.0,0.0,0.0,0.092972,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.184624,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.108869,0.0,0.0,0.0,0.187283,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.202012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.111585,0.0,0.0,0.0,...,0.0,0.228912,0.0,0.0,0.0,0.0,0.0,0.0,0.114456,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.406714,0.0,0.0,0.0,...,0.06953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06953


In [7]:
df.sort_values(by=['worri'], ascending=False)

Unnamed: 0,!,%,'m,(,),*,",",120,15,2,...,winter,wish,wont,work,worri,wors,year,yell,…,…i
3,0.0,0.092972,0.0,0.079034,0.079034,0.0,0.317239,0.092972,0.092972,0.0,...,0.0,0.0,0.0,0.0,0.092972,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.096469,0.096469,0.453923,0.165951,0.0,0.0,0.226961,...,0.0,0.0,0.113481,0.0,0.0,0.096469,0.113481,0.113481,0.0,0.0
1,0.488729,0.0,0.0,0.0,0.0,0.0,0.357353,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.144016,0.0,0.122427,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.184624,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.108869,0.0,0.0,0.0,0.187283,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.202012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.111585,0.0,0.0,0.0,...,0.0,0.228912,0.0,0.0,0.0,0.0,0.0,0.0,0.114456,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.406714,0.0,0.0,0.0,...,0.06953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06953


In [8]:
def pre_processing(submissions):
    test = []
    for submission in submissions:
        terms = tokenize(submission)

        text = " ".join(terms)
        test.append(text);
    return test

preprocessed = pre_processing(submissions)

pd.options.display.max_colwidth = 1000
dataframe = pd.DataFrame(preprocessed, columns=["submission"])

In [9]:
print(preprocessed)

['* * 5775 math right le 2 week bring math grade f d ( 3 point ) befor need summer school re-tak year , mayb graduat sophomor high-school math ha someth good 2 test , includ tomorrow parent yell stress make self harm issu wors , wont tutor ive dizzi spell faint spell becaus im stress forget care life shit ? ? ? * *', 'nobodi told , let remind fuck awesomei love buddi , care ! dont stress , live best !', 'work shift killin dust factori bad hard breath everi day nust wors best pay job need money need abl breath tho sleep eat fucntion right rn im exaust 247 cuz sleep cuz got ta real life thing dure day time everywer open dont sleep', 'im high schooler right , appli colleg im multivari calculu ( colleg level math cours ) right , onli way end class 97-100 testsi mind worri thi everi time think class , just want throw like math befor thi , im extrem rigor cours load , mani extra thing outsid school , 120 % activ just drain sleep best thing onli thing want bc im mental drainedi just dont know

In [10]:
dataframe

Unnamed: 0,submission
0,"* * 5775 math right le 2 week bring math grade f d ( 3 point ) befor need summer school re-tak year , mayb graduat sophomor high-school math ha someth good 2 test , includ tomorrow parent yell stress make self harm issu wors , wont tutor ive dizzi spell faint spell becaus im stress forget care life shit ? ? ? * *"
1,"nobodi told , let remind fuck awesomei love buddi , care ! dont stress , live best !"
2,work shift killin dust factori bad hard breath everi day nust wors best pay job need money need abl breath tho sleep eat fucntion right rn im exaust 247 cuz sleep cuz got ta real life thing dure day time everywer open dont sleep
3,"im high schooler right , appli colleg im multivari calculu ( colleg level math cours ) right , onli way end class 97-100 testsi mind worri thi everi time think class , just want throw like math befor thi , im extrem rigor cours load , mani extra thing outsid school , 120 % activ just drain sleep best thing onli thing want bc im mental drainedi just dont know 15 week left thi calculu class continu grind im thi exhaust studi effici ? final come , realli need advic thank"
4,"physic , mental sexual frustrat street lack caus stress"
5,"late night thought stupid shityou ca n't chang anyth near midnightno action start feel like whirlpool stressfeel stupid , hopeless , like imposteropinionsi reason fuck thinkyou bettery failedthey hate mewhat think doe n't matterthey judg mewhat think matter ? tri best , think correct enoughi 'm enoughstupid shit n't matter"
6,feel 'm pass real onli life onc semi faint tho guess leav car near peopl happen wake sri just rant tri chill
7,anyon sever stress experi eye throb pain pain ?
8,"break fun know lot peopl say just sell roommat im tri hold hous kid sell just buy differ realiz itll problem onli know person mayb room mate brother honestli onli stand long child support possibl wish littl money overtim option mean miss time kid just wish easi way wa easi option im tire depress , stress , thi anxieti just just want …"
9,"intens emotionsi normal feel emot realli intens deepli effect ? exampl , sometim feel like im suffoc whenev im nervou someth , heart race feel start feel control stress effect way dont good reason nervou , usual small thing realli effect , dont know thi sick feel , like someth bad happen whenev smell fresh winter air sick stomach , somewhat good way feel intens im happi , feel like heart burst , dont angri , long ago thi incid friend basic happen felt like tri avoid purpos reason thi realli tick , flip switch brain ran , push hard fell , pretti hard went bitch whoop …i dont know caus react wayi normal problem like thi emot ?"


In [11]:
import contractions

def tokenize_without_preprocessing(text):
    tokens = nltk.word_tokenize(text)
    return tokens


def pre_processing_2(submissions):
    test = []
    for submission in submissions:
        terms = tokenize_without_preprocessing(submission)

        text = " ".join(terms)
        test.append(text);
    return test

preprocessed_2 = pre_processing_2(submissions)

pd.options.display.max_colwidth = 1000
dataframe = pd.DataFrame(preprocessed_2, columns=["submission"])

dataframe['no_contract'] = dataframe['submission'].apply(lambda x: [contractions.fix(word) for word in x.split()])
dataframe['no_contract'] = [' '.join(map(str, l)) for l in dataframe['no_contract']]
dataframe.head()

Unnamed: 0,submission,no_contract
0,"* * I have a 5775 in Math right now I have less than 2 weeks to bring up my math grade from an F to a D ( 3 points ) before I need to go to summer school and re-take it next year , and maybe not graduate I am a Sophomore in high-school Math has never been something I am good at I have 2 more tests , including one tomorrow My parents are yelling at me and stressing me out and making my self harming issue worse , and they wont get me a tutor Ive been having dizzy spells and fainting spells because im so stressed that I forget to take care of myself and life is shit WHAT do I do ? ? ? * *","* * I have a 5775 in Math right now I have less than 2 weeks to bring up my math grade from an F to a D ( 3 points ) before I need to go to summer school and re-take it next year , and maybe not graduate I am a Sophomore in high-school Math has never been something I am good at I have 2 more tests , including one tomorrow My parents are yelling at me and stressing me out and making my self harming issue worse , and they will not get me a tutor I have been having dizzy spells and fainting spells because im so stressed that I forget to take care of myself and life is shit WHAT do I do ? ? ? * *"
1,"If nobody told you , let me remind you You are fucking awesomeI love you buddy , take care ! Dont stress , live your best !","If nobody told you , let me remind you You are fucking awesomeI love you buddy , take care ! do not stress , live your best !"
2,I work third shifts Its killin me The dust in the factory is so bad its hard to breath every day nust gets worse its best paying job i ever had I need the money I need to be able to breath too tho i cant sleep eat or even fucntion right rn Im exausted 247 cuz i never get to sleep cuz i got ta do real life things during the day time When everyweres open So i dont get to sleep,I work third shifts Its killin me The dust in the factory is so bad its hard to breath every day nust gets worse its best paying job i ever had I need the money I need to be able to breath too tho i cannot sleep eat or even fucntion right rn I am exausted 247 cuz i never get to sleep cuz i got ta do real life things during the day time When everyweres open So i do not get to sleep
3,"Im a high schooler right now , and am applying to college Im taking Multivariable calculus ( a college level math course ) right now , and the only way for me to end with an A in the class is if I get a 97-100 on the next two testsI cant keep my mind from worrying about this Every time I think about the class , I just want to throw up I liked math before this , but Im also taking an extremely rigorous course load , have so many extra things I do outside of school , and giving my 120 % to each activity is just draining me Sleeping is the best thing and only thing I want to do bc Im so mentally drainedI just dont know how to keep going I have 15 weeks left in this calculus class How do I continue the grind when Im this exhausted and study efficiently ? I have finals coming up , so I really need advice Thanks","I am a high schooler right now , and am applying to college I am taking Multivariable calculus ( a college level math course ) right now , and the only way for me to end with an A in the class is if I get a 97-100 on the next two testsI cannot keep my mind from worrying about this Every time I think about the class , I just want to throw up I liked math before this , but I am also taking an extremely rigorous course load , have so many extra things I do outside of school , and giving my 120 % to each activity is just draining me Sleeping is the best thing and only thing I want to do bc I am so mentally drainedI just do not know how to keep going I have 15 weeks left in this calculus class How do I continue the grind when I am this exhausted and study efficiently ? I have finals coming up , so I really need advice Thanks"
4,"Physical , mentally and sexual frustrated from streets and lack of causing more stress","Physical , mentally and sexual frustrated from streets and lack of causing more stress"


In [12]:
preprocessed = pre_processing(submissions)

pd.options.display.max_colwidth = 1000
dataframe2 = pd.DataFrame(preprocessed, columns=["submission2"])

dataframe2['no_contract_with_preprocssing'] = dataframe2['submission2'].apply(lambda x: [contractions.fix(word) for word in x.split()])
dataframe2['no_contract_with_preprocssing'] = [' '.join(map(str, l)) for l in dataframe2['no_contract_with_preprocssing']]
dataframe2.head()


Unnamed: 0,submission2,no_contract_with_preprocssing
0,"* * 5775 math right le 2 week bring math grade f d ( 3 point ) befor need summer school re-tak year , mayb graduat sophomor high-school math ha someth good 2 test , includ tomorrow parent yell stress make self harm issu wors , wont tutor ive dizzi spell faint spell becaus im stress forget care life shit ? ? ? * *","* * 5775 math right le 2 week bring math grade f d ( 3 point ) befor need summer school re-tak year , mayb graduat sophomor high-school math ha someth good 2 test , includ tomorrow parent yell stress make self harm issu wors , will not tutor ive dizzi spell faint spell becaus im stress forget care life shit ? ? ? * *"
1,"nobodi told , let remind fuck awesomei love buddi , care ! dont stress , live best !","nobodi told , let remind fuck awesomei love buddi , care ! do not stress , live best !"
2,work shift killin dust factori bad hard breath everi day nust wors best pay job need money need abl breath tho sleep eat fucntion right rn im exaust 247 cuz sleep cuz got ta real life thing dure day time everywer open dont sleep,work shift killin dust factori bad hard breath everi day nust wors best pay job need money need abl breath tho sleep eat fucntion right rn im exaust 247 cuz sleep cuz got ta real life thing dure day time everywer open do not sleep
3,"im high schooler right , appli colleg im multivari calculu ( colleg level math cours ) right , onli way end class 97-100 testsi mind worri thi everi time think class , just want throw like math befor thi , im extrem rigor cours load , mani extra thing outsid school , 120 % activ just drain sleep best thing onli thing want bc im mental drainedi just dont know 15 week left thi calculu class continu grind im thi exhaust studi effici ? final come , realli need advic thank","im high schooler right , appli colleg im multivari calculu ( colleg level math cours ) right , onli way end class 97-100 testsi mind worri thi everi time think class , just want throw like math befor thi , im extrem rigor cours load , mani extra thing outsid school , 120 % activ just drain sleep best thing onli thing want bc im mental drainedi just do not know 15 week left thi calculu class continu grind im thi exhaust studi effici ? final come , realli need advic thank"
4,"physic , mental sexual frustrat street lack caus stress","physic , mental sexual frustrat street lack caus stress"


In [13]:
def tokenize_with_stopword(text):
    tokens = nltk.word_tokenize(text)
    terms = remove_stopword(tokens)
    return terms

def pre_processing_3(submissions):
    test = []
    for submission in submissions:
        terms_without_contracts = []
        for word in submission.split():
            terms_without_contracts.append(contractions.fix(word))
        text = " ".join(terms_without_contracts)
        
        tokens = nltk.word_tokenize(text)
        terms = remove_stopword(tokens)

        lemma_terms = lemma_tokens(terms)
        stem_terms = stem_tokens(lemma_terms)

        text = " ".join(stem_terms)
        test.append(text);
    return test

preprocessed_3 = pre_processing_3(submissions)

pd.options.display.max_colwidth = 1000
dataframe3 = pd.DataFrame(preprocessed_3, columns=["tokenize_with_stopword"])
dataframe3.head()

Unnamed: 0,tokenize_with_stopword
0,"* * i 5775 math right i 2 week bring math grade f d ( 3 point ) i need summer school re-tak year , mayb graduat i sophomor high-school math i good i 2 test , includ tomorrow my parent yell stress make self harm issu wors , tutor i have dizzi spell faint spell im stress i forget care life shit what i ? ? ? * *"
1,"if told , let remind you fuck awesomei love buddi , care ! stress , live best !"
2,i work shift it killin the dust factori bad hard breath day nust get wors best pay job i need money i need abl breath tho sleep eat fucntion right rn i exaust 247 cuz sleep cuz got real life thing day time when everywer open so sleep
3,"i high schooler right , appli colleg i take multivari calculu ( colleg level math cours ) right , way end a class i 97-100 testsi mind worri everi time i think class , i just want throw i like math , i take extrem rigor cours load , extra thing i outsid school , give 120 % activ just drain sleep best thing thing i want bc i mental drainedi just know go i 15 week left calculu class how i continu grind i exhaust studi effici ? i final come , i realli need advic thank"
4,"physic , mental sexual frustrat street lack caus stress"


In [14]:
def pre_processing_refactor_full(submissions):
    test = []
    for submission in submissions:
        terms_without_contracts = []
        for word in submission.split():
            terms_without_contracts.append(contractions.fix(word))
        text = " ".join(terms_without_contracts)
        
        tokens = nltk.word_tokenize(text)
        terms = remove_stopword(tokens)

        lemma_terms = lemma_tokens(terms)
        stem_terms = stem_tokens(lemma_terms)

        text = " ".join(stem_terms)
        test.append(text);
    return test

preprocessing_refactor_full = pre_processing_refactor_full(submissions)

pd.options.display.max_colwidth = 1000
df_preprocessing_full = pd.DataFrame(preprocessing_refactor_full, columns=["preprocessing_full"])
df_preprocessing_full.head()

Unnamed: 0,preprocessing_full
0,"* * i 5775 math right i 2 week bring math grade f d ( 3 point ) i need summer school re-tak year , mayb graduat i sophomor high-school math i good i 2 test , includ tomorrow my parent yell stress make self harm issu wors , tutor i have dizzi spell faint spell im stress i forget care life shit what i ? ? ? * *"
1,"if told , let remind you fuck awesomei love buddi , care ! stress , live best !"
2,i work shift it killin the dust factori bad hard breath day nust get wors best pay job i need money i need abl breath tho sleep eat fucntion right rn i exaust 247 cuz sleep cuz got real life thing day time when everywer open so sleep
3,"i high schooler right , appli colleg i take multivari calculu ( colleg level math cours ) right , way end a class i 97-100 testsi mind worri everi time i think class , i just want throw i like math , i take extrem rigor cours load , extra thing i outsid school , give 120 % activ just drain sleep best thing thing i want bc i mental drainedi just know go i 15 week left calculu class how i continu grind i exhaust studi effici ? i final come , i realli need advic thank"
4,"physic , mental sexual frustrat street lack caus stress"


In [15]:
def pre_processing_refactor_sw_only(submissions):
    test = []
    for submission in submissions:
        terms_without_contracts = []
        for word in submission.split():
            terms_without_contracts.append(contractions.fix(word))
        text = " ".join(terms_without_contracts)
        
        tokens = nltk.word_tokenize(text)
        terms = remove_stopword(tokens)

        text = " ".join(terms)
        test.append(text);
    return test

preprocessing_refactor_sw_only = pre_processing_refactor_sw_only(submissions)

pd.options.display.max_colwidth = 1000
df_preprocessing_sw_only = pd.DataFrame(preprocessing_refactor_sw_only, columns=["preprocessing_sw_only"])
df_preprocessing_sw_only.head()

Unnamed: 0,preprocessing_sw_only
0,"* * I 5775 Math right I 2 weeks bring math grade F D ( 3 points ) I need summer school re-take year , maybe graduate I Sophomore high-school Math I good I 2 tests , including tomorrow My parents yelling stressing making self harming issue worse , tutor I having dizzy spells fainting spells im stressed I forget care life shit WHAT I ? ? ? * *"
1,"If told , let remind You fucking awesomeI love buddy , care ! stress , live best !"
2,I work shifts Its killin The dust factory bad hard breath day nust gets worse best paying job I need money I need able breath tho sleep eat fucntion right rn I exausted 247 cuz sleep cuz got real life things day time When everyweres open So sleep
3,"I high schooler right , applying college I taking Multivariable calculus ( college level math course ) right , way end A class I 97-100 testsI mind worrying Every time I think class , I just want throw I liked math , I taking extremely rigorous course load , extra things I outside school , giving 120 % activity just draining Sleeping best thing thing I want bc I mentally drainedI just know going I 15 weeks left calculus class How I continue grind I exhausted study efficiently ? I finals coming , I really need advice Thanks"
4,"Physical , mentally sexual frustrated streets lack causing stress"


In [16]:
pd.options.display.max_colwidth = 1000
dataframe_preprocessed = pd.concat([dataframe, df_preprocessing_full, df_preprocessing_sw_only], axis=1, join='inner')

In [17]:
dataframe_preprocessed[['submission', 'preprocessing_sw_only', 'preprocessing_full']]

Unnamed: 0,submission,preprocessing_sw_only,preprocessing_full
0,"* * I have a 5775 in Math right now I have less than 2 weeks to bring up my math grade from an F to a D ( 3 points ) before I need to go to summer school and re-take it next year , and maybe not graduate I am a Sophomore in high-school Math has never been something I am good at I have 2 more tests , including one tomorrow My parents are yelling at me and stressing me out and making my self harming issue worse , and they wont get me a tutor Ive been having dizzy spells and fainting spells because im so stressed that I forget to take care of myself and life is shit WHAT do I do ? ? ? * *","* * I 5775 Math right I 2 weeks bring math grade F D ( 3 points ) I need summer school re-take year , maybe graduate I Sophomore high-school Math I good I 2 tests , including tomorrow My parents yelling stressing making self harming issue worse , tutor I having dizzy spells fainting spells im stressed I forget care life shit WHAT I ? ? ? * *","* * i 5775 math right i 2 week bring math grade f d ( 3 point ) i need summer school re-tak year , mayb graduat i sophomor high-school math i good i 2 test , includ tomorrow my parent yell stress make self harm issu wors , tutor i have dizzi spell faint spell im stress i forget care life shit what i ? ? ? * *"
1,"If nobody told you , let me remind you You are fucking awesomeI love you buddy , take care ! Dont stress , live your best !","If told , let remind You fucking awesomeI love buddy , care ! stress , live best !","if told , let remind you fuck awesomei love buddi , care ! stress , live best !"
2,I work third shifts Its killin me The dust in the factory is so bad its hard to breath every day nust gets worse its best paying job i ever had I need the money I need to be able to breath too tho i cant sleep eat or even fucntion right rn Im exausted 247 cuz i never get to sleep cuz i got ta do real life things during the day time When everyweres open So i dont get to sleep,I work shifts Its killin The dust factory bad hard breath day nust gets worse best paying job I need money I need able breath tho sleep eat fucntion right rn I exausted 247 cuz sleep cuz got real life things day time When everyweres open So sleep,i work shift it killin the dust factori bad hard breath day nust get wors best pay job i need money i need abl breath tho sleep eat fucntion right rn i exaust 247 cuz sleep cuz got real life thing day time when everywer open so sleep
3,"Im a high schooler right now , and am applying to college Im taking Multivariable calculus ( a college level math course ) right now , and the only way for me to end with an A in the class is if I get a 97-100 on the next two testsI cant keep my mind from worrying about this Every time I think about the class , I just want to throw up I liked math before this , but Im also taking an extremely rigorous course load , have so many extra things I do outside of school , and giving my 120 % to each activity is just draining me Sleeping is the best thing and only thing I want to do bc Im so mentally drainedI just dont know how to keep going I have 15 weeks left in this calculus class How do I continue the grind when Im this exhausted and study efficiently ? I have finals coming up , so I really need advice Thanks","I high schooler right , applying college I taking Multivariable calculus ( college level math course ) right , way end A class I 97-100 testsI mind worrying Every time I think class , I just want throw I liked math , I taking extremely rigorous course load , extra things I outside school , giving 120 % activity just draining Sleeping best thing thing I want bc I mentally drainedI just know going I 15 weeks left calculus class How I continue grind I exhausted study efficiently ? I finals coming , I really need advice Thanks","i high schooler right , appli colleg i take multivari calculu ( colleg level math cours ) right , way end a class i 97-100 testsi mind worri everi time i think class , i just want throw i like math , i take extrem rigor cours load , extra thing i outsid school , give 120 % activ just drain sleep best thing thing i want bc i mental drainedi just know go i 15 week left calculu class how i continu grind i exhaust studi effici ? i final come , i realli need advic thank"
4,"Physical , mentally and sexual frustrated from streets and lack of causing more stress","Physical , mentally sexual frustrated streets lack causing stress","physic , mental sexual frustrat street lack caus stress"
5,"Late night thoughts over stupid shitYou ca n't change anything It is near midnightNo action starts to feel like a whirlpool of stressFeeling stupid , hopeless , and like an imposterOpinionsIs there even a reason to give two fucks about what they thinkYou could have done that betterYou failedThey hate meWhat they think does n't matterThey judge meWhat they think matters ? I tried my best , but I never think it will ever be correct or even enoughI 'm not enoughStupid shit that should n't even matter","Late night thoughts stupid shitYou change It near midnightNo action starts feel like whirlpool stressFeeling stupid , hopeless , like imposterOpinionsIs reason fucks thinkYou betterYou failedThey hate meWhat think does matterThey judge meWhat think matters ? I tried best , I think correct enoughI enoughStupid shit matter","late night thought stupid shityou chang it near midnightno action start feel like whirlpool stressfeel stupid , hopeless , like imposteropinionsi reason fuck thinkyou bettery failedthey hate mewhat think doe matterthey judg mewhat think matter ? i tri best , i think correct enoughi enoughstupid shit matter"
6,And feeling I 'm almost passing out for real Only in my life i once semi fainted thos I guess I will leave the car and get near people so if that happens they can wake me Sry just ranting here to try chill,And feeling I passing real Only life semi fainted thos I guess I leave car near people happens wake Sry just ranting try chill,and feel i pass real onli life semi faint tho i guess i leav car near peopl happen wake sri just rant tri chill
7,Anyone with severe stress experience eye throbbing pain and back pain ?,Anyone severe stress experience eye throbbing pain pain ?,anyon sever stress experi eye throb pain pain ?
8,"Breaking up is no fun I know a lot of people would say just sell or get a roommate Im trying to hold on the my house for the kids I could sell and just buy a different one But realized itll be the same problem with bills I only know of one person the maybe could room mate with my brother but honestly I can only stand him for so long Child support is still a possibility I wish a little more money Overtime is an option but that means on missing time with my kids I just wish the easy way out was an easy option for me Im tired of being depressed , stress , all this anxiety is just to much I just want to be done …","Breaking fun I know lot people say just sell roommate I trying hold house kids I sell just buy different But realized problem bills I know person maybe room mate brother honestly I stand long Child support possibility I wish little money Overtime option means missing time kids I just wish easy way easy option I tired depressed , stress , anxiety just I just want …","break fun i know lot peopl say just sell roommat i tri hold hous kid i sell just buy differ but realiz problem bill i know person mayb room mate brother honestli i stand long child support possibl i wish littl money overtim option mean miss time kid i just wish easi way easi option i tire depress , stress , anxieti just i just want …"
9,"intense emotionsIs it normal to feel emotions really intensely and be deeply effected by them ? For example , sometimes I feel like im suffocation whenever im nervous about something , my heart racing and feel i start feeling out of control Stress effects me the same way i dont have a good reason about being nervous , its usually about small things But it still really effects me , and i dont know what to do about it I often have this sick feeling , like something bad is going to happen Or whenever i smell the fresh winter air i get sick to my stomach , somewhat in a good way I also feel intense when im happy , i feel like my heart is bursting Also , i dont often get angry Still , not so long ago i had this incident with a friend What basically happened is that i felt like they were trying to avoid me on purpose and for some reason this really ticked me off , it flipped a switch in my brain i ran after them , and pushed them not so hard that they fell , but still pretty hard Then i ...","intense emotionsIs normal feel emotions really intensely deeply effected ? For example , I feel like im suffocation im nervous , heart racing feel start feeling control Stress effects way good reason nervous , usually small things But really effects , know I sick feeling , like bad going happen Or smell fresh winter air sick stomach , somewhat good way I feel intense im happy , feel like heart bursting Also , angry Still , long ago incident friend What basically happened felt like trying avoid purpose reason really ticked , flipped switch brain ran , pushed hard fell , pretty hard Then went bitch whoops …I know caused react wayis normal problems like emotions ?","intens emotionsi normal feel emot realli intens deepli effect ? for exampl , i feel like im suffoc im nervou , heart race feel start feel control stress effect way good reason nervou , usual small thing but realli effect , know i sick feel , like bad go happen or smell fresh winter air sick stomach , somewhat good way i feel intens im happi , feel like heart burst also , angri still , long ago incid friend what basic happen felt like tri avoid purpos reason realli tick , flip switch brain ran , push hard fell , pretti hard then went bitch whoop …i know caus react wayi normal problem like emot ?"


In [18]:
%store dataframe_preprocessed

Stored 'dataframe_preprocessed' (DataFrame)
