## Stopword removal & Tokenization

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/mamieo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mamieo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mamieo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
%store -r all_categories

#### Example tokenization and remove stopword 

In [12]:
all_categories['stress']['Selftext'][0]

'Its about to be 2020 and Im panicking because I dont know what to do I get nervous thinking about the future and imagining what could happen and its really stressing me out😓'

In [13]:
def token(submission):
    token_words = word_tokenize(submission)
    return token_words
    
terms = token(all_categories['stress']['Selftext'][0])

print(terms)

['Its', 'about', 'to', 'be', '2020', 'and', 'Im', 'panicking', 'because', 'I', 'dont', 'know', 'what', 'to', 'do', 'I', 'get', 'nervous', 'thinking', 'about', 'the', 'future', 'and', 'imagining', 'what', 'could', 'happen', 'and', 'its', 'really', 'stressing', 'me', 'out😓']


In [17]:
def stem_tokens(terms):
    porter = PorterStemmer()
    stem_terms = []
    for term in terms:
        stem_terms.append(porter.stem(term))
    return stem_terms;

def lemma_tokens(terms):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_terms = []

    for term in terms:
        lemma_terms.append(wordnet_lemmatizer.lemmatize(term))
    return lemma_terms;

def remove_stopword(terms):
    stop_words = set(stopwords.words('english'))

    tokens_without_sw = [word for word in terms if not word in stop_words]
    return tokens_without_sw
    
lemma_terms = lemma_tokens(terms)
stem_terms = stem_tokens(lemma_terms)
terms = remove_stopword(terms)

print(stem_terms)

['it', '2020', 'im', 'panick', 'i', 'dont', 'know', 'i', 'get', 'nervou', 'think', 'futur', 'imagin', 'could', 'happen', 'realli', 'stress', 'out😓']


#### def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = convert_numbers(data)
    data = remove_stop_words(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)

1. No need to convert lower case because stemming and lemma will do it
2. Punctuation is the set of unnecessary symbols that are in our corpus documents.
3. Note that there is no ‘ apostrophe in the punctuation symbols. Because when we remove punctuation first it will convert don’t to dont, and it is a stop word that won't be removed.
4. Single characters are not much useful in knowing the importance of the document and few final single characters might be irrelevant symbols
5. Stemming, playing and played are the same type of words that basically indicate an action play.
6. Lemmatisation is a way to reduce the word to the root synonym of a word.

### Thus, 
#### if the word is very common and appears in many documents, this number will approach 0. Otherwise, it will approach 1.

- The most significant word for **document A** is man and walk
- The most significant word for **document B** is around, children, fire, and sat

# Preprocessing - Real Data

In [24]:
submission1 = all_categories['stress']['Selftext'][0]
submission2 = all_categories['stress']['Selftext'][1]
submission3 = all_categories['stress']['Selftext'][2]

submissions = [submission1, submission2, submission3]

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    lemma_terms = lemma_tokens(tokens)
    stem_terms = stem_tokens(lemma_terms)
    terms = remove_stopword(stem_terms)
    return terms

# vectorizer = TfidfVectorizer(tokenizer=tokenize)
# submission_vectors = vectorizer.fit_transform(submissions)

# dense = submission_vectors.todense()
# submission_list = dense.tolist()

df = pd.DataFrame(submission_list, columns=vectorizer.get_feature_names())
df



Unnamed: 0,'m,",",2020,abl,activ,allow,also,ani,appli,arent,...,think,though,time,tri,wa,week,wont,worri,would,year
0,0.0,0.0,0.286675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.218024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.491607,0.0,0.080801,0.080801,0.0,0.080801,0.161601,0.080801,0.080801,...,0.0,0.080801,0.122902,0.061451,0.323202,0.080801,0.080801,0.061451,0.0,0.061451
2,0.100399,0.152712,0.0,0.0,0.0,0.100399,0.0,0.0,0.0,0.0,...,0.076356,0.0,0.152712,0.076356,0.0,0.0,0.0,0.305425,0.200798,0.076356


In [81]:
df.sort_values(by=['worri'], ascending=False)

Unnamed: 0,'m,",",2020,abl,activ,allow,also,ani,appli,arent,...,think,though,time,tri,wa,week,wont,worri,would,year
2,0.100399,0.152712,0.0,0.0,0.0,0.100399,0.0,0.0,0.0,0.0,...,0.076356,0.0,0.152712,0.076356,0.0,0.0,0.0,0.305425,0.200798,0.076356
1,0.0,0.491607,0.0,0.080801,0.080801,0.0,0.080801,0.161601,0.080801,0.080801,...,0.0,0.080801,0.122902,0.061451,0.323202,0.080801,0.080801,0.061451,0.0,0.061451
0,0.0,0.0,0.286675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.218024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
%store -r all_categories

In [31]:
for submission in submissions:
    print(submission)
    print("====")
    terms = tokenize(submission)
    print(terms)
    print("=======")
    
    text = " ".join(terms)
    print(text)
    print("#####")

# df = pd.DataFrame(submission_list, columns=vectorizer.get_feature_names())
# df

Its about to be 2020 and Im panicking because I dont know what to do I get nervous thinking about the future and imagining what could happen and its really stressing me out😓
====
['2020', 'im', 'panick', 'becaus', 'dont', 'know', 'get', 'nervou', 'think', 'futur', 'imagin', 'could', 'happen', 'realli', 'stress', 'out😓']
2020 im panick becaus dont know get nervou think futur imagin could happen realli stress out😓
Im a sophomore in college, and last semester was super busy I immersed myself in a lot of activities last year, mostly because I was really depressed and needed things to do on campus Ive enjoyed all the things Im a part of now though, but it definitely takes up a lot of time I should have searched for more internships last semester, but I procrastinated and was also sick all the time I applied for a big internship that was through my school, but I didnt get it, and I didnt look into any other internshipsIm going back to school in a couple weeks and the stress is really settlin

##  Summary without Bigrams