#### 1 tweet = document

1) lower case

2) remove stopwords

3) remove stems and lemmas

## Import Packages

In [28]:
import pandas as pd
import numpy as np
from nltk.stem import LancasterStemmer, SnowballStemmer, RegexpStemmer, WordNetLemmatizer 

## Create DataFrame and EDA

In [3]:
df = pd.read_csv('cleaned_all_samples.csv')

In [4]:
print(len(df))
df.head()

243390


Unnamed: 0.1,Unnamed: 0,hashtags,label,like_count,reply_count,retweet_count,tweet,cleaned_tweets
0,0,[politics],1,0.0,0.0,0.0,#politics NH gun shop owner gifts Trump semi-a...,NH gun shop owner gifts Trump semi-automatic r...
1,1,"[ISIS, targets, iceisis, opiceisis]",1,0.0,0.0,0.0,RT @CtrlSec: Targeted #ISIS accounts https://t...,Targeted accounts
2,2,[],0,42.0,0.0,9.0,16-year-old environmental activist Greta Thunb...,16-year-old environmental activist Greta Thunb...
3,3,['#moleg'],0,6.0,1.0,1.0,"Thank God for @repdottieb4mo Under her watch,...","Thank God for Under her watch, 110 remains ..."
4,4,[],0,4.0,0.0,0.0,The Rise of Purpose Education: A Recipe for Fu...,The Rise of Purpose Education: A Recipe for Fu...


## Word Tokenize

In [5]:
#import sentence tokenizer
from nltk import sent_tokenize

#import word tokenizer
from nltk import word_tokenize

In [6]:
#this was part of the NLP notebook

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/markbrennan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
#eda to check for na's

df.isnull().sum()

#NOTE: some tweets became NaN's after they were cleaned, probably insignificant

Unnamed: 0            0
hashtags          10770
label                 0
like_count           96
reply_count          96
retweet_count        96
tweet                 0
cleaned_tweets       74
dtype: int64

In [8]:
#drop the NaN's from cleaned_tweets

df['cleaned_tweets'].dropna(inplace=True)

In [9]:
df.isnull().sum() # checking NaN's again

Unnamed: 0            0
hashtags          10770
label                 0
like_count           96
reply_count          96
retweet_count        96
tweet                 0
cleaned_tweets       74
dtype: int64

## Lowercase and Tokenize the Tweets

In [12]:
#lowercase all the values in the cleaned tweets

df['cleaned_tweets'] = df.cleaned_tweets.astype(str).str.lower()

In [13]:
#tokenize all rows of cleaned tweets

df['tokenized_tweets'] = df['cleaned_tweets'].apply(word_tokenize)

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,hashtags,label,like_count,reply_count,retweet_count,tweet,cleaned_tweets,tokenized_tweets
0,0,[politics],1,0.0,0.0,0.0,#politics NH gun shop owner gifts Trump semi-a...,nh gun shop owner gifts trump semi-automatic r...,"[nh, gun, shop, owner, gifts, trump, semi-auto..."
1,1,"[ISIS, targets, iceisis, opiceisis]",1,0.0,0.0,0.0,RT @CtrlSec: Targeted #ISIS accounts https://t...,targeted accounts,"[targeted, accounts]"
2,2,[],0,42.0,0.0,9.0,16-year-old environmental activist Greta Thunb...,16-year-old environmental activist greta thunb...,"[16-year-old, environmental, activist, greta, ..."
3,3,['#moleg'],0,6.0,1.0,1.0,"Thank God for @repdottieb4mo Under her watch,...","thank god for under her watch, 110 remains ...","[thank, god, for, under, her, watch, ,, 110, r..."
4,4,[],0,4.0,0.0,0.0,The Rise of Purpose Education: A Recipe for Fu...,the rise of purpose education: a recipe for fu...,"[the, rise, of, purpose, education, :, a, reci..."


## Removing Stop Words

In [15]:
from nltk.corpus import stopwords

In [16]:
my_stopwords = set(stopwords.words('english'))
# my_stopwords

In [17]:
#remove stop words

df['tokenized_tweets'] = df['tokenized_tweets'].apply(lambda x: [item for item in x if item not in my_stopwords])


#### Comparing for stop words

In [19]:
df['cleaned_tweets'][28] # checking for stop words

't-minus 5 days until nyc!'

In [20]:
df['tokenized_tweets'][28] # checking for stop words

['t-minus', '5', 'days', 'nyc', '!']

## Removing Punctuation

In [21]:
import string

In [22]:
punct =[]
punct += list(string.punctuation)
punct += '’'

In [23]:
punct

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '’']

In [24]:
#borrowed code from StackOverFlow

df['tokenized_tweets'] = df['tokenized_tweets'].apply(lambda x: [item for item in x if item not in punct])


#### Compare the cleaned / tokenized columns

In [25]:
df['tweet'][10] #comparing token cleaning

'It’s something that’s been sticking out and definitely bothering me! There’s the narrative that we’re so liberal but 2018 was basically all moderates winning. And then of course the decision as to what’s a sexy storyline and what isn’t. Think votes are what count 🤔'

In [26]:
df['cleaned_tweets'][10] #comparing token cleaning

'it’s something that’s been sticking out and definitely bothering me! there’s the narrative that we’re so liberal but 2018 was basically all moderates winning. and then of course the decision as to what’s a sexy storyline and what isn’t. think votes are what count '

In [27]:
df['tokenized_tweets'][10] #comparing token cleaning

['something',
 'sticking',
 'definitely',
 'bothering',
 'narrative',
 'liberal',
 '2018',
 'basically',
 'moderates',
 'winning',
 'course',
 'decision',
 'sexy',
 'storyline',
 'think',
 'votes',
 'count']

### Convert the tokens back to a string for stems and lemma

In [157]:
#df[dummy] is a dummy column to perform stem and lemma on

df['dummy'] = df['tokenized_tweets'].apply(lambda x: ' '.join(map(str, x)))


In [158]:
df.head() # checking results

Unnamed: 0.1,Unnamed: 0,hashtags,label,like_count,reply_count,retweet_count,tweet,cleaned_tweets,tokenized_tweets,dummy
0,0,[politics],1,0.0,0.0,0.0,#politics NH gun shop owner gifts Trump semi-a...,nh gun shop owner gifts trump semi-automatic r...,"[nh, gun, shop, owner, gifts, trump, semi-auto...",nh gun shop owner gifts trump semi-automatic r...
1,1,"[ISIS, targets, iceisis, opiceisis]",1,0.0,0.0,0.0,RT @CtrlSec: Targeted #ISIS accounts https://t...,targeted accounts,"[targeted, accounts]",targeted accounts
2,2,[],0,42.0,0.0,9.0,16-year-old environmental activist Greta Thunb...,16-year-old environmental activist greta thunb...,"[16-year-old, environmental, activist, greta, ...",16-year-old environmental activist greta thunb...
3,3,['#moleg'],0,6.0,1.0,1.0,"Thank God for @repdottieb4mo Under her watch,...","thank god for under her watch, 110 remains ...","[thank, god, watch, 110, remains, safe, secure...",thank god watch 110 remains safe secure leftis...
4,4,[],0,4.0,0.0,0.0,The Rise of Purpose Education: A Recipe for Fu...,the rise of purpose education: a recipe for fu...,"[rise, purpose, education, recipe, fulfillment...",rise purpose education recipe fulfillment snow...


## Stems and Lemmas

In [126]:
#Import stemmers and lemmatizers

from nltk.stem import LancasterStemmer, SnowballStemmer, RegexpStemmer, WordNetLemmatizer 

In [142]:
#instantiating

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer() #more aggressive
regex_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
lemma = WordNetLemmatizer()

In [145]:
# functions to get stems and lemmas

def stem_words(document,stemmer):
    # tokenize the text
    toks = word_tokenize(document)
    wrd_list = []
    # go through the tokens
    for word in toks:
        # stem the tokens
        wrd_list.append(stemmer.stem(word))
    # return them
    return " ".join(wrd_list)


def lem_words(document,lemmer):
    toks = word_tokenize(document)
    wrd_list = []
    for word in toks:
        wrd_list.append(lemmer.lemmatize(word))
    return " ".join(wrd_list)

In [131]:
df['tokenized_tweets'][10]

['something',
 'sticking',
 'definitely',
 'bothering',
 'narrative',
 'liberal',
 '2018',
 'basically',
 'moderates',
 'winning',
 'course',
 'decision',
 'sexy',
 'storyline',
 'think',
 'votes',
 'count']

### combine tokens into a string


In [138]:
#testing a randomy dummy row to see how each stemmer/lemmer performs
dummy = ' '.join(df['tokenized_tweets'][10])

In [139]:
stem_words(dummy, snowball)

'someth stick definit bother narrat liber 2018 basic moder win cours decis sexi storylin think vote count'

In [143]:
stem_words(dummy, lancaster)

'someth stick definit both nar lib 2018 bas mod win cours decid sexy storylin think vot count'

In [144]:
stem_words(dummy, regex_stemmer)

'someth stick definitely bother narrativ liberal 2018 basically moderate winn cours decision sexy storylin think vote count'

In [146]:
lem_words(dummy, lemma)

'something sticking definitely bothering narrative liberal 2018 basically moderate winning course decision sexy storyline think vote count'

### Just testing stuff out now

In [154]:
lemma.lemmatize('friends')

'friend'

In [147]:
df['tokenized_tweets'][10]

['something',
 'sticking',
 'definitely',
 'bothering',
 'narrative',
 'liberal',
 '2018',
 'basically',
 'moderates',
 'winning',
 'course',
 'decision',
 'sexy',
 'storyline',
 'think',
 'votes',
 'count']

## Vectorize