<a href="https://colab.research.google.com/github/neuralsrg/NLP/blob/main/vector_spaces/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk 
import nltk.corpus
import nltk.tokenize
import nltk.stem
import re
import string

In [None]:
# Download the NLTK `twitter_samples` corpus that the following cells read from.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [None]:
# Read both labelled tweet files from the corpus; each call yields the tweets
# of one file as plain strings.
positive, negative = (
    nltk.corpus.twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [None]:
# Container type of each tweet collection.
tuple(type(corpus) for corpus in (positive, negative))

(list, list)

In [None]:
# Type of the first element in each tweet collection.
tuple(type(corpus[0]) for corpus in (positive, negative))

(str, str)

In [None]:
# Number of tweets in each collection.
tuple(map(len, (positive, negative)))

(5000, 5000)

In [None]:
# Show one example tweet from each class, one per line.
print(f'positive: {positive[64]}', f'negative: {negative[64]}', sep='\n')

positive: @CarcassDrop Woohoo! Can't wait :) Have you signed up yet, or still thinking about it? MKa
negative: HUNGRY :-(


# Preprocessing

In [None]:
# Pick a single positive tweet to carry through the preprocessing steps below.
tweet = positive[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

### Removing hyperlinks, retweet markers and hashtag signs

In [None]:
# Remove hyperlinks. `\S+` matches only the URL itself (up to the next
# whitespace), so any words that follow a link are kept. A greedy `.*` would
# instead delete everything from the link to the end of the line.
tweet = re.sub(r'https?://\S+', '', tweet)
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… '

In [None]:
# Two clean-ups in one pass: the inner call drops a leading "RT" retweet
# marker, the outer call deletes every '#' sign (the hashtag words stay).
tweet = re.sub(
    r'#',
    '',
    re.sub(r'^RT[\s]+', '', tweet),
)
tweet

'My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… '

### Tokenizing

In [None]:
# Tweet-aware tokenizer; every option is passed explicitly by keyword.
tokenizer = nltk.tokenize.TweetTokenizer(
    # remove Twitter handles from the text being tokenized
    strip_handles=True,
    # replace repeated character sequences of length 3 or greater with
    # sequences of length 3
    reduce_len=True,
    # do not keep the original letter case of the tokens
    preserve_case=False,
    # have the tokenize method look for phone numbers
    match_phone_numbers=True,
)

In [None]:
# Split the cleaned tweet into a list of tokens using the tokenizer built above.
tokens = tokenizer.tokenize(tweet)
tokens

['my',
 'beautiful',
 'sunflowers',
 'on',
 'a',
 'sunny',
 'friday',
 'morning',
 'off',
 ':)',
 'sunflowers',
 'favourites',
 'happy',
 'friday',
 'off',
 '…']

### Removing stop words and punctuation

In [None]:
# Download NLTK's stop-word corpus, then load its English word list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Punctuation characters from the standard library; the next cell uses them
# to filter the tokens.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Build lookup sets once so each membership test is a constant-time hash lookup.
# Using a set of single characters also makes the punctuation check an exact
# match: `word in string.punctuation` is a substring test on a str and would
# accidentally drop multi-character runs such as '()' or '<=>'.
stopword_set = set(stopwords)
punctuation_marks = set(string.punctuation)

# Keep a token only if it is neither a stop word nor a punctuation mark.
# Logical `and` (not bitwise `&`) expresses the condition and short-circuits.
tokens = [
    word
    for word in tokens
    if word not in stopword_set and word not in punctuation_marks
]
tokens

['beautiful',
 'sunflowers',
 'sunny',
 'friday',
 'morning',
 ':)',
 'sunflowers',
 'favourites',
 'happy',
 'friday',
 '…']

In [None]:
# Stemming: map every remaining token to its Porter stem.
stemmer = nltk.stem.PorterStemmer()
stems = list(map(stemmer.stem, tokens))
stems

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']