# Pre-processing

### 1. Tokenization: 

Tokenization is the process of breaking down a text into individual words, phrases, or symbols. In Python, you can use the NLTK or spaCy libraries for tokenization.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# nltk.download('punkt')  # uncomment on first run if the tokenizer data is missing

# Sample sentence; later cells reuse both `text` and `tokens`.
text = "This is PGDBA Data Science Class."

# Break the sentence into individual word and punctuation tokens.
tokens = word_tokenize(text)
print(tokens)

: 

In [2]:
# Label every token from the tokenization cell with its part-of-speech tag.
# The result is a list of (token, tag) pairs, e.g. ('is', 'VBZ').
# NOTE(review): pos_tag presumably needs an NLTK tagger model downloaded
# beforehand, like the other nltk.download hints in this notebook - confirm.
tagged = nltk.pos_tag(tokens)

# Display the list itself; slicing with [:] only made a needless copy.
tagged

[('This', 'DT'),
 ('is', 'VBZ'),
 ('PGDBA', 'NNP'),
 ('Data', 'NNP'),
 ('Science', 'NNP'),
 ('Class', 'NNP'),
 ('.', '.')]

### 2. Punctuation and Stop word removal: 

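Punctuation marks and stop words usually add little meaning for downstream analysis, so both are commonly removed. Stop words are common words that are usually removed from text because they don't carry much meaning, such as "a", "an", "the", "in", "of", etc. Punctuation can be stripped using Python's built-in `string.punctuation`, and you can use the NLTK library to remove stop words.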

In [3]:
import string
import re

# Deletion table: every character in string.punctuation maps to "removed".
punctuation_table = str.maketrans("", "", string.punctuation)

# Strip punctuation from the sample sentence defined in the tokenization cell.
no_punct = text.translate(punctuation_table)

print(no_punct)

This is PGDBA Data Science Class


In [5]:
from nltk.corpus import stopwords

# Fetch NLTK's English stop-word list and display it as the cell output.
english_stopwords = stopwords.words('english')
english_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
# nltk.download('stopwords')  # uncomment on first run if the corpus is missing

# A set gives fast membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Lowercase each token, then keep only the ones that are not stop words.
# The result must stay a list of tokens: wrapping it in "".join(...) glued
# the survivors into one unreadable string ('pgdbadatascienceclass.').
# Punctuation tokens such as '.' are not stop words, so they pass through.
lowered_tokens = [word.lower() for word in tokens]
filtered_tokens = [word for word in lowered_tokens if word not in stop_words]
print(filtered_tokens)

['pgdba', 'data', 'science', 'class', '.']


### 3. Stemming and Lemmatization: 
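Stemming and Lemmatization are techniques used to reduce words to their base form (or root form) so that similar words can be grouped together. Stemming strips suffixes with heuristic rules and may produce non-words (e.g. "history" becomes "histori"), whereas lemmatization maps a word to its dictionary form. In Python, you can use the NLTK or spaCy libraries for stemming and lemmatization.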

In [7]:
# nltk.download('wordnet')  # uncomment on first run if the WordNet data is missing

from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Sample words with shared roots; note this rebinds `text` and `tokens`
# from the earlier cells.
text = "rock rocks rocking history geography happy happyly"
tokens = word_tokenize(text)

# Lemmatize every token with the lemmatizer's default settings.
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_tokens)

# Stem every token with the Porter stemmer.
stemmed_tokens = list(map(stemmer.stem, tokens))
print(stemmed_tokens)

['rock', 'rock', 'rocking', 'history', 'geography', 'happy', 'happyly']
['rock', 'rock', 'rock', 'histori', 'geographi', 'happi', 'happyli']
