In [122]:
import nltk
from nltk import word_tokenize

In [123]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#Tokenization

In [124]:
sentence = 'This animal tale, for kids is - about a  @fisherman'
tokens = word_tokenize(sentence)
tokens

['This',
 'animal',
 'tale',
 ',',
 'for',
 'kids',
 'is',
 '-',
 'about',
 'a',
 '@',
 'fisherman']

In [125]:
sentence = sentence.lower()
tokens = word_tokenize(sentence)
tokens

['this',
 'animal',
 'tale',
 ',',
 'for',
 'kids',
 'is',
 '-',
 'about',
 'a',
 '@',
 'fisherman']

In [126]:
import string
sentence_without_punctuation = "".join([char for char in sentence if char not in string.punctuation])

tokens_without_punctuation = word_tokenize(sentence_without_punctuation)

print(tokens_without_punctuation)

['this', 'animal', 'tale', 'for', 'kids', 'is', 'about', 'a', 'fisherman']


In [127]:
#Lower casing
sentence = "Books are on the Table"
print(sentence.lower())

books are on the table


In [128]:
print(sentence.upper())

BOOKS ARE ON THE TABLE


#Removing stop words

In [129]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [130]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words[0:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


#Stemming:process of reducing the word stems or roots

In [131]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()

words = ["program",'programs','programmer','programming','programmers']
for w in words:
  print(w,":",ps.stem(w))

program : program
programs : program
programmer : programm
programming : program
programmers : programm


#Removing punctuation from a string

In [132]:
import string
def remove_punctuation(input_string):
  translator = str.maketrans("","",string.punctuation)

  result = input_string.translate(translator)
  return result
text = "Hello, world! This is a @ sample string with punctuation. A spaces!,()"
output = remove_punctuation(text)

print("Original String:",text)
print("String without punctuation:",output)


Original String: Hello, world! This is a @ sample string with punctuation. A spaces!,()
String without punctuation: Hello world This is a  sample string with punctuation A spaces


#N-grams

In [133]:
from nltk.util import ngrams
sentence = "Natural language processing is a field of study focused on the text"
words = sentence.split()

#bigrams
bigrams = ngrams(words,2)
for bigram in bigrams:
  print(bigram)

('Natural', 'language')
('language', 'processing')
('processing', 'is')
('is', 'a')
('a', 'field')
('field', 'of')
('of', 'study')
('study', 'focused')
('focused', 'on')
('on', 'the')
('the', 'text')


In [134]:
#trigrams
trigrams = ngrams(words,3)
for trigram in trigrams:
  print(trigram)

('Natural', 'language', 'processing')
('language', 'processing', 'is')
('processing', 'is', 'a')
('is', 'a', 'field')
('a', 'field', 'of')
('field', 'of', 'study')
('of', 'study', 'focused')
('study', 'focused', 'on')
('focused', 'on', 'the')
('on', 'the', 'text')


In [135]:
#unigrams
unigrams = ngrams(words,1)
for unigram in unigrams:
  print(unigram)

('Natural',)
('language',)
('processing',)
('is',)
('a',)
('field',)
('of',)
('study',)
('focused',)
('on',)
('the',)
('text',)


#frequency analysis for unigram,bigrams,trigrams

In [136]:
corpus = [
    'Great oppurtunity. Love the Professor',
    'Great content. Textbook was great',
    'This course is very interesting. Great content',
    'very interesting',
    'Love the professor',
    'Easy to understand'
]
import pandas as pd
df = pd.DataFrame(corpus)
df.columns = ['reviews']
df.head(3)

Unnamed: 0,reviews
0,Great oppurtunity. Love the Professor
1,Great content. Textbook was great
2,This course is very interesting. Great content


In [137]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english') + ['though']
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(stop_words=stoplist,ngram_range=(2,3))

#matrix of ngrams
ngrams = c_vec.fit_transform(df['reviews'])

# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)

#list of ngrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()],reverse = True)).rename(columns={0:'frequency',1:'bigram/trigram'})
df_ngram.head(5)

Unnamed: 0,frequency,bigram/trigram
0,2,love professor
1,2,great content
2,1,textbook great
3,1,oppurtunity love professor
4,1,oppurtunity love
