In [1]:
import nltk

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
sentence = 'The quick brown fox jumps over the lazy dog'

In [4]:
tokenized = word_tokenize(sentence)

In [5]:
tokenized

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
sentence = 'The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.'


In [8]:
tokenized = sent_tokenize(sentence)

In [9]:
tokenized

['The quick brown fox jumps over the lazy dog.',
 'The quick brown fox jumps over the lazy dog.',
 'The quick brown fox jumps over the lazy dog.']

In [10]:
from nltk.stem import PorterStemmer

words = ['program', 'programming', 'programed', 'programmer']

stem = []

ps = PorterStemmer()

for w in words:
    stem.append(ps.stem(w))

stem

['program', 'program', 'program', 'programm']

In [11]:
from nltk.stem import WordNetLemmatizer

words = ['stopped', 'study', 'floors', 'crying']

lem = []

wnl = WordNetLemmatizer()

for w in words:
    lem.append(wnl.lemmatize(w))
    
lem

['stopped', 'study', 'floor', 'cry']

In [12]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stopwords

text = 'AI was introduced in the year 1956 but it gained popularity recently.'

tokenized = word_tokenize(text)

remstopwords = []

for token in tokenized:
    if token not in set(stopwords):
        remstopwords.append(token)
        


In [13]:
remstopwords

['AI', 'introduced', 'year', '1956', 'gained', 'popularity', 'recently', '.']

In [14]:
text = 'AI was introduced in the year 1956 but it gained popularity recently.'

postag = []

tokenized = word_tokenize(text)

for token in tokenized:
    postag.append(nltk.pos_tag([token]))

corpus = [
    'The quick brown fox jumps over the lazy dog',
    'The brown fox is quick',
    'The lazy dog is sleeping'
]

In [15]:
postag

[[('AI', 'NN')],
 [('was', 'VBD')],
 [('introduced', 'VBN')],
 [('in', 'IN')],
 [('the', 'DT')],
 [('year', 'NN')],
 [('1956', 'CD')],
 [('but', 'CC')],
 [('it', 'PRP')],
 [('gained', 'VBN')],
 [('popularity', 'NN')],
 [('recently', 'RB')],
 [('.', '.')]]

In [16]:
def tf(corpus):
    dic={}
    for document in corpus:
        for word in document.split():
            if word in dic:
                dic[word]+=1
            else:
                dic[word]=1
        for word,freq in dic.items():
            dic[word]=freq/len(document.split())
    return dic
TF = tf(corpus)
TF

{'The': 0.24444444444444446,
 'quick': 0.044444444444444446,
 'brown': 0.044444444444444446,
 'fox': 0.044444444444444446,
 'jumps': 0.004444444444444444,
 'over': 0.004444444444444444,
 'the': 0.004444444444444444,
 'lazy': 0.20444444444444443,
 'dog': 0.20444444444444443,
 'is': 0.24,
 'sleeping': 0.2}

In [17]:
import math

In [18]:


n = len(corpus)

def idf(dic):
    idfdic = {}
    
    for tok, freq in dic.items():
        cnt = 0
        for document in corpus:
            if tok in document:
                cnt += 1
        idfdic[tok] = math.log(n / cnt)
    return idfdic 

IDF = idf(TF)
IDF

{'The': 0.0,
 'quick': 0.4054651081081644,
 'brown': 0.4054651081081644,
 'fox': 0.4054651081081644,
 'jumps': 1.0986122886681098,
 'over': 1.0986122886681098,
 'the': 1.0986122886681098,
 'lazy': 0.4054651081081644,
 'dog': 0.4054651081081644,
 'is': 0.4054651081081644,
 'sleeping': 1.0986122886681098}

In [19]:
tfidfsl = []
for document in corpus:
    tfidfs = {}
    for word in set(document.split()):
        tfidfs[word] = TF[word] * IDF[word]
    tfidfsl.append(tfidfs)

tfidfsl

[{'jumps': 0.004882721282969376,
  'over': 0.004882721282969376,
  'quick': 0.018020671471473973,
  'lazy': 0.08289508876878027,
  'the': 0.004882721282969376,
  'brown': 0.018020671471473973,
  'fox': 0.018020671471473973,
  'dog': 0.08289508876878027,
  'The': 0.0},
 {'quick': 0.018020671471473973,
  'brown': 0.018020671471473973,
  'fox': 0.018020671471473973,
  'is': 0.09731162594595945,
  'The': 0.0},
 {'sleeping': 0.21972245773362198,
  'lazy': 0.08289508876878027,
  'is': 0.09731162594595945,
  'dog': 0.08289508876878027,
  'The': 0.0}]

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidf = TfidfVectorizer()

In [22]:
result = tfidf.fit_transform(corpus)

In [23]:
print('\nWord indexes:')
print(tfidf.vocabulary_)


Word indexes:
{'the': 9, 'quick': 7, 'brown': 0, 'fox': 2, 'jumps': 4, 'over': 6, 'lazy': 5, 'dog': 1, 'is': 3, 'sleeping': 8}


In [24]:
print('\ntf-idf values:')
print(result)


tf-idf values:
  (0, 1)	0.30330642493908333
  (0, 5)	0.30330642493908333
  (0, 6)	0.3988114995291713
  (0, 4)	0.3988114995291713
  (0, 2)	0.30330642493908333
  (0, 0)	0.30330642493908333
  (0, 7)	0.30330642493908333
  (0, 9)	0.4710889922721062
  (1, 3)	0.46609584262774545
  (1, 2)	0.46609584262774545
  (1, 0)	0.46609584262774545
  (1, 7)	0.46609584262774545
  (1, 9)	0.3619650009883935
  (2, 8)	0.5694308628404254
  (2, 3)	0.43306684852870914
  (2, 1)	0.43306684852870914
  (2, 5)	0.43306684852870914
  (2, 9)	0.33631504064053513
