In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Algorithm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings("ignore")

import glob
import re

## Data Gathering

In [2]:
pos_folder_path = r"movie_reviews\pos"
neg_folder_path = r"movie_reviews\neg"

In [3]:
pos_review_files = glob.glob(f"{pos_folder_path}\\*.txt")
neg_review_files = glob.glob(f"{neg_folder_path}\\*.txt")

In [64]:
from nltk.tokenize import word_tokenize

def lemmitization(text):
    tokens = word_tokenize(text)
    post = PorterStemmer()
    
    output_text = []
    for word in tokens:
        lem_word = post.stem(word.lower())
        output_text.append(lem_word)
    
    text = " ".join(output_text)
    
    return text

In [65]:
text = "Python is Programming Language and Python Language is Preferable for Data Science"

lemmitization(text)

'python is program languag and python languag is prefer for data scienc'

In [66]:
review_list = []

for file_name in pos_review_files:
#     print("*" * 50 ,"File Name -->", file_name)
    with open(file_name, "r") as f:
            text = f.read()

    # Normalize the text
    text = text.lower()
    
    text = re.sub('[^a-z]', " ", text)
    
    # function call
    text = lemmitization(text)
    
    review_list.append(text)
    
for file_name in neg_review_files:
#     print("*" * 50 ,"File Name -->", file_name)
    with open(file_name, "r") as f:
            text = f.read()

    # Normalize the text
    text = text.lower()
    
    # Clean the text
    text = re.sub('[^a-z]', " ", text)
    
    # function call
    text = lemmitization(text)
    
    review_list.append(text)
    
len(review_list)    

2000

In [67]:
# Create Target Column
pos_target = np.ones(len(pos_review_files), dtype=int)
neg_target = np.zeros(len(neg_review_files), dtype=int)

y = np.append(pos_target, neg_target)

y = pd.Series(y)
y

0       1
1       1
2       1
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Length: 2000, dtype: int32

### CountVectorizer

In [68]:
# Create Instance
cnt_vector = CountVectorizer(stop_words='english', min_df = 0.05)

x_cnt_array = cnt_vector.fit_transform(review_list).toarray()

x_cnt = pd.DataFrame(x_cnt_array, columns=cnt_vector.get_feature_names())
x_cnt

Unnamed: 0,abil,abl,abov,absolut,accent,accept,achiev,act,action,actor,...,wouldn,write,writer,written,wrong,wrote,ye,year,york,young
0,0,0,0,0,2,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,2,0,0
3,0,0,0,0,0,0,0,6,0,1,...,0,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,0,0,6,0,1,...,0,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,0,0
1996,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,0,0
1997,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,1,...,1,0,0,1,1,0,0,0,0,0


In [21]:
len(cnt_vector.get_feature_names())

885

# Train Test Split

In [70]:
x_train, x_test, y_train, y_test = train_test_split(x_cnt, y, test_size=0.25, random_state=25, stratify=y)
x_train

Unnamed: 0,abil,abl,abov,absolut,accent,accept,achiev,act,action,actor,...,wouldn,write,writer,written,wrong,wrote,ye,year,york,young
118,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1374,0,0,0,0,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
1928,0,0,0,0,0,0,0,2,1,1,...,1,0,0,0,0,0,0,1,0,0
250,0,1,0,0,0,4,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1621,0,0,0,0,0,0,0,3,0,0,...,2,0,0,0,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,0,1,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
1730,0,0,0,0,0,0,0,2,0,1,...,0,0,0,0,0,0,0,0,0,1
725,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,2,2,0
850,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Training

### 1. GaussianNB

For good reults, data is continous and should be normally distributed.

In [71]:
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

In [72]:
# Testing Model Evaluation

y_pred = gnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[209 108]
 [ 41 142]]
------------------------------------------------------------
Accuracy: 0.702
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.66      0.74       317
           1       0.57      0.78      0.66       183

    accuracy                           0.70       500
   macro avg       0.70      0.72      0.70       500
weighted avg       0.74      0.70      0.71       500



In [73]:
# Training Model Evaluation

y_pred_train = gnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[678 221]
 [ 72 529]]
------------------------------------------------------------
Accuracy: 0.8046666666666666
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.75      0.82       899
           1       0.71      0.88      0.78       601

    accuracy                           0.80      1500
   macro avg       0.80      0.82      0.80      1500
weighted avg       0.82      0.80      0.81      1500



## 2. Multinomial NB

In [None]:
Will perform well on descrete data. 

In [74]:
mnb_model = MultinomialNB()
mnb_model.fit(x_train, y_train)

In [75]:
# Testing Model Evaluation

y_pred = mnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[203  55]
 [ 47 195]]
------------------------------------------------------------
Accuracy: 0.796
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.79      0.80       258
           1       0.78      0.81      0.79       242

    accuracy                           0.80       500
   macro avg       0.80      0.80      0.80       500
weighted avg       0.80      0.80      0.80       500



In [76]:
# Training Model Evaluation

y_pred_train = mnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[630 124]
 [120 626]]
------------------------------------------------------------
Accuracy: 0.8373333333333334
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       754
           1       0.83      0.84      0.84       746

    accuracy                           0.84      1500
   macro avg       0.84      0.84      0.84      1500
weighted avg       0.84      0.84      0.84      1500



## 3. Bernoullis NB

In [None]:
When we have binary descrete data

In [77]:
bnb_model = BernoulliNB()
bnb_model.fit(x_train, y_train)

In [78]:
# Testing Model Evaluation

y_pred = bnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[201  72]
 [ 49 178]]
------------------------------------------------------------
Accuracy: 0.758
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.74      0.77       273
           1       0.71      0.78      0.75       227

    accuracy                           0.76       500
   macro avg       0.76      0.76      0.76       500
weighted avg       0.76      0.76      0.76       500



In [79]:
# Training Model Evaluation

y_pred_train = bnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[659 166]
 [ 91 584]]
------------------------------------------------------------
Accuracy: 0.8286666666666667
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.80      0.84       825
           1       0.78      0.87      0.82       675

    accuracy                           0.83      1500
   macro avg       0.83      0.83      0.83      1500
weighted avg       0.83      0.83      0.83      1500



### TF-IDF

In [80]:
# Create Instance
tfidf_vector = TfidfVectorizer(stop_words='english', min_df = 0.05)

x_tfidf_array = tfidf_vector.fit_transform(review_list).toarray()

x_tfidf = pd.DataFrame(x_tfidf_array, columns=tfidf_vector.get_feature_names())
x_tfidf

Unnamed: 0,abil,abl,abov,absolut,accent,accept,achiev,act,action,actor,...,wouldn,write,writer,written,wrong,wrote,ye,year,york,young
0,0.000000,0.0,0.0,0.000000,0.154051,0.0,0.000000,0.077617,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.063681,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.029383,0.000000,0.000000
2,0.000000,0.0,0.0,0.089848,0.000000,0.0,0.000000,0.053714,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.091867,0.000000,0.000000
3,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.189863,0.0,0.030716,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.027060,0.055536,0.037962
4,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.189863,0.0,0.030716,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.027060,0.055536,0.037962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.0,0.0,0.041013,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.042116,0.0,0.0,0.000000,0.000000,0.0,0.039886,0.041935,0.000000,0.000000
1996,0.057076,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.055481,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.055242,0.000000,0.000000
1997,0.088080,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.049845,0.0,0.048384,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1998,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.044201,...,0.078218,0.0,0.0,0.063397,0.064761,0.0,0.000000,0.000000,0.000000,0.000000


In [81]:
print(cnt_vector.get_feature_names())

['abil', 'abl', 'abov', 'absolut', 'accent', 'accept', 'achiev', 'act', 'action', 'actor', 'actress', 'actual', 'ad', 'adapt', 'add', 'addit', 'admir', 'admit', 'adult', 'adventur', 'affect', 'age', 'agent', 'ago', 'agre', 'air', 'alien', 'aliv', 'allow', 'alon', 'alreadi', 'alway', 'amaz', 'america', 'american', 'amus', 'ani', 'anim', 'annoy', 'anoth', 'answer', 'anyon', 'anyth', 'apart', 'appar', 'appeal', 'appear', 'appreci', 'approach', 'appropri', 'aren', 'arm', 'arriv', 'art', 'artist', 'asid', 'ask', 'aspect', 'assist', 'atmospher', 'attack', 'attempt', 'attent', 'attract', 'audienc', 'author', 'averag', 'avoid', 'aw', 'award', 'away', 'babi', 'background', 'bad', 'bare', 'base', 'basic', 'battl', 'beat', 'beauti', 'becaus', 'becom', 'befor', 'begin', 'believ', 'ben', 'best', 'better', 'big', 'biggest', 'bit', 'black', 'blood', 'blow', 'blue', 'bodi', 'book', 'bore', 'boss', 'box', 'boy', 'brain', 'break', 'brief', 'brilliant', 'bring', 'british', 'brother', 'brought', 'bruce', 

# Train Test Split

In [82]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.25, random_state=25, stratify=y)
x_train

Unnamed: 0,abil,abl,abov,absolut,accent,accept,achiev,act,action,actor,...,wouldn,write,writer,written,wrong,wrote,ye,year,york,young
118,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.060797,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1374,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.078355,0.042737,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1928,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.048845,0.026642,0.023707,...,0.041951,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.020885,0.000000,0.000000
250,0.0,0.039314,0.0,0.0,0.0,0.186440,0.000000,0.026513,0.000000,0.000000,...,0.000000,0.000000,0.0,0.036912,0.000000,0.0,0.000000,0.022673,0.000000,0.000000
1621,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.072617,0.000000,0.000000,...,0.083156,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.020699,0.000000,0.087115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,0.0,0.046088,0.0,0.0,0.0,0.000000,0.000000,0.031082,0.000000,0.060341,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1730,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.057985,0.000000,0.028142,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.034781
725,0.0,0.000000,0.0,0.0,0.0,0.080881,0.000000,0.000000,0.050188,0.000000,...,0.000000,0.000000,0.0,0.000000,0.065432,0.0,0.000000,0.078687,0.161488,0.000000
850,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.040481,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


# Model Training

### 1. GaussianNB

In [None]:
For good reults, data is continous and should be normally distributed.

In [83]:
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

In [84]:
# Testing Model Evaluation

y_pred = gnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[194  71]
 [ 56 179]]
------------------------------------------------------------
Accuracy: 0.746
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.73      0.75       265
           1       0.72      0.76      0.74       235

    accuracy                           0.75       500
   macro avg       0.75      0.75      0.75       500
weighted avg       0.75      0.75      0.75       500



In [85]:
# Training Model Evaluation

y_pred_train = gnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[661  87]
 [ 89 663]]
------------------------------------------------------------
Accuracy: 0.8826666666666667
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       748
           1       0.88      0.88      0.88       752

    accuracy                           0.88      1500
   macro avg       0.88      0.88      0.88      1500
weighted avg       0.88      0.88      0.88      1500



## 2. Multinomial NB

In [None]:
Will perform well on descrete data. 

In [86]:
mnb_model = MultinomialNB()
mnb_model.fit(x_train, y_train)

In [87]:
# Testing Model Evaluation

y_pred = mnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[208  52]
 [ 42 198]]
------------------------------------------------------------
Accuracy: 0.812
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82       260
           1       0.79      0.82      0.81       240

    accuracy                           0.81       500
   macro avg       0.81      0.81      0.81       500
weighted avg       0.81      0.81      0.81       500



In [88]:
# Training Model Evaluation

y_pred_train = mnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[643 112]
 [107 638]]
------------------------------------------------------------
Accuracy: 0.854
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85       755
           1       0.85      0.86      0.85       745

    accuracy                           0.85      1500
   macro avg       0.85      0.85      0.85      1500
weighted avg       0.85      0.85      0.85      1500



## 3. Bernoullis NB

In [None]:
When we have binary descrete data

In [89]:
bnb_model = BernoulliNB()
bnb_model.fit(x_train, y_train)

In [90]:
# Testing Model Evaluation

y_pred = bnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred, y_test)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[201  72]
 [ 49 178]]
------------------------------------------------------------
Accuracy: 0.758
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.74      0.77       273
           1       0.71      0.78      0.75       227

    accuracy                           0.76       500
   macro avg       0.76      0.76      0.76       500
weighted avg       0.76      0.76      0.76       500



In [91]:
# Training Model Evaluation

y_pred_train = bnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy:",accuracy)
print("-"*60)

clf_report = classification_report(y_pred_train, y_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[659 166]
 [ 91 584]]
------------------------------------------------------------
Accuracy: 0.8286666666666667
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.80      0.84       825
           1       0.78      0.87      0.82       675

    accuracy                           0.83      1500
   macro avg       0.83      0.83      0.83      1500
weighted avg       0.83      0.83      0.83      1500



# Stemming

In [None]:
Normalized word it into base form
- Not giving you meaningful results
- heurastic/ dominant
- does not take words context in consideration

In [35]:
from nltk.stem import PorterStemmer, LancasterStemmer

In [52]:
lst = LancasterStemmer()
lst.stem("having")

'hav'

In [51]:
lst = LancasterStemmer()
lst.stem("caring")

'car'

In [38]:
lst = LancasterStemmer()
lst.stem("Playing")

'play'

In [39]:
text = "Python is Programming Language and Python Language is Preferable for Data Science"

text.split()

['Python',
 'is',
 'Programming',
 'Language',
 'and',
 'Python',
 'Language',
 'is',
 'Preferable',
 'for',
 'Data',
 'Science']

In [42]:
# Tokenization

from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

lst = LancasterStemmer()

for word in tokens:
    print(lst.stem(word))

python
is
program
langu
and
python
langu
is
pref
for
dat
sci


# Lemmitization

In [None]:
- Consider context of words
- Giving meaningful words
- we can add pos tag

In [43]:
from nltk.stem import WordNetLemmatizer

In [46]:
lemma = WordNetLemmatizer()
lemma.lemmatize("having", pos='v')  # bydefault = n

'have'

In [47]:
lemma = WordNetLemmatizer()
lemma.lemmatize("playing", pos='v')

'play'

In [50]:
lemma = WordNetLemmatizer()
lemma.lemmatize("caring", pos='v')

'care'

In [55]:
# Tokenization
text = "Python is Programming Language and Python Language is Preferable for Data Science"

from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

lemma = WordNetLemmatizer()

for word in tokens:
    print(lemma.lemmatize(word.lower(), pos='v'))

python
be
program
language
and
python
language
be
preferable
for
data
science
