In [2]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import preprocess_string
import string

In [3]:
text = """crop science: by matching farming practices more closely to crop needs (e.g. fertilizer inputs);
environmental protection: by reducing environmental risks and footprint of farming (e.g. limiting leaching of nitrogen);"""
# Tokenization
tokens = list(tokenize(text))
print("Tokens:", tokens)

Tokens: ['crop', 'science', 'by', 'matching', 'farming', 'practices', 'more', 'closely', 'to', 'crop', 'needs', 'e', 'g', 'fertilizer', 'inputs', 'environmental', 'protection', 'by', 'reducing', 'environmental', 'risks', 'and', 'footprint', 'of', 'farming', 'e', 'g', 'limiting', 'leaching', 'of', 'nitrogen']


In [4]:
# Sentence Segmentation
sentences = text.split('.')
print("Sentences:", sentences)

Sentences: ['crop science: by matching farming practices more closely to crop needs (e', 'g', ' fertilizer inputs);\nenvironmental protection: by reducing environmental risks and footprint of farming (e', 'g', ' limiting leaching of nitrogen);']


In [5]:
# Lemmatization (Combining lowercasing and punctuation removal)
custom_filters = [lambda x: x.lower(), strip_punctuation]
lemmatized_tokens = preprocess_string(' '.join(tokens), filters=custom_filters)
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['crop', 'science', 'by', 'matching', 'farming', 'practices', 'more', 'closely', 'to', 'crop', 'needs', 'e', 'g', 'fertilizer', 'inputs', 'environmental', 'protection', 'by', 'reducing', 'environmental', 'risks', 'and', 'footprint', 'of', 'farming', 'e', 'g', 'limiting', 'leaching', 'of', 'nitrogen']


In [6]:
# Stemming (using Gensim's PorterStemmer)
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['crop', 'scienc', 'by', 'match', 'farm', 'practic', 'more', 'close', 'to', 'crop', 'need', 'e', 'g', 'fertil', 'input', 'environment', 'protect', 'by', 'reduc', 'environment', 'risk', 'and', 'footprint', 'of', 'farm', 'e', 'g', 'limit', 'leach', 'of', 'nitrogen']


In [7]:
# Word Normalization (Removing stop words and punctuation)
normalized_tokens = [token for token in tokens if token not in remove_stopwords(token) and token not in string.punctuation]
print("Normalized Tokens:", normalized_tokens)

Normalized Tokens: ['by', 'more', 'to', 'by', 'and', 'of', 'of']


In [13]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [14]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import string

In [26]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import nltk
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [32]:
# Example text
text = """crop science: by matching farming practices more closely to crop needs (e.g. fertilizer inputs);
environmental protection: by reducing environmental risks and footprint of farming (e.g. limiting leaching of nitrogen);"""


def tokenize_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    return tokens
# Tokenization
tokens = tokenize_text(text)
print("Tokens:", tokens)

Tokens: ['crop', 'science', ':', 'by', 'matching', 'farming', 'practices', 'more', 'closely', 'to', 'crop', 'needs', '(', 'e.g', '.', 'fertilizer', 'inputs', ')', ';', 'environmental', 'protection', ':', 'by', 'reducing', 'environmental', 'risks', 'and', 'footprint', 'of', 'farming', '(', 'e.g', '.', 'limiting', 'leaching', 'of', 'nitrogen', ')', ';']


In [33]:
def segment_sentences(text):
    # Sentence Segmentation
    sentences = sent_tokenize(text)
    return sentences
# Sentence Segmentation
sentences = segment_sentences(text)
print("Sentences:", sentences)


Sentences: ['crop science: by matching farming practices more closely to crop needs (e.g.', 'fertilizer inputs);\nenvironmental protection: by reducing environmental risks and footprint of farming (e.g.', 'limiting leaching of nitrogen);']


In [34]:
def lemmatize_tokens(tokens):
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmatized_tokens
# Lemmatization
lemmatized_tokens = lemmatize_tokens(tokens)
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['crop', 'science', ':', 'by', 'match', 'farm', 'practice', 'more', 'closely', 'to', 'crop', 'need', '(', 'e.g', '.', 'fertilizer', 'input', ')', ';', 'environmental', 'protection', ':', 'by', 'reduce', 'environmental', 'risk', 'and', 'footprint', 'of', 'farm', '(', 'e.g', '.', 'limit', 'leach', 'of', 'nitrogen', ')', ';']


In [35]:
def stem_tokens(tokens):
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
# Stemming
stemmed_tokens = stem_tokens(tokens)
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['crop', 'scienc', ':', 'by', 'match', 'farm', 'practic', 'more', 'close', 'to', 'crop', 'need', '(', 'e.g', '.', 'fertil', 'input', ')', ';', 'environment', 'protect', ':', 'by', 'reduc', 'environment', 'risk', 'and', 'footprint', 'of', 'farm', '(', 'e.g', '.', 'limit', 'leach', 'of', 'nitrogen', ')', ';']


In [36]:
def normalize_tokens(tokens):
    # Word Normalization (Removing stop words and punctuation)
    stop_words = set(stopwords.words('english'))
    normalized_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]
    return normalized_tokens
# Word Normalization
normalized_tokens = normalize_tokens(tokens)
print("Normalized Tokens:", normalized_tokens)

Normalized Tokens: ['crop', 'science', 'matching', 'farming', 'practices', 'closely', 'crop', 'needs', 'e.g', 'fertilizer', 'inputs', 'environmental', 'protection', 'reducing', 'environmental', 'risks', 'footprint', 'farming', 'e.g', 'limiting', 'leaching', 'nitrogen']
