In [84]:
# //Data Preprocessing with nltk tool steps
# 1st. Lower Case
# 2nd Tokenization
# 3rd  Stopwords Removal
# 4th Stemmed & Lemmatization
# 5th Removing Punctuation & Special Character
# 6th Joining Tokens
# 7th Text Vectorization (TF-IDF)

In [170]:
import pandas as pd
import nltk
nltk.download("punkt")


[nltk_data] Downloading package punkt to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [171]:
pip install openpyxl




# Lower Case

In [172]:
csv_path="processed4_data.xlsx"
data = pd.read_excel(csv_path)

In [173]:
data["text"]=data['text'].str.lower()
data["intent"]=data['intent'].str.lower()

# Data Tokenize

In [174]:
# Clean the 'text' column by converting non-string values to strings
data['text'] = data['text'].astype(str)

# Tokenize the 'text' column and create the 'tokenized_text' column
data['tokenized_text'] = data['text'].apply(nltk.word_tokenize)

# Print the DataFrame to see the tokenized results
print(data)

                                            text        intent  \
0                                  how are you??      greeting   
1                                          hello      greeting   
2                                     what's up?      greeting   
3                                         thanks  appreciation   
4                              thank you so much  appreciation   
..                                           ...           ...   
157  tell me a joke to lighten the mood.                  joke   
158  do you have a funny joke?                            joke   
159   share a joke with me.                               joke   
160   give me a lighthearted joke.                        joke   
161                               tell me a joke          joke   

                                     tokenized_text  \
0                             [how, are, you, ?, ?]   
1                                           [hello]   
2                                 [what, '

In [175]:
data['text']=data['text'].astype(str)
data['tokenized_text']=data['text'].apply(nltk.word_tokenize)

data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text
0,how are you??,greeting,"[how, are, you, ?, ?]","['?', '?']","['how', 'are', 'you', '?', '?']","['how', 'are', 'you??']","['how', 'are', 'you']"
1,hello,greeting,[hello],['hello'],['hello'],['hello'],['hello']
2,what's up?,greeting,"[what, 's, up, ?]","[""'s"", '?']","['what', ""'s"", 'up', '?']","[""what's"", 'up?']","['whats', 'up']"
3,thanks,appreciation,[thanks],['thanks'],['thank'],['thanks'],['thanks']
4,thank you so much,appreciation,"[thank, you, so, much]","['thank', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']"


In [176]:
columns=['text','intent','tokenized_text']

In [177]:
df=pd.DataFrame(data,columns=columns)
output_excel="processed_data.xlsx"

In [178]:
print(output_excel)

processed_data.xlsx


# Stopwords Removal

In [179]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [181]:
data['filtered_text']=data['tokenized_text'].apply(lambda tokens:[word for word in tokens if word.lower() not in stop_words])

In [182]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text
0,how are you??,greeting,"[how, are, you, ?, ?]","[?, ?]","['how', 'are', 'you', '?', '?']","['how', 'are', 'you??']","['how', 'are', 'you']"
1,hello,greeting,[hello],[hello],['hello'],['hello'],['hello']
2,what's up?,greeting,"[what, 's, up, ?]","['s, ?]","['what', ""'s"", 'up', '?']","[""what's"", 'up?']","['whats', 'up']"
3,thanks,appreciation,[thanks],[thanks],['thank'],['thanks'],['thanks']
4,thank you so much,appreciation,"[thank, you, so, much]","[thank, much]","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']"
5,good morning,greeting,"[good, morning]","[good, morning]","['good', 'morn']","['good', 'morning']","['good', 'morning']"
6,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']"
7,how are you?,greeting,"[how, are, you, ?]",[?],"['how', 'are', 'you', '?']","['how', 'are', 'you?']","['how', 'are', 'you']"
8,hello,greeting,[hello],[hello],['hello'],['hello'],['hello']
9,what's up?,greeting,"[what, 's, up, ?]","['s, ?]","['what', ""'s"", 'up', '?']","[""what's"", 'up?']","['whats', 'up']"


In [183]:
#save columns in the excel file
columns=['text','intent','tokenized_text','filtered_text']
df=pd.DataFrame(data,columns=columns)

In [184]:
output_excel="processed_data.xlsx"

In [185]:
df.to_excel(output_excel,index=False)

In [186]:
print(output_excel)

processed_data.xlsx


# Stemming & Lemmatization

In [187]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

data['stemmed_text'] = data['tokenized_text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
stemmed_texts = data['stemmed_text']
stemmed_texts

0                               [how, are, you, ?, ?]
1                                             [hello]
2                                   [what, 's, up, ?]
3                                             [thank]
4                              [thank, you, so, much]
                            ...                      
157    [tell, me, a, joke, to, lighten, the, mood, .]
158                [do, you, have, a, funni, joke, ?]
159                     [share, a, joke, with, me, .]
160                [give, me, a, lightheart, joke, .]
161                               [tell, me, a, joke]
Name: stemmed_text, Length: 162, dtype: object

In [188]:
columns=["text","intent","tokenized_text","filtered_text","stemmed_text"]

In [189]:
df=pd.DataFrame(data,columns=columns)

In [190]:
ouput_excel='processed3_data.xlsx'

In [191]:
df.to_excel(ouput_excel,index=False)

In [192]:
!python -m spacy download en

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 8.3 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.0
    Uninstalling en-core-web-sm-2.2.0:
      Successfully uninstalled en-core-web-sm-2.2.0
Successfully installed en-core-web-sm-3.6.0
[38;5;3m[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use
the full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [193]:
import spacy

In [194]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gzNote: you may need to restart the kernel to use updated packages.

  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
     ---------------------------------------- 12.0/12.0 MB 7.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019114 sha256=8aeb703a5357eb0f64ac370262d9eef4eb03c0fb7222d9b3f56fa76718d636c1
  Stored in directory: c:\users\siddesh vichare\appdata\local\pip\cache\wheels\f9\7e\12\0c885b1d01a93f5cfff2e269634078c488729f52129c8f7bde
Succ

In [195]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [196]:
pip install --upgrade nltk


Note: you may need to restart the kernel to use updated packages.


In [197]:
# import these modules
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [198]:
import pandas as pd
csv_path="processed_data.xlsx"
data = pd.read_excel(csv_path)

In [199]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text
0,how are you??,greeting,"['how', 'are', 'you', '?', '?']","['?', '?']"
1,hello,greeting,['hello'],['hello']
2,what's up?,greeting,"['what', ""'s"", 'up', '?']","[""'s"", '?']"
3,thanks,appreciation,['thanks'],['thanks']
4,thank you so much,appreciation,"['thank', 'you', 'so', 'much']","['thank', 'much']"


In [200]:
data['lemmatized_text'] = data['text'].apply(lambda sentence: [lemmatizer.lemmatize(token) for token in sentence.split()])

In [201]:
data['lemmatized_text']

0                                 [how, are, you??]
1                                           [hello]
2                                     [what's, up?]
3                                          [thanks]
4                            [thank, you, so, much]
                           ...                     
157    [tell, me, a, joke, to, lighten, the, mood.]
158                [do, you, have, a, funny, joke?]
159                     [share, a, joke, with, me.]
160              [give, me, a, lighthearted, joke.]
161                             [tell, me, a, joke]
Name: lemmatized_text, Length: 162, dtype: object

In [202]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text,lemmatized_text
0,how are you??,greeting,"['how', 'are', 'you', '?', '?']","['?', '?']","[how, are, you??]"
1,hello,greeting,['hello'],['hello'],[hello]
2,what's up?,greeting,"['what', ""'s"", 'up', '?']","[""'s"", '?']","[what's, up?]"
3,thanks,appreciation,['thanks'],['thanks'],[thanks]
4,thank you so much,appreciation,"['thank', 'you', 'so', 'much']","['thank', 'much']","[thank, you, so, much]"
5,good morning,greeting,"['good', 'morning']","['good', 'morning']","[good, morning]"
6,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"
7,how are you?,greeting,"['how', 'are', 'you', '?']",['?'],"[how, are, you?]"
8,hello,greeting,['hello'],['hello'],[hello]
9,what's up?,greeting,"['what', ""'s"", 'up', '?']","[""'s"", '?']","[what's, up?]"


In [203]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lem = WordNetLemmatizer()
lemmatized_words = [lem.lemmatize(word) for word in "I'm running".split()]
print(lemmatized_words)

["I'm", 'running']


In [204]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text']
df=pd.DataFrame(data,columns=columns)

In [205]:
ouput_excel='processed4_data.xlsx'

# Removing Punctuation and Special Characters

In [207]:
import re

In [208]:
def remove_punctuation(words_list):
    #Define pattern whcih is all Alphanumerical character
    # With the "r" prefix (raw string)
    pattern=r'[^a-zA-Z0-9\s]'
    clean_words=[re.sub(pattern,"",word) for word in words_list]
    return clean_words

In [209]:
data['cleaned_text']=data['lemmatized_text'].apply(remove_punctuation)

In [210]:
data['cleaned_text']

0                                  [how, are, you]
1                                          [hello]
2                                      [whats, up]
3                                         [thanks]
4                           [thank, you, so, much]
                          ...                     
157    [tell, me, a, joke, to, lighten, the, mood]
158                [do, you, have, a, funny, joke]
159                     [share, a, joke, with, me]
160              [give, me, a, lighthearted, joke]
161                            [tell, me, a, joke]
Name: cleaned_text, Length: 162, dtype: object

In [211]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text','cleaned_text']

In [212]:
df=pd.DataFrame(data,columns=columns)

In [213]:
output_excel="processed6_data.xlsx"

In [237]:
data['joined_text']=data['cleaned_text'].apply(lambda x: ''.join(x))

In [238]:
columns=["text","intent",'tokenized_text','stemmed_text','lemmatized_text','cleaned_text','joined_text']

In [242]:
df=pd.DataFrame(data,columns=columns)
output_excel='processed9_data.xlsx'

In [243]:
df.to_excel(output_excel,index=False)

In [244]:
csv_path='processed9_data.xlsx'

In [245]:
df=pd.read_excel(csv_path)

In [247]:
data=pd.DataFrame(df)

In [248]:
data.head()

Unnamed: 0,text,intent,tokenized_text,stemmed_text,lemmatized_text,cleaned_text,joined_text
0,how are you??,greeting,"['how', 'are', 'you', '?', '?']","['how', 'are', 'you', '?', '?']","['how', 'are', 'you??']","['how', 'are', 'you']","['how', 'are', 'you']"
1,hello,greeting,['hello'],['hello'],['hello'],['hello'],['hello']
2,what's up?,greeting,"['what', ""'s"", 'up', '?']","['what', ""'s"", 'up', '?']","[""what's"", 'up?']","['whats', 'up']","['whats', 'up']"
3,thanks,appreciation,['thanks'],['thank'],['thanks'],['thanks'],['thanks']
4,thank you so much,appreciation,"['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']"


In [276]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the original data
data_with_tfidf = pd.concat([data, tfidf_df], axis=1)

In [294]:
data_with_tfidf

Unnamed: 0,text,intent,tokenized_text,stemmed_text,lemmatized_text,cleaned_text,joined_text,about,active,add,...,where,who,will,with,words,year,you,your,youre,youtube
0,how are you??,greeting,"['how', 'are', 'you', '?', '?']","['how', 'are', 'you', '?', '?']","['how', 'are', 'you??']","['how', 'are', 'you']","['how', 'are', 'you']",,,,...,,,,,,,,,,
1,hello,greeting,['hello'],['hello'],['hello'],['hello'],['hello'],,,,...,,,,,,,,,,
2,what's up?,greeting,"['what', ""'s"", 'up', '?']","['what', ""'s"", 'up', '?']","[""what's"", 'up?']","['whats', 'up']","['whats', 'up']",,,,...,,,,,,,,,,
3,thanks,appreciation,['thanks'],['thank'],['thanks'],['thanks'],['thanks'],,,,...,,,,,,,,,,
4,thank you so much,appreciation,"['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']","['thank', 'you', 'so', 'much']",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
158,,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.239261,0.0,0.0,0.0
159,,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.618708,0.0,0.0,0.000000,0.0,0.0,0.0
160,,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0


In [291]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

#fir and Transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

#Convert the TF-IDF matrix to DataFrame
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(),columns=tfidf_vectorizer.get_feature_names_out())

data_with_tfidf=pd.concat([data,tfidf_df])