In [84]:
# //Data Preprocessing with nltk tool steps
# 1st. Lower Case
# 2nd Tokenization
# 3rd  Stopwords Removal
# 4th Stemmed & Lemmatization
# 5th Removing Punctuation & Special Character
# 6th Joining Tokens
# 7th Text Vectorization (TF-IDF)

In [23]:
import pandas as pd
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
pip install openpyxl




# Lower Case

In [25]:
csv_path="Assistant_dataset.xlsx"
data = pd.read_excel(csv_path)

In [28]:
a=data

In [33]:
a.describe()

Unnamed: 0,text,intent
count,189,189
unique,186,14
top,ready for laugh tell me jokes,get_weather
freq,2,39


In [34]:
data["text"]=data['text'].str.lower()
data["intent"]=data['intent'].str.lower()

In [35]:
data["text"]

0                                hi jarvis
1                        hello goodmorning
2                hello jarvis how are you?
3                             good morning
4                           good afternoon
                      ...                 
184    tell me news about india growth gdp
185             goodbye will see you soon 
186                       hey how are you?
187          hey, how are you doing today?
188                             hey jarvis
Name: text, Length: 189, dtype: object

# Data Tokenize

In [36]:
# Clean the 'text' column by converting non-string values to strings
data['text'] = data['text'].astype(str)

# Tokenize the 'text' column and create the 'tokenized_text' column
data['tokenized_text'] = data['text'].apply(nltk.word_tokenize)

# Print the DataFrame to see the tokenized results
print(data)

                                    text    intent  \
0                              hi jarvis  greeting   
1                      hello goodmorning  greeting   
2              hello jarvis how are you?  greeting   
3                           good morning  greeting   
4                         good afternoon  greeting   
..                                   ...       ...   
184  tell me news about india growth gdp  get_news   
185           goodbye will see you soon    leaving   
186                     hey how are you?  greeting   
187        hey, how are you doing today?  greeting   
188                           hey jarvis  greeting   

                                  tokenized_text  
0                                   [hi, jarvis]  
1                           [hello, goodmorning]  
2              [hello, jarvis, how, are, you, ?]  
3                                [good, morning]  
4                              [good, afternoon]  
..                                           

In [37]:
data['text']=data['text'].astype(str)
data['tokenized_text']=data['text'].apply(nltk.word_tokenize)

data.head()

Unnamed: 0,text,intent,tokenized_text
0,hi jarvis,greeting,"[hi, jarvis]"
1,hello goodmorning,greeting,"[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]"
3,good morning,greeting,"[good, morning]"
4,good afternoon,greeting,"[good, afternoon]"


In [38]:
columns=['text','intent','tokenized_text']

In [39]:
df=pd.DataFrame(data,columns=columns)
output_excel="processed_data.xlsx"

In [40]:
print(output_excel)

processed_data.xlsx


# Stopwords Removal

In [41]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
data['filtered_text']=data['tokenized_text'].apply(lambda tokens:[word for word in tokens if word.lower() not in stop_words])

In [43]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]"
5,how are you?,greeting,"[how, are, you, ?]",[?]
6,hello,greeting,[hello],[hello]
7,what's up?,greeting,"[what, 's, up, ?]","['s, ?]"
8,hello good evening,greeting,"[hello, good, evening]","[hello, good, evening]"
9,hi goodmorning,greeting,"[hi, goodmorning]","[hi, goodmorning]"


In [44]:
#save columns in the excel file
columns=['text','intent','tokenized_text','filtered_text']
df=pd.DataFrame(data,columns=columns)

In [45]:
output_excel="processed_data.xlsx"

In [46]:
df.to_excel(output_excel,index=False)

In [47]:
print(output_excel)

processed_data.xlsx


# Stemming & Lemmatization

In [48]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

data['stemmed_text'] = data['tokenized_text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
stemmed_texts = data['stemmed_text']
stemmed_texts

0                                      [hi, jarvi]
1                                [hello, goodmorn]
2                 [hello, jarvi, how, are, you, ?]
3                                     [good, morn]
4                                [good, afternoon]
                          ...                     
184    [tell, me, news, about, india, growth, gdp]
185                 [goodby, will, see, you, soon]
186                        [hey, how, are, you, ?]
187          [hey, ,, how, are, you, do, today, ?]
188                                   [hey, jarvi]
Name: stemmed_text, Length: 189, dtype: object

In [49]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [50]:
columns=["text","intent","tokenized_text","filtered_text","stemmed_text"]

In [51]:
data.head(5)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [52]:
df=pd.DataFrame(data,columns=columns)
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [53]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [54]:
ouput_excel='processed3_data.xlsx'

In [55]:
df.to_excel(ouput_excel,index=False)
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [56]:
!python -m spacy download en

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 7.3 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.0
    Uninstalling en-core-web-sm-2.2.0:
      Successfully uninstalled en-core-web-sm-2.2.0
Successfully installed en-core-web-sm-3.6.0
[38;5;3m[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use
the full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [57]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [58]:
import spacy

In [59]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gzNote: you may need to restart the kernel to use updated packages.

  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
     ---------------------------------------- 12.0/12.0 MB 8.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019114 sha256=41871bd3eff0784af69771d788944da679f763301966987f289f1ddd4489c20e
  Stored in directory: c:\users\siddesh vichare\appdata\local\pip\cache\wheels\f9\7e\12\0c885b1d01a93f5cfff2e269634078c488729f52129c8f7bde
Succ

In [60]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\SIDDESH
[nltk_data]     VICHARE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
pip install --upgrade nltk

Note: you may need to restart the kernel to use updated packages.


In [62]:
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [63]:
# import these modules
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [64]:
import pandas as pd
csv_path="processed3_data.xlsx"
data = pd.read_excel(csv_path)

In [65]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']"


In [66]:
data['lemmatized_text'] = data['text'].apply(lambda sentence: [lemmatizer.lemmatize(token) for token in sentence.split()])

In [67]:
data['lemmatized_text']

0                                     [hi, jarvis]
1                             [hello, goodmorning]
2                  [hello, jarvis, how, are, you?]
3                                  [good, morning]
4                                [good, afternoon]
                          ...                     
184    [tell, me, news, about, india, growth, gdp]
185                [goodbye, will, see, you, soon]
186                          [hey, how, are, you?]
187           [hey,, how, are, you, doing, today?]
188                                  [hey, jarvis]
Name: lemmatized_text, Length: 189, dtype: object

In [68]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"


In [69]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"
5,how are you?,greeting,"['how', 'are', 'you', '?']",['?'],"['how', 'are', 'you', '?']","[how, are, you?]"
6,hello,greeting,['hello'],['hello'],['hello'],[hello]
7,what's up?,greeting,"['what', ""'s"", 'up', '?']","[""'s"", '?']","['what', ""'s"", 'up', '?']","[what's, up?]"
8,hello good evening,greeting,"['hello', 'good', 'evening']","['hello', 'good', 'evening']","['hello', 'good', 'even']","[hello, good, evening]"
9,hi goodmorning,greeting,"['hi', 'goodmorning']","['hi', 'goodmorning']","['hi', 'goodmorn']","[hi, goodmorning]"


In [70]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text']
df=pd.DataFrame(data,columns=columns)
df.head(5)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"


# Removing Punctuation and Special Characters

In [71]:
import re

In [72]:
def remove_punctuation(words_list):
    #Define pattern whcih is all Alphanumerical character
    # With the "r" prefix (raw string)
    pattern=r'[^a-zA-Z0-9\s]'
    clean_words=[re.sub(pattern,"",word) for word in words_list]
    return clean_words

In [73]:
data['cleaned_text']=data['lemmatized_text'].apply(remove_punctuation)

In [74]:
data['cleaned_text']

0                                     [hi, jarvis]
1                             [hello, goodmorning]
2                   [hello, jarvis, how, are, you]
3                                  [good, morning]
4                                [good, afternoon]
                          ...                     
184    [tell, me, news, about, india, growth, gdp]
185                [goodbye, will, see, you, soon]
186                           [hey, how, are, you]
187             [hey, how, are, you, doing, today]
188                                  [hey, jarvis]
Name: cleaned_text, Length: 189, dtype: object

In [75]:
data.columns

Index(['text', 'intent', 'tokenized_text', 'filtered_text', 'stemmed_text',
       'lemmatized_text', 'cleaned_text'],
      dtype='object')

In [76]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]"


In [77]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text','cleaned_text']

In [78]:
df=pd.DataFrame(data,columns=columns)

In [79]:
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]"


In [85]:
output_excel="processed6_data.xlsx"

In [86]:
data['joined_text']=data['cleaned_text'].apply(lambda x: ''.join(x))

In [87]:
columns=["text","intent",'tokenized_text','stemmed_text','lemmatized_text','cleaned_text','joined_text']

In [88]:
df=pd.DataFrame(data,columns=columns)
output_excel='processed11_data.xlsx'

In [84]:
df.to_excel(output_excel,index=False)

In [434]:
csv_path='processed11_data.xlsx'

In [218]:
df=pd.read_excel(csv_path)

In [219]:
data=pd.DataFrame(df)

In [435]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text,joined_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]",hijarvis
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]",hellogoodmorning
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]",hellojarvishowareyou
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]",goodmorning
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]",goodafternoon


In [436]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the original data
data_with_tfidf = pd.concat([data, tfidf_df], axis=1)

In [437]:
data_with_tfidf

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text,joined_text,anybreakingnewsonglobalevent,anynewsaboutindia,...,whatthetodayweatherinkerala,whatthetodayweatherinlucknow,whatthetodayweatherinnagpur,whatthetodayweatherinpune,whatthetomorrowweatherinkerala,whatthetomorrowweatherinpatna,whatthetopnewsinindiatoday,whattheweatherinbanglore,whattheweatherinmumbai,whatyourfavoritejoke
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]",hijarvis,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]",hellogoodmorning,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]",hellojarvishowareyou,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]",goodmorning,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]",goodafternoon,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,ready for laugh tell me jokes,joke,"['ready', 'for', 'laugh', 'tell', 'me', 'jokes']","['ready', 'laugh', 'tell', 'jokes']","['readi', 'for', 'laugh', 'tell', 'me', 'joke']","[ready, for, laugh, tell, me, joke]","[ready, for, laugh, tell, me, joke]",readyforlaughtellmejoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,need a break entertain me with joke,joke,"['need', 'a', 'break', 'entertain', 'me', 'wit...","['need', 'break', 'entertain', 'joke']","['need', 'a', 'break', 'entertain', 'me', 'wit...","[need, a, break, entertain, me, with, joke]","[need, a, break, entertain, me, with, joke]",needabreakentertainmewithjoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,what your favorite joke?,joke,"['what', 'your', 'favorite', 'joke', '?']","['favorite', 'joke', '?']","['what', 'your', 'favorit', 'joke', '?']","[what, your, favorite, joke?]","[what, your, favorite, joke]",whatyourfavoritejoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
147,tell me joke that makes everyone laugh,joke,"['tell', 'me', 'joke', 'that', 'makes', 'every...","['tell', 'joke', 'makes', 'everyone', 'laugh']","['tell', 'me', 'joke', 'that', 'make', 'everyo...","[tell, me, joke, that, make, everyone, laugh]","[tell, me, joke, that, make, everyone, laugh]",tellmejokethatmakeeveryonelaugh,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [438]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

#fir and Transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

#Convert the TF-IDF matrix to DataFrame
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(),columns=tfidf_vectorizer.get_feature_names_out())

data_with_tfidf=pd.concat([data,tfidf_df])