In [84]:
# //Data Preprocessing with nltk tool steps
# 1st. Lower Case
# 2nd Tokenization
# 3rd  Stopwords Removal
# 4th Stemmed & Lemmatization
# 5th Removing Punctuation & Special Character
# 6th Joining Tokens
# 7th Text Vectorization (TF-IDF)

In [373]:
import pandas as pd
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [374]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


# Lower Case

In [375]:
csv_path="assistant_data.xlsx"
data = pd.read_excel(csv_path)

In [376]:
data.columns

Index(['text', 'intent'], dtype='object')

In [377]:
data["text"]=data['text'].str.lower()
data["intent"]=data['intent'].str.lower()

In [378]:
data["text"]

0                                    hi jarvis
1                            hello goodmorning
2                    hello jarvis how are you?
3                                 good morning
4                               good afternoon
                        ...                   
144              ready for laugh tell me jokes
145        need a break entertain me with joke
146                   what your favorite joke?
147     tell me joke that makes everyone laugh
148    jokes make any day better tell me jokes
Name: text, Length: 149, dtype: object

# Data Tokenize

In [379]:
# Clean the 'text' column by converting non-string values to strings
data['text'] = data['text'].astype(str)

# Tokenize the 'text' column and create the 'tokenized_text' column
data['tokenized_text'] = data['text'].apply(nltk.word_tokenize)

# Print the DataFrame to see the tokenized results
print(data)

                                        text    intent  \
0                                  hi jarvis  greeting   
1                          hello goodmorning  greeting   
2                  hello jarvis how are you?  greeting   
3                               good morning  greeting   
4                             good afternoon  greeting   
..                                       ...       ...   
144            ready for laugh tell me jokes      joke   
145      need a break entertain me with joke      joke   
146                 what your favorite joke?      joke   
147   tell me joke that makes everyone laugh      joke   
148  jokes make any day better tell me jokes      joke   

                                       tokenized_text  
0                                        [hi, jarvis]  
1                                [hello, goodmorning]  
2                   [hello, jarvis, how, are, you, ?]  
3                                     [good, morning]  
4                      

In [380]:
data['text']=data['text'].astype(str)
data['tokenized_text']=data['text'].apply(nltk.word_tokenize)

data.head()

Unnamed: 0,text,intent,tokenized_text
0,hi jarvis,greeting,"[hi, jarvis]"
1,hello goodmorning,greeting,"[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]"
3,good morning,greeting,"[good, morning]"
4,good afternoon,greeting,"[good, afternoon]"


In [381]:
columns=['text','intent','tokenized_text']

In [382]:
df=pd.DataFrame(data,columns=columns)
output_excel="processed_data.xlsx"

In [383]:
print(output_excel)

processed_data.xlsx


# Stopwords Removal

In [384]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [385]:
data['filtered_text']=data['tokenized_text'].apply(lambda tokens:[word for word in tokens if word.lower() not in stop_words])

In [386]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]"
5,how are you?,greeting,"[how, are, you, ?]",[?]
6,hello,greeting,[hello],[hello]
7,what's up?,greeting,"[what, 's, up, ?]","['s, ?]"
8,hello good evening,greeting,"[hello, good, evening]","[hello, good, evening]"
9,hi goodmorning,greeting,"[hi, goodmorning]","[hi, goodmorning]"


In [387]:
#save columns in the excel file
columns=['text','intent','tokenized_text','filtered_text']
df=pd.DataFrame(data,columns=columns)

In [388]:
output_excel="processed_data.xlsx"

In [389]:
df.to_excel(output_excel,index=False)

In [390]:
print(output_excel)

processed_data.xlsx


# Stemming & Lemmatization

In [391]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

data['stemmed_text'] = data['tokenized_text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
stemmed_texts = data['stemmed_text']
stemmed_texts

0                                         [hi, jarvi]
1                                   [hello, goodmorn]
2                    [hello, jarvi, how, are, you, ?]
3                                        [good, morn]
4                                   [good, afternoon]
                            ...                      
144               [readi, for, laugh, tell, me, joke]
145       [need, a, break, entertain, me, with, joke]
146                    [what, your, favorit, joke, ?]
147      [tell, me, joke, that, make, everyon, laugh]
148    [joke, make, ani, day, better, tell, me, joke]
Name: stemmed_text, Length: 149, dtype: object

In [392]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [393]:
columns=["text","intent","tokenized_text","filtered_text","stemmed_text"]

In [394]:
data.head(5)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [395]:
df=pd.DataFrame(data,columns=columns)
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [396]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [397]:
ouput_excel='processed3_data.xlsx'

In [400]:
df.to_excel(ouput_excel,index=False)
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [341]:
!python -m spacy download en

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.4/12.8 MB 3.3 MB/s eta 0:00:04
     -- ------------------------------------- 0.8/12.8 MB 5.4 MB/s eta 0:00:03
     ---- ----------------------------------- 1.4/12.8 MB 6.8 MB/s eta 0:00:02
     ---- ----------------------------------- 1.6/12.8 MB 7.1 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 8.4 MB/s eta 0:00:02
     --------- ------------------------------ 3.0/12.8 MB 8.8 MB/s eta 0:00:02
     ----------- ---------------------------- 3.6/12.8 MB 9.2 MB/s eta 0:00:02
     ------------ --------------------------- 4.

In [342]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [343]:
import spacy

In [344]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gzNote: you may need to restart the kernel to use updated packages.

  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019114 sha256=0ff79e7e96b8ec1702d0de17842271c5c2e3c1227c751ccd2b115dd10cdc1e9b
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\67\69\71\861327450e548c51b1054fd67ebb24295f5affa755b3babcf8
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting 

In [401]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [402]:
pip install --upgrade nltk

Note: you may need to restart the kernel to use updated packages.


In [412]:
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"[hi, jarvis]","[hi, jarvis]","[hi, jarvi]"
1,hello goodmorning,greeting,"[hello, goodmorning]","[hello, goodmorning]","[hello, goodmorn]"
2,hello jarvis how are you?,greeting,"[hello, jarvis, how, are, you, ?]","[hello, jarvis, ?]","[hello, jarvi, how, are, you, ?]"
3,good morning,greeting,"[good, morning]","[good, morning]","[good, morn]"
4,good afternoon,greeting,"[good, afternoon]","[good, afternoon]","[good, afternoon]"


In [413]:
# import these modules
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [414]:
import pandas as pd
csv_path="processed3_data.xlsx"
data = pd.read_excel(csv_path)

In [415]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']"


In [416]:
data['lemmatized_text'] = data['text'].apply(lambda sentence: [lemmatizer.lemmatize(token) for token in sentence.split()])

In [417]:
data['lemmatized_text']

0                                        [hi, jarvis]
1                                [hello, goodmorning]
2                     [hello, jarvis, how, are, you?]
3                                     [good, morning]
4                                   [good, afternoon]
                            ...                      
144               [ready, for, laugh, tell, me, joke]
145       [need, a, break, entertain, me, with, joke]
146                     [what, your, favorite, joke?]
147     [tell, me, joke, that, make, everyone, laugh]
148    [joke, make, any, day, better, tell, me, joke]
Name: lemmatized_text, Length: 149, dtype: object

In [418]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"


In [353]:
data.head(20)

Unnamed: 0,text,intent,tokenized_text,filtered_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"
5,how are you?,greeting,"['how', 'are', 'you', '?']",['?'],"[how, are, you?]"
6,hello,greeting,['hello'],['hello'],[hello]
7,what's up?,greeting,"['what', ""'s"", 'up', '?']","[""'s"", '?']","[what's, up?]"
8,hello good evening,greeting,"['hello', 'good', 'evening']","['hello', 'good', 'evening']","[hello, good, evening]"
9,hi goodmorning,greeting,"['hi', 'goodmorning']","['hi', 'goodmorning']","[hi, goodmorning]"


In [419]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text']
df=pd.DataFrame(data,columns=columns)
df.head(5)

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"


# Removing Punctuation and Special Characters

In [420]:
import re

In [421]:
def remove_punctuation(words_list):
    #Define pattern whcih is all Alphanumerical character
    # With the "r" prefix (raw string)
    pattern=r'[^a-zA-Z0-9\s]'
    clean_words=[re.sub(pattern,"",word) for word in words_list]
    return clean_words

In [422]:
data['cleaned_text']=data['lemmatized_text'].apply(remove_punctuation)

In [423]:
data['cleaned_text']

0                                        [hi, jarvis]
1                                [hello, goodmorning]
2                      [hello, jarvis, how, are, you]
3                                     [good, morning]
4                                   [good, afternoon]
                            ...                      
144               [ready, for, laugh, tell, me, joke]
145       [need, a, break, entertain, me, with, joke]
146                      [what, your, favorite, joke]
147     [tell, me, joke, that, make, everyone, laugh]
148    [joke, make, any, day, better, tell, me, joke]
Name: cleaned_text, Length: 149, dtype: object

In [424]:
data.columns

Index(['text', 'intent', 'tokenized_text', 'filtered_text', 'stemmed_text',
       'lemmatized_text', 'cleaned_text'],
      dtype='object')

In [425]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]"


In [208]:
columns=['text','intent','tokenized_text','filtered_text','stemmed_text','lemmatized_text','cleaned_text']

In [426]:
df=pd.DataFrame(data,columns=columns)

In [427]:
df.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]"
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]"
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]"
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]"
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]"


In [210]:
output_excel="processed6_data.xlsx"

In [428]:
data['joined_text']=data['cleaned_text'].apply(lambda x: ''.join(x))

In [429]:
columns=["text","intent",'tokenized_text','stemmed_text','lemmatized_text','cleaned_text','joined_text']

In [432]:
df=pd.DataFrame(data,columns=columns)
output_excel='processed11_data.xlsx'

In [433]:
df.to_excel(output_excel,index=False)

In [434]:
csv_path='processed11_data.xlsx'

In [218]:
df=pd.read_excel(csv_path)

In [219]:
data=pd.DataFrame(df)

In [435]:
data.head()

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text,joined_text
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]",hijarvis
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]",hellogoodmorning
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]",hellojarvishowareyou
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]",goodmorning
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]",goodafternoon


In [436]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the original data
data_with_tfidf = pd.concat([data, tfidf_df], axis=1)

In [437]:
data_with_tfidf

Unnamed: 0,text,intent,tokenized_text,filtered_text,stemmed_text,lemmatized_text,cleaned_text,joined_text,anybreakingnewsonglobalevent,anynewsaboutindia,...,whatthetodayweatherinkerala,whatthetodayweatherinlucknow,whatthetodayweatherinnagpur,whatthetodayweatherinpune,whatthetomorrowweatherinkerala,whatthetomorrowweatherinpatna,whatthetopnewsinindiatoday,whattheweatherinbanglore,whattheweatherinmumbai,whatyourfavoritejoke
0,hi jarvis,greeting,"['hi', 'jarvis']","['hi', 'jarvis']","['hi', 'jarvi']","[hi, jarvis]","[hi, jarvis]",hijarvis,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hello goodmorning,greeting,"['hello', 'goodmorning']","['hello', 'goodmorning']","['hello', 'goodmorn']","[hello, goodmorning]","[hello, goodmorning]",hellogoodmorning,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hello jarvis how are you?,greeting,"['hello', 'jarvis', 'how', 'are', 'you', '?']","['hello', 'jarvis', '?']","['hello', 'jarvi', 'how', 'are', 'you', '?']","[hello, jarvis, how, are, you?]","[hello, jarvis, how, are, you]",hellojarvishowareyou,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,good morning,greeting,"['good', 'morning']","['good', 'morning']","['good', 'morn']","[good, morning]","[good, morning]",goodmorning,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,good afternoon,greeting,"['good', 'afternoon']","['good', 'afternoon']","['good', 'afternoon']","[good, afternoon]","[good, afternoon]",goodafternoon,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,ready for laugh tell me jokes,joke,"['ready', 'for', 'laugh', 'tell', 'me', 'jokes']","['ready', 'laugh', 'tell', 'jokes']","['readi', 'for', 'laugh', 'tell', 'me', 'joke']","[ready, for, laugh, tell, me, joke]","[ready, for, laugh, tell, me, joke]",readyforlaughtellmejoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,need a break entertain me with joke,joke,"['need', 'a', 'break', 'entertain', 'me', 'wit...","['need', 'break', 'entertain', 'joke']","['need', 'a', 'break', 'entertain', 'me', 'wit...","[need, a, break, entertain, me, with, joke]","[need, a, break, entertain, me, with, joke]",needabreakentertainmewithjoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,what your favorite joke?,joke,"['what', 'your', 'favorite', 'joke', '?']","['favorite', 'joke', '?']","['what', 'your', 'favorit', 'joke', '?']","[what, your, favorite, joke?]","[what, your, favorite, joke]",whatyourfavoritejoke,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
147,tell me joke that makes everyone laugh,joke,"['tell', 'me', 'joke', 'that', 'makes', 'every...","['tell', 'joke', 'makes', 'everyone', 'laugh']","['tell', 'me', 'joke', 'that', 'make', 'everyo...","[tell, me, joke, that, make, everyone, laugh]","[tell, me, joke, that, make, everyone, laugh]",tellmejokethatmakeeveryonelaugh,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [438]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

#fir and Transform the joined_text column
tfidf_matrix = tfidf_vectorizer.fit_transform(data['joined_text'])

#Convert the TF-IDF matrix to DataFrame
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(),columns=tfidf_vectorizer.get_feature_names_out())

data_with_tfidf=pd.concat([data,tfidf_df])