In [1]:
import pandas as pd
import nltk
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer # to perfor bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perfor bow
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perform cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

# Download the NLTK resources used by this notebook in one place.
# (nltk.download('all') would fetch everything, but is far larger than needed.)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data fetched before word_tokenize is first used
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # tagger data fetched before pos_tag is first used
nltk.download('stopwords')  # required by stopwords.words('english'); was never downloaded
# nltk.download('all')

ModuleNotFoundError: No module named 'nltk'

In [None]:
pip install --upgrade nltk

In [None]:
# Read the schemes dataset from the Kaggle input directory.
csv_path = '/kaggle/input/indian-government-schemes/updated_data.csv'
df = pd.read_csv(csv_path)

df.head(n=20)  # preview the first 20 rows

In [None]:
len(df.index)  # number of rows in the dataset

In [None]:
df.isna().sum()  # count of missing values per column

# Filling or handling the missing values

In [None]:
# Forward-fill down the rows: every missing cell takes the value from the row above it.
df = df.ffill(axis=0)
df.head(n=20)

In [None]:
# Remove the stray "Unnamed: 9" column. errors="ignore" keeps the cell re-runnable:
# once the column is gone, a second execution would otherwise raise KeyError.
df = df.drop(columns=["Unnamed: 9"], errors="ignore")

# Checking whether any missing values still persist

In [None]:
df.isna().sum()  # missing values per column after filling

# Creating a new dataset

In [None]:
# Small working sample: the first ten rows of the dataset.
df1 = df.iloc[:10]
df1

In [None]:
# Lower-cases each entry and blanks out every character that is not a letter or digit

def step1(x):
    """Print a cleaned version of every item in ``x``.

    Each item is converted to ``str`` and lower-cased, then every character
    outside ``a-z`` / ``0-9`` is replaced by a single space. The cleaned
    strings are printed one per line; nothing is returned.
    """
    non_alnum = re.compile(r'[^a-z0-9]')
    for item in x:
        print(non_alnum.sub(' ', str(item).lower()))

In [None]:
step1(df1['scheme_name'])

In [None]:
nltk.download('punkt_tab')
# Split a sample sentence into word tokens

s = 'aicte short term training programme sfurti scheme'
words = nltk.word_tokenize(s)
print(words)

# Lemmatization

In [None]:
lemma = WordNetLemmatizer()  # WordNet-based lemmatizer instance

lemma.lemmatize('absorbed', 'v')  # lemmatize the word as a verb

In [None]:
nltk.download('averaged_perceptron_tagger_eng')
pos_tag(word_tokenize(s))  # (token, part-of-speech tag) pair for every token of the sample sentence

In [None]:
# Text normalisation helper: lower-case, strip non-letters, tokenize, POS-tag, lemmatize

def text_normalization(text):
    """Return ``text`` as a space-joined string of lemmatized tokens.

    The input is converted to ``str`` and lower-cased, and every character
    other than a space or ``a-z`` is deleted. The result is tokenized and
    POS-tagged; each token is lemmatized as a verb, adjective or adverb when
    its tag starts with ``V``, ``J`` or ``R`` respectively, and as a noun
    otherwise.
    """
    cleaned = re.sub(r'[^ a-z]', '', str(text).lower())
    tagged = pos_tag(nltk.word_tokenize(cleaned), tagset=None)
    lemmatizer = wordnet.WordNetLemmatizer()
    # First letter of the tagger's tag -> POS code passed to the lemmatizer
    pos_by_initial = {'V': 'v', 'J': 'a', 'R': 'r'}
    return " ".join(
        lemmatizer.lemmatize(word, pos_by_initial.get(tag[:1], 'n'))
        for word, tag in tagged
    )

In [None]:
text_normalization('garudas scheme for funeral expense')

In [None]:
# Normalise every scheme name and keep the result alongside the original columns.
df['lemmatized_text'] = df['scheme_name'].map(text_normalization)
df.tail(n=20)

In [None]:
# English stop-word list provided by NLTK

stop = list(stopwords.words('english'))
stop

# bag of words

In [None]:
cv = CountVectorizer()  # bag-of-words vectorizer with default settings

bow_sparse = cv.fit_transform(df['lemmatized_text'])  # learn the vocabulary and count terms
X = bow_sparse.toarray()  # dense document-term count matrix

X

In [None]:
# Label each count column with the vocabulary term it represents

features = cv.get_feature_names_out()
df_bow = pd.DataFrame(data=X, columns=features)
df_bow.head()

In [None]:
Question = 'Can you tell me how you help people?' # considering an example

In [None]:
# Remove English stop words from the example question.
# Each word is lower-cased for the membership test because the stop list holds
# lower-case entries; without this, capitalised words such as "Can" slip through.

a = Question.split()
Q = [word for word in a if word.lower() not in stop]
b = " ".join(Q)  # joined once after filtering, so `b` exists even for an empty question

b # print the values

In [None]:
Question_lemma = text_normalization(b)  # normalise the filtered question with the helper defined above
question_sparse = cv.transform([Question_lemma])  # encode with the vocabulary learned from the corpus
Question_bow = question_sparse.toarray()  # dense bag-of-words row for the question

In [None]:
Question_bow

# Now, to get the related response, we find the cosine similarity between the question and the lemmatized text of each scheme.

# similarity

In [None]:
# Cosine similarity (1 - cosine distance) between every scheme row and the example question

cosine_distance = pairwise_distances(df_bow, Question_bow, metric='cosine')
cosine_value = 1 - cosine_distance
cosine_value

In [None]:
# Store each scheme's similarity to the example question as a new column.
# cosine_value is the result of 1 - pairwise_distances(df_bow, Question_bow).
df['similarity_bow'] = cosine_value # creating a new column
df

In [None]:
# Build the reply text for each scheme: the descriptive columns, cast to str,
# concatenated in this order with single spaces between them.
response_columns = [
    'slug', 'details', 'benefits', 'eligibility', 'application',
    'documents', 'level', 'schemeCategory', 'tags',
]
first_column, *other_columns = response_columns
df['Text Response'] = df[first_column].astype(str).str.cat(
    [df[name].astype(str) for name in other_columns],
    sep=" ",
)


In [None]:
# Keep only the reply text and its bag-of-words similarity to the example question
df_simi = df.loc[:, ['Text Response', 'similarity_bow']]
df_simi

In [None]:
# Most similar schemes first
df_simi_sort = df_simi.sort_values('similarity_bow', ascending=False)
df_simi_sort.head()

In [None]:
threshold = 0.2  # keep only rows whose similarity is strictly greater than 0.2
is_relevant = df_simi_sort['similarity_bow'].gt(threshold)
df_threshold = df_simi_sort.loc[is_relevant]
df_threshold

In [None]:
index_value = np.argmax(cosine_value)  # position of the highest similarity score
index_value

In [None]:
(Question)

In [None]:
# argmax yields a positional offset, so the reply is looked up by position (.iloc), not by index label
df['Text Response'].iloc[index_value] # The text at the above index becomes the response for the question

# tf-idf

In [None]:
Question1 = 'What is Pradhan Mantri Awas Yojana?'

In [None]:
tfidf = TfidfVectorizer()
Question_lemma1 = text_normalization(Question1)
# Learn the vocabulary and IDF weights from the scheme corpus, then project the
# question into that space. Fitting on the question alone produces a vector whose
# columns do not line up with the document matrix.
tfidf.fit(df['lemmatized_text'])
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf

In [None]:
# Fit tf-idf on the lemmatized scheme names and densify the resulting matrix.
tfidf_sparse = tfidf.fit_transform(df['lemmatized_text'])
x_tfidf = tfidf_sparse.toarray()

In [None]:
# Tabular view of the tf-idf weights: one row per scheme, one column per vocabulary term

tfidf_terms = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(data=x_tfidf, columns=tfidf_terms)
df_tfidf.head()

In [None]:
df.shape

# Similarity

In [None]:
# Keep every scheme in the document matrix and vectorise the question separately.
# The question was never appended to the corpus, so the last row of x_tfidf is a
# scheme; slicing it off would drop that scheme and treat it as the question.
df_tfidf = x_tfidf  # every row is a document
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()  # question in the corpus vocabulary

In [None]:
# Similarity of every scheme to the question. The question is vectorised with the
# corpus-fitted tfidf and compared against all document rows, so each score is a
# document-to-question similarity rather than a document-to-document one.
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()
cos = 1 - pairwise_distances(x_tfidf, Question_tfidf, metric='cosine')  # one row per scheme, one column
cos

In [None]:
print(cos.shape)

In [None]:
# Safety net: if the similarity array has fewer rows than df, append zero rows so
# it can be stored as a column. Only the row axis is padded; a flat pad_width
# would also add columns to a 2-D array.
missing_rows = len(df) - len(cos)
if missing_rows > 0:
    row_only_pad = [(0, missing_rows)] + [(0, 0)] * (cos.ndim - 1)
    cos = np.pad(cos, row_only_pad, mode='constant', constant_values=0)

In [None]:
# Reduce the similarity matrix to its first column. The ndim guard makes the cell
# safe to re-run: once cos is 1-D, indexing it with [:, 0] would raise IndexError.
if cos.ndim > 1:
    cos = cos[:, 0]
print(cos.shape)


In [None]:
df['similarity_tfidf'] = cos.flatten()  # store the tf-idf similarity scores as a column
# Keep only the reply text and its tf-idf similarity to the question
df_simi_tfidf = df.loc[:, ['Text Response', 'similarity_tfidf']]
df_simi_tfidf

In [None]:
# Highest tf-idf similarity first
df_simi_tfidf_sort = df_simi_tfidf.sort_values('similarity_tfidf', ascending=False)
df_simi_tfidf_sort.head(n=10)

In [None]:
threshold = 0.2  # keep only rows whose similarity is strictly greater than 0.2
is_relevant = df_simi_tfidf_sort['similarity_tfidf'].gt(threshold)
df_threshold = df_simi_tfidf_sort.loc[is_relevant]
df_threshold

In [None]:
index_value1 = np.argmax(cos)  # position of the highest tf-idf similarity score
index_value1

In [None]:
Question1

In [None]:
# argmax yields a positional offset, so the reply is looked up by position (.iloc), not by index label
df['Text Response'].iloc[index_value1]  # returns the text at that index

# Model Using Bag of Words

In [None]:
# Function that removes stop words and lemmatizes the remaining tokens

def stopword_(text):
    """Return ``text`` with English stop words removed and the rest lemmatized.

    The text is tokenized and POS-tagged as given (no lower-casing of the
    output). A token is dropped when its lower-cased form is in the English
    stop-word list, so capitalised stop words such as "What" or "The" are
    removed too. Remaining tokens are lemmatized as verb, adjective or adverb
    when their tag starts with ``V``, ``J`` or ``R``, and as a noun otherwise.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    tag_list = pos_tag(nltk.word_tokenize(text), tagset=None)
    stop_set = set(stopwords.words('english'))  # set gives constant-time membership tests
    lema = wordnet.WordNetLemmatizer()
    # First letter of the tagger's tag -> POS code passed to the lemmatizer
    pos_by_initial = {'V': 'v', 'J': 'a', 'R': 'r'}
    lema_word = []
    for token, pos_token in tag_list:
        if token.lower() in stop_set:  # case-insensitive stop-word check
            continue
        pos_val = pos_by_initial.get(pos_token[:1], 'n')
        lema_word.append(lema.lemmatize(token, pos_val))
    return " ".join(lema_word)

In [None]:
# defining a function that returns response to query using bow

def chat_bow(text):
    """Return the reply text of the scheme most similar to ``text`` (bag of words).

    The query is passed through ``stopword_`` and ``text_normalization``,
    encoded with the fitted ``cv`` vectorizer, and compared with every row of
    ``df_bow`` by cosine similarity. The ``'Text Response'`` value of the
    highest-scoring row is returned.
    """
    s = stopword_(text)
    lemma = text_normalization(s)  # calling the function to perform text normalization
    bow = cv.transform([lemma]).toarray()  # applying bow
    cosine_value = 1 - pairwise_distances(df_bow, bow, metric='cosine')
    index_value = cosine_value.argmax()  # positional offset of the best match
    # argmax is positional, so use .iloc; .loc would look the number up as an index label
    return df['Text Response'].iloc[index_value]

In [None]:
chat_bow('Tell me about PM Kisan Samman Nidhi.')

In [None]:
chat_bow('Who can apply for PMAY-G?')

In [None]:
chat_bow('What benefits does PM KISAN scheme provide?')

# Model Using tf-idf

In [None]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    """Return the reply text of the scheme most similar to ``text`` (tf-idf).

    The query is normalised with ``text_normalization``, encoded with the
    fitted ``tfidf`` vectorizer, and compared with every row of ``df_tfidf``
    by cosine similarity. The ``'Text Response'`` value of the highest-scoring
    row is returned.
    """
    lemma = text_normalization(text)  # calling the function to perform text normalization
    tf = tfidf.transform([lemma]).toarray()  # applying tf-idf
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')  # applying cosine similarity
    index_value = cos.argmax()  # positional offset of the best match
    # argmax is positional, so use .iloc; .loc would look the number up as an index label
    return df['Text Response'].iloc[index_value]

In [None]:
chat_tfidf('Is this scheme for rural or urban citizens?')