<a href="https://colab.research.google.com/github/liznjoki/Data-Science-Projects/blob/main/GroupingQuestions_SimilarityScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import nltk
import string

import spacy

# Fetch the NLTK resources this notebook relies on.
# "punkt_tab" is the tokenizer data word_tokenize looks for on NLTK >= 3.8.2
# (it replaced the pickled "punkt" models). On older releases the package is
# not in the index; nltk.download then reports the miss and returns False
# instead of raising, so requesting both names is harmless.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(resource)

# Small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Raw interview-question export; the path points at the Colab session's /content directory.
DATA_PATH = "/content/Customer Representative.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Job Position,Question,Answer,Answer Rating,Interview Phases,Skill Assessed
0,1,Customer Service Representative,What are Your Biggest Achievements?,"During my last job, I learned some interperson...",Average,General,Accomplishments
1,2,Customer Service Representative,Name any Two Improvements You Made in the Prev...,"As a few of my team members were late to work,...",Good,General,Accomplishments
2,3,Customer Service Representative,Tell me about a professional accomplishment yo...,One of my proudest professional accomplishment...,Good,General,Accomplishments
3,4,Customer Service Representative,Have you ever utilized customer feedback to en...,"Yes, I've leveraged customer feedback to impro...",Average,Role Specific,Analytical mindset
4,5,Customer Service Representative,Have You Used Customer Feedback to Ensure Busi...,"Yes, I have used customer feedback to improve ...",Good,Role Specific,Analytical mindset


In [None]:
# Remove the "Skill Assessed" column.
# errors="ignore" keeps this cell re-runnable: the result is assigned back to
# `df`, so a second run would otherwise raise KeyError for the column that is
# already gone. (`axis=1` was redundant next to `columns=` and is dropped.)
df = df.drop(columns=["Skill Assessed"], errors="ignore")

In [None]:
# Column names, non-null counts and dtypes.
# NOTE(review): the saved output of this cell lists S.No. / Job Role / Questions /
# Category / Answers / Ratings, while df.head() above shows Job Position / Question /
# Answer / ... — the stored outputs appear to come from runs against different files.
# Re-run from a fresh kernel before trusting them.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262 entries, 0 to 1261
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   S.No.      1262 non-null   int64 
 1   Job Role   1262 non-null   object
 2   Questions  1262 non-null   object
 3   Category   1262 non-null   object
 4   Answers    1262 non-null   object
 5   Ratings    1262 non-null   object
dtypes: int64(1), object(5)
memory usage: 59.3+ KB


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(1262, 6)

In [None]:
# Number of missing values in each column.
df.isna().sum()

S.No.        0
Job Role     0
Questions    0
Category     0
Answers      0
Ratings      0
dtype: int64

In [None]:
# NOTE(review): the saved output here is (459, 6) while the identical call earlier
# shows (1262, 6), with no row filtering in between — likely a stale output from a
# different run; confirm after Restart & Run All.
df.shape

(459, 6)

# Text cleaning and preprocessing

In [None]:
#preprocessing of the dataset, removing punctuations and making lowercase

def preprocess_text_column(df, column_name):
  """Return a copy of ``df`` with ``column_name`` normalised for text matching.

  Every string in the column has its punctuation removed, is lower-cased,
  tokenised with ``word_tokenize`` and re-joined with single spaces.

  Args:
    df: DataFrame that holds the text column. It is not modified.
    column_name: Name of the column to clean.

  Returns:
    A new DataFrame equal to ``df`` except for the cleaned column. Values in
    the column that are not strings (e.g. NaN) are passed through unchanged.
  """
  import unicodedata  # local import: only this function needs it

  ascii_punctuation = str.maketrans("", "", string.punctuation)

  def preprocess_text(text):
    # Missing or non-text cells have no .translate; leave them as they are.
    if not isinstance(text, str):
      return text

    # Remove ASCII punctuation and symbols.
    text = text.translate(ascii_punctuation)

    # string.punctuation is ASCII-only. Also drop Unicode punctuation (curly
    # quotes, dashes, ellipsis, ...) so that "don’t" and "don't" end up as the
    # same token instead of "don ’ t" vs "dont".
    text = "".join(
        ch for ch in text if not unicodedata.category(ch).startswith("P")
    )

    # Lower-case, tokenise, and join the tokens back into a single string.
    return " ".join(word_tokenize(text.lower()))

  # Work on a copy so the caller's frame (and anything already displayed from
  # it) keeps the raw text.
  cleaned = df.copy()
  cleaned[column_name] = cleaned[column_name].apply(preprocess_text)

  return cleaned




In [None]:
# Clean the "Question" column: punctuation removed, lower-cased, tokens re-joined with spaces.
cleaned_df = preprocess_text_column(df, "Question")

In [None]:
# Spot-check the cleaned questions.
cleaned_df.head()

Unnamed: 0.1,Unnamed: 0,Job Position,Question,Answer,Answer Rating,Interview Phases,Skill Assessed
0,1,Customer Service Representative,what are your biggest achievements,"During my last job, I learned some interperson...",Average,General,Accomplishments
1,2,Customer Service Representative,name any two improvements you made in the prev...,"As a few of my team members were late to work,...",Good,General,Accomplishments
2,3,Customer Service Representative,tell me about a professional accomplishment yo...,One of my proudest professional accomplishment...,Good,General,Accomplishments
3,4,Customer Service Representative,have you ever utilized customer feedback to en...,"Yes, I've leveraged customer feedback to impro...",Average,Role Specific,Analytical mindset
4,5,Customer Service Representative,have you used customer feedback to ensure busi...,"Yes, I have used customer feedback to improve ...",Good,Role Specific,Analytical mindset


In [None]:
# Distinct cleaned question strings.
cleaned_df.Question.unique()

In [None]:
# Reset the index to 0..n-1 (old index discarded). The grouping cells below collect
# row numbers from range(len(cleaned_df)) and then look rows up with .loc, which
# only lines up when index labels equal row positions.
cleaned_df.reset_index(inplace=True, drop=True)

# Grouping questions by similarity score, using a 0.6 similarity threshold

In [None]:
# Extract TF-IDF features from the questions
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_df["Question"])

# Pairwise cosine similarity between all questions
cosine_similarities = cosine_similarity(tfidf_matrix)

# Minimum similarity to a group's seed question for another question to join it
threshold = 0.6

# Groups of row numbers; the first entry of each group is its seed question
grouped_Question = []

# Row number -> group number, for every question that belongs to a group
question_to_group = {}

# Number the next group will receive
group_number = 1

n_questions = len(cleaned_df)

for i in range(n_questions):

  # A question that already belongs to a group cannot seed another one
  if i in question_to_group:
    continue

  # Collect the still-unassigned questions that are similar enough to the seed.
  # Without the "not already assigned" check, a question similar to two seeds
  # ends up in both groups, and the later group number silently replaces the
  # earlier one when the labels are written.
  current_group = [i] + [
      j for j in range(n_questions)
      if j != i
      and j not in question_to_group
      and cosine_similarities[i, j] >= threshold
  ]

  # A lone question is not a group; leave it unassigned
  if len(current_group) > 1:
    for member in current_group:
      question_to_group[member] = group_number
    grouped_Question.append(current_group)

    # Increment the group number
    group_number += 1





In [None]:
 #Print the grouped Question with group numbers
for label, members in enumerate(grouped_Question, start=1):
  # Group header, then one cleaned question per line
  print(f"Group {label}")
  for question in cleaned_df.loc[members, "Question"]:
    print(question)

Group 1
what are your career goals
what are your career goals
what are your longterm career goals
Group 2
where do you see yourself in three years
where do you see yourself in five years
Group 3
where do you see yourself five years down the line
where do you see yourself in five years
Group 4
what are you hoping to do in five years
what are you hoping to do in five years
Group 5
how do you respond when you do not know the answer to a question
how do you respond to a customers question for which you do not know the answer
how do you handle a situation where you do not know the answer to a customers question
has a customer ever asked a question you did not know the answer to how did you respond
what do you do when you don ’ t know the answer to a customer ’ s question
Group 6
what do you know about our products or services
what do you think of our companys products and services
Group 7
what do you know about this company
what do you know about our company or product
Group 8
why do you wa

In [None]:
# Start from an all-None 'labels' column; rows that never join a group keep None
cleaned_df["labels"] = None

# Stamp each group's 1-based number onto its member rows. If a row is listed in
# more than one group, the group processed last wins.
for label, members in enumerate(grouped_Question, start=1):
  cleaned_df.loc[members, "labels"] = label

# Display the labelled frame
cleaned_df

Unnamed: 0.1,Unnamed: 0,Job Position,Question,Answer,Answer Rating,Interview Phases,Skill Assessed,labels
0,1,Customer Service Representative,what are your biggest achievements,"During my last job, I learned some interperson...",Average,General,Accomplishments,
1,2,Customer Service Representative,name any two improvements you made in the prev...,"As a few of my team members were late to work,...",Good,General,Accomplishments,
2,3,Customer Service Representative,tell me about a professional accomplishment yo...,One of my proudest professional accomplishment...,Good,General,Accomplishments,
3,4,Customer Service Representative,have you ever utilized customer feedback to en...,"Yes, I've leveraged customer feedback to impro...",Average,Role Specific,Analytical mindset,
4,5,Customer Service Representative,have you used customer feedback to ensure busi...,"Yes, I have used customer feedback to improve ...",Good,Role Specific,Analytical mindset,
...,...,...,...,...,...,...,...,...
454,456,Customer Service Representative,have you ever collaborated with other customer...,"Yes, I often work with others if I can’t solve...",Good,Behavioral,Team Work and Collaboration,
455,457,Customer Service Representative,how do you motivate your team to provide excel...,I believe that providing excellent customer se...,Good,Behavioral,Team Work and Collaboration,
456,458,Customer Service Representative,how do you collaborate with your team members ...,I recently worked for a tech company that pro...,Average,Behavioral,Team Work and Collaboration,62
457,459,Customer Service Representative,have you ever successfully managed a conflict ...,"While working for my current employer, I asked...",Good,Behavioral,Team Work and Collaboration,63


In [None]:
# Number of distinct group labels (nunique ignores missing values by default).
cleaned_df.labels.nunique()

64

In [None]:
# Sort the dataframe by group number; unlabelled rows end up last.
# kind="stable" keeps rows that share a label in their existing order. The
# default sort gives no such guarantee, and the "first question" of each group
# is later taken from this row order.
cleaned_df = cleaned_df.sort_values(by="labels", kind="stable")
cleaned_df

Unnamed: 0.1,Unnamed: 0,Job Position,Question,Answer,Answer Rating,Interview Phases,Skill Assessed,labels
11,12,Customer Service Representative,what are your career goals,my goals include mastering the role of custome...,Good,General,Career Goals,1
18,19,Customer Service Representative,what are your career goals,I am passionate about customer service and see...,Good,General,Career Goals,1
19,20,Customer Service Representative,what are your longterm career goals,My long-term career goals include continuing t...,Good,General,Career Goals,1
12,13,customer service representative,where do you see yourself in three years,within three to five years i hope to have move...,Good,General,Career Goals,2
14,15,customer service representative,where do you see yourself five years down the ...,i would like to take on additional roles and r...,Average,General,Career Goals,3
...,...,...,...,...,...,...,...,...
450,452,Customer Service Manager,describe a situation where you had to delegate...,As a project leader for a digital marketing te...,Good,Behavioral,Team Work and Collaboration,
451,453,Customer Service Manager,share an example of a time when you had to pro...,I once led a team on which one individual cons...,Good,Behavioral,Team Work and Collaboration,
452,454,Customer Service Manager,do you work well with other people,Open Communication: I prioritize open and tran...,Good,Behavioral,Team Work and Collaboration,
454,456,Customer Service Representative,have you ever collaborated with other customer...,"Yes, I often work with others if I can’t solve...",Good,Behavioral,Team Work and Collaboration,


In [None]:
# Number of rows that were not placed in any group.
cleaned_df["labels"].isnull().sum()

198

# Experimenting with four different similarity thresholds (0.5, 0.6, 0.7, 0.8)

In [None]:
def group_similar_questions(similarity_matrix, threshold):
    """Greedily partition row positions into groups of similar questions.

    Rows are visited in order. An unassigned row becomes the seed of a new
    group and pulls in every other still-unassigned row whose similarity to
    the seed is at least ``threshold``. Seeds that attract nobody stay
    ungrouped, and no row is ever placed in two groups.

    Args:
        similarity_matrix: Square 2-D array; entry [i, j] is the similarity
            between rows i and j.
        threshold: Minimum similarity to the seed required to join its group.

    Returns:
        List of groups; each group is a list of row positions, seed first.
    """
    n_rows = len(similarity_matrix)
    assigned = set()
    groups = []

    for seed in range(n_rows):
        if seed in assigned:
            continue
        members = [seed] + [
            other for other in range(n_rows)
            if other != seed
            and other not in assigned
            and similarity_matrix[seed, other] >= threshold
        ]
        if len(members) > 1:
            assigned.update(members)
            groups.append(members)

    return groups


# cleaned_df was sorted by label above, so its index labels no longer match row
# positions. Work on a plain list (in index order, i.e. the same row order the
# 0.6 grouping used) so that similarity-matrix positions and the printed
# questions always refer to the same row.
experiment_questions = cleaned_df.sort_index()["Question"].tolist()

# Extract TF-IDF features from the questions
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(experiment_questions)

# Calculate the pairwise cosine similarity between questions
cosine_similarities = cosine_similarity(tfidf_matrix)

# Experiment with different threshold values
threshold_values = [0.5, 0.6, 0.7, 0.8]

for candidate_threshold in threshold_values:
    # Kept in a separate name so the groups used for the 'labels' column
    # (grouped_Question) are not overwritten by this experiment.
    experiment_groups = group_similar_questions(cosine_similarities, candidate_threshold)

    # Print the grouped questions with group numbers for the current threshold
    print(f"Threshold: {candidate_threshold}")
    for experiment_group_number, experiment_group in enumerate(experiment_groups, start=1):
        print(f"Group {experiment_group_number}")
        for position in experiment_group:
            print(experiment_questions[position])
    print("\n")


## Replace all questions in the same group with the first question asked


In [None]:
# label -> first question seen for that label, in the frame's current row order
first_questions = {}

for label, question in zip(cleaned_df["labels"], cleaned_df["Question"]):
    # setdefault only writes on the first sighting of a label; unlabelled rows are skipped
    if pd.notnull(label):
        first_questions.setdefault(label, question)


def replace_with_first_question(row):
    """Return the group's first question for a labelled row, else the row's own question."""
    own_question = row['Question']
    group_label = row['labels']
    if pd.isnull(group_label):
        return own_question
    return first_questions.get(group_label, own_question)


# New column: every grouped question is replaced by its group's representative
cleaned_df['generalized interview questions'] = cleaned_df.apply(replace_with_first_question, axis=1)


In [None]:
# Display the frame with the new 'generalized interview questions' column.
cleaned_df

Unnamed: 0.1,Unnamed: 0,Job Position,Question,Answer,Answer Rating,Interview Phases,Skill Assessed,labels,generalized interview questions
11,12,Customer Service Representative,what are your career goals,my goals include mastering the role of custome...,Good,General,Career Goals,1,what are your career goals
18,19,Customer Service Representative,what are your career goals,I am passionate about customer service and see...,Good,General,Career Goals,1,what are your career goals
19,20,Customer Service Representative,what are your longterm career goals,My long-term career goals include continuing t...,Good,General,Career Goals,1,what are your career goals
12,13,customer service representative,where do you see yourself in three years,within three to five years i hope to have move...,Good,General,Career Goals,2,where do you see yourself in three years
14,15,customer service representative,where do you see yourself five years down the ...,i would like to take on additional roles and r...,Average,General,Career Goals,3,where do you see yourself five years down the ...
...,...,...,...,...,...,...,...,...,...
450,452,Customer Service Manager,describe a situation where you had to delegate...,As a project leader for a digital marketing te...,Good,Behavioral,Team Work and Collaboration,,describe a situation where you had to delegate...
451,453,Customer Service Manager,share an example of a time when you had to pro...,I once led a team on which one individual cons...,Good,Behavioral,Team Work and Collaboration,,share an example of a time when you had to pro...
452,454,Customer Service Manager,do you work well with other people,Open Communication: I prioritize open and tran...,Good,Behavioral,Team Work and Collaboration,,do you work well with other people
454,456,Customer Service Representative,have you ever collaborated with other customer...,"Yes, I often work with others if I can’t solve...",Good,Behavioral,Team Work and Collaboration,,have you ever collaborated with other customer...


In [None]:
# Write the labelled frame to a CSV in the working directory; index=False leaves the row index out of the file.
cleaned_df.to_csv("Final_Customer_Representative_Data.csv", index=False)