In [1]:
import pandas as pd
import numpy as np
import regex as re
import string
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nlt

In [2]:
df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')

In [3]:
df_sub = df.copy()

In [4]:
def lower(text):
    return text.lower()
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
def preprocess(text):
    text = lower(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_words(text)
    return text

In [5]:
df_sub['prompt'] = df["prompt"].apply(lambda text: preprocess(text))
df_sub['A'] = df["A"].apply(lambda text: preprocess(text))
df_sub['B'] = df["B"].apply(lambda text: preprocess(text))
df_sub['C'] = df["C"].apply(lambda text: preprocess(text))
df_sub['D'] = df["D"].apply(lambda text: preprocess(text))
df_sub['E'] = df["E"].apply(lambda text: preprocess(text))

In [6]:
def vocabs(df, col):
    split_col = df[col].apply(lambda x: x.split())
    uniq = list()
    for row in split_col:
        uniq = uniq + row
    return list(set(uniq))

uniq_vocabs = vocabs(df_sub,'prompt')

In [7]:
def vectorize(tokens, filtered_vocabs):
    ''' This function takes list of words in a sentence as input 
    and returns a vector of size of filtered_vocab.It puts 0 if the 
    word is not present in tokens and count of token if present.'''
    vector=[]
    for w in filtered_vocabs:
        vector.append(tokens.count(w))
    return vector

In [8]:
df_sub['prompt'] =  df_sub['prompt'].apply(lambda x: vectorize(x.split(), uniq_vocabs))
df_sub['A'] =  df_sub['A'].apply(lambda x: vectorize(x.split(), uniq_vocabs))
df_sub['B'] =  df_sub['B'].apply(lambda x: vectorize(x.split(), uniq_vocabs))
df_sub['C'] =  df_sub['C'].apply(lambda x: vectorize(x.split(), uniq_vocabs))
df_sub['D'] =  df_sub['D'].apply(lambda x: vectorize(x.split(), uniq_vocabs))
df_sub['E'] =  df_sub['E'].apply(lambda x: vectorize(x.split(), uniq_vocabs))

In [9]:
def get_jaccard_sim(a, b):
    mult1 = np.array(a)*np.array(b)    
    mult1 = mult1 > 0 
    mult2 = np.array(a) + np.array(b)
    mult2 = mult2 > 0 
    return float(sum(mult1)/sum(mult2))

choice = np.array(['A','B','C','D','E'])

def predict(row, prompt, options):
    sel = []
    for opt in options:
        score = get_jaccard_sim(row[prompt],row[opt])
        sel.append(score)
    order = np.argsort(np.array(sel))[::-1]
    return ' '.join(choice[order][:3])
        
df_sub['prediction'] = df_sub.apply(lambda row: predict(row,'prompt',choice), axis=1)

In [10]:
def cosine_sim(a,b):
    return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

def predict(row, prompt, options):
    sel = []
    for opt in options:
        score = cosine_sim(row[prompt],row[opt])
        sel.append(score)
    order = np.argsort(np.array(sel))[::-1]
    return ' '.join(choice[order][:3])

df_sub['prediction_2'] = df_sub.apply(lambda row: predict(row,'prompt',choice), axis=1)

  return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))


In [11]:
submission_df = pd.DataFrame()
pred_col = 'prediction_2'
submission_df['id'] = df_sub['id']
submission_df['prediction'] = df_sub[pred_col]

In [12]:
submission_df.to_csv('submission.csv', index=False)

Following are the three technique to get numeric vector representation of each text message:
<br>(a) Bag-of-Words
<br>(b) TF-IDF
<br>(c) Word2Vec

TF-IDF
<br>1) The tf–idf is the product of two statistics, term frequency and inverse document frequency. There are various ways for determining the exact values of both statistics.
<br>2) A formula that aims to define the importance of a keyword or phrase within a document or a web page.
<br><br>Hypothesis: BoW or TF-IDF will perform poor on this dataset because all of the five options are similar to each other.

## Bow implementation on sklearn

In [13]:
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
# sentence_1="This is a good job.I will not miss it for anything"
# sentence_2="This is not good at all"
# sentence_1="Welcome to Great Learning , Now start learning"
# sentence_2="Learning is a good practice"
 
# CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
#                            stop_words='english')
# #transform
# Count_data = CountVec.fit_transform([sentence_1,sentence_2])
 
# #create dataframe
# cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
# print(cv_dataframe)


In [14]:
# CountVec.vocabulary_

In [15]:
# print(sentence_1)
# print(sentence_2)

In [16]:
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
# sentence_1="This is a good job.I will not miss it for anything"
# sentence_2="This is not good at all"
# sentence_1="Welcome to Great Learning , Now start learning"
# sentence_2="Learning is a good practice"
 
# CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
#                            stop_words='english')
# #transform
# Count_data = CountVec.fit_transform([sentence_1,sentence_2])
 
# #create dataframe
# cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
# print(cv_dataframe)


Limitation:
    
1. The model ignores the location information of the word. 
2. Bag of word models doesn’t respect the semantics of the word. e.g. ‘soccer’ and ‘football’
3. The range of vocabulary is a big issue

Source: https://www.mygreatlearning.com/blog/bag-of-words/
Source : https://stats.stackexchange.com/questions/289400/quantify-the-similarity-of-bags-of-words

## TF-IDF

In [17]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Create TfidfVectorizer object
# vectorizer = TfidfVectorizer()

In [18]:
# vectorizer.vocabulary_

## kekurangan, harus di stemming dulu biar accurate dan accurately satu vocab

In [19]:
# # Generate matrix of word vectors
# tfidf_matrix = vectorizer.fit_transform(df['prompt'])

# # Print the shape of tfidf_matrix
# print(tfidf_matrix.shape)

In [20]:
# from sklearn.metrics.pairwise import cosine_similarity
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [21]:
# indices = pd.Series(df.index, index=df['prompt']).drop_duplicates()


In [22]:
# #ubah jadi ambil top 3 answer yang paling relevan
# def get_recommendations(title, cosine_sim, indices):
#     # Get the index of the movie that matches the title
#     idx = indices[title]
#     # Get the pairwsie similarity scores
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     # Get the scores for 10 most similar movies
#     sim_scores = sim_scores[1:11]
#     # Get the movie indices
#     movie_indices = [i[0] for i in sim_scores]
#     # Return the top 10 most similar movies
#     return metadata['title'].iloc[movie_indices]

## Similarity using Word2Vec and Glove Encoding

https://stackoverflow.com/questions/56071689/whats-the-major-difference-between-glove-and-word2vec

In [23]:
# import spacy
# nlp = spacy.load('en_core_web_lg')
# doc0 = nlp(df['prompt'].loc[0])
# doc1 = nlp(df['prompt'].loc[1])
# doc2 = nlp(df['prompt'].loc[2])

In [24]:
# print(doc0.similarity(doc1))

# print(doc1.similarity(doc2))

# print(doc0.similarity(doc2))


In [25]:
# By default spaCy calculates cosine similarity. Similarity is determined by comparing word vectors or word embeddings, multi-dimensional meaning representations of a word.
# print(np.dot(doc1.vector, doc2.vector) / (np.linalg.norm(doc1.vector) * np.linalg.norm(doc2.vector)))
# It seems that spaCy's .vector method created the vectors. 
# Documentation says that spaCy's models are trained from GloVe's vectors.

# spaCy provides a mapping from a vocabular of common words to vectors.
# These vectors, sometimes called "word embeddings," are designed (using the GloVe algorithm) to map semantic meaning into numeric proximity.

# The most notable difference between Word2vec 
# and GloVe is the training process. Word2vec uses a shallow neural network to create vectors, while GloVe uses a global matrix factorization technique.