## Common Sense Reasoning - NLP Assignment 

**Objective:** 

To build a system that can outperform a baseline model with an accuracy of 0.2 using fastText, word2vec, and GloVe word embeddings. Additionally, compare the performance of these pretrained embedding models in terms of accuracy.


In [150]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import json
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
import gensim.downloader

### 1- Load from the JSON file

In [151]:
data = []
with open('train_rand_split.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

        
data[:2] 

[{'answerKey': 'A',
  'id': '075e483d21c29a511267ef62bedc0461',
  'question': {'question_concept': 'punishing',
   'choices': [{'label': 'A', 'text': 'ignore'},
    {'label': 'B', 'text': 'enforce'},
    {'label': 'C', 'text': 'authoritarian'},
    {'label': 'D', 'text': 'yell at'},
    {'label': 'E', 'text': 'avoid'}],
   'stem': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?'}},
 {'answerKey': 'B',
  'id': '61fe6e879ff18686d7552425a36344c8',
  'question': {'question_concept': 'people',
   'choices': [{'label': 'A', 'text': 'race track'},
    {'label': 'B', 'text': 'populated areas'},
    {'label': 'C', 'text': 'the desert'},
    {'label': 'D', 'text': 'apartment'},
    {'label': 'E', 'text': 'roadblock'}],
   'stem': 'Sammy wanted to go to where the people were.  Where might he go?'}}]

In [152]:
       
questions = []
answer_keys = []
all_answers = []

for entry in data:
    question = entry['question']['stem']
    answer_key = entry['answerKey']
    choices = entry['question']['choices']
    all_choices = [choice['text'] for choice in choices]
    
    # Concatenate all the answers into a single string
    all_answers.append(", ".join(all_choices))
    
    # Append the question and answer key to their respective lists
    questions.append(question)
    answer_keys.append(answer_key)

df = pd.DataFrame({'question': questions, 'answerKey': answer_keys, 'all_answers': all_answers})


df.head()


Unnamed: 0,question,answerKey,all_answers
0,The sanctions against the school were a punish...,A,"ignore, enforce, authoritarian, yell at, avoid"
1,Sammy wanted to go to where the people were. ...,B,"race track, populated areas, the desert, apart..."
2,To locate a choker not located in a jewelry bo...,A,"jewelry store, neck, jewlery box, jewelry box,..."
3,Google Maps and other highway and street GPS s...,D,"united states, mexico, countryside, atlas, oceans"
4,"The fox walked from the city into the forest, ...",C,"pretty flowers., hen house, natural habitat, s..."


### 2- Pre-process the text data

In [153]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


In [154]:
# Apply text preprocessing to the 'question' column
df['question'] = df['question'].apply(preprocess_text)
df.head()

Unnamed: 0,question,answerKey,all_answers
0,sanction school punishing blow seemed effort s...,A,"ignore, enforce, authoritarian, yell at, avoid"
1,sammy wanted go people might go,B,"race track, populated areas, the desert, apart..."
2,locate choker located jewelry box boutique wou...,A,"jewelry store, neck, jewlery box, jewelry box,..."
3,google map highway street gps service replaced,D,"united states, mexico, countryside, atlas, oceans"
4,fox walked city forest looking,C,"pretty flowers., hen house, natural habitat, s..."


### 3- Random baseline model

In [155]:

def calculate_accuracy_random(data):
    correct_predictions = 0
    total_questions = len(data)

    for question in data:
        answer_key = question["answerKey"]
        prediction_label = random.choice(question["question"]["choices"])["label"]

        if prediction_label == answer_key:
            correct_predictions += 1

    accuracy = correct_predictions / total_questions
    return accuracy

# Calculate accuracy for random baseline model
accuracy_random_baseline = calculate_accuracy_random(data)

print(f"Accuracy for random baseline model: {accuracy_random_baseline}")

Accuracy for random baseline model: 0.20429114053998562


In [156]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(questions, labels, test_size=0.2, random_state=42)


### 3- Load pre-trained word embeddings


In [22]:
word2vec_model = gensim.downloader.load('word2vec-google-news-300')
glove_model = gensim.downloader.load('glove-wiki-gigaword-300')
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

### 4- Convert the text data into vector representations

In [157]:
def get_word_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model:
            embeddings.append(model[word])
        else:
            embeddings.append(np.zeros(model.vector_size))  # Use zero vector for out-of-vocabulary words
    return np.mean(embeddings, axis=0)


In [158]:
X_train_word2vec = np.array([get_word_embeddings(text, word2vec_model) for text in X_train])
X_test_word2vec = np.array([get_word_embeddings(text, word2vec_model) for text in X_test])


In [159]:
X_train_glove = np.array([get_word_embeddings(text, glove_model) for text in X_train])
X_test_glove = np.array([get_word_embeddings(text, glove_model) for text in X_test])


In [160]:
X_train_fasttext = np.array([get_word_embeddings(text, fasttext_model) for text in X_train])
X_test_fasttext = np.array([get_word_embeddings(text, fasttext_model) for text in X_test])


### 5- Train a classifier model - Using RF


In [161]:
classifier_word2vec = RandomForestClassifier(random_state=42)
classifier_word2vec.fit(X_train_word2vec, y_train)


In [162]:
classifier_glove = RandomForestClassifier(random_state=42)
classifier_glove.fit(X_train_glove, y_train)


In [163]:
classifier_fasttext = RandomForestClassifier(random_state=42)
classifier_fasttext.fit(X_train_fasttext, y_train)


### 6- Evaluate the performance of the models

In [164]:
y_pred_word2vec = classifier_word2vec.predict(X_test_word2vec)
accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)
print("Accuracy using Word2Vec:", accuracy_word2vec)

Accuracy using Word2Vec: 0.20369420215495126


In [165]:
y_pred_glove = classifier_glove.predict(X_test_glove)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print("Accuracy using GloVe:", accuracy_glove)

Accuracy using GloVe: 0.2047203694202155


In [166]:
y_pred_fasttext = classifier_fasttext.predict(X_test_fasttext)
accuracy_fasttext = accuracy_score(y_test, y_pred_fasttext)
print("Accuracy using fastText:", accuracy_fasttext)

Accuracy using fastText: 0.20266803488968702
