# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk


# Import Data

In [None]:
conclusions = pd.read_csv('/content/drive/MyDrive/new-mind/conclusions.csv')
opinions = pd.read_csv('/content/drive/MyDrive/new-mind/opinions.csv')
topics = pd.read_csv('/content/drive/MyDrive/new-mind/topics.csv')

# Preprocess the Data

In [None]:
# Drop the id, type and effectiveness columns, keeping only topic_id and text
topics = topics.drop(['id', 'type', 'effectiveness'], axis=1)
topics.head()

Unnamed: 0,topic_id,text
0,007ACE74B050,"On my perspective, I think that the face is a ..."
1,00944C693682,With so many things in this world that few peo...
2,00BD97EA4041,"No because, why should a computer know how you..."
3,00C6E82FE5BA,I think that it wouldn't be valueable to have ...
4,013B9AA6B9DB,"Well, some people believe that it was somethin..."


In [None]:
# Collapse the topics table to one row per topic_id by joining all text rows
# that share a topic_id into a single space-separated string
merged_topics = topics.groupby('topic_id')['text'].apply(lambda x: ' '.join(x)).reset_index()
merged_topics.head()

Unnamed: 0,topic_id,text
0,00066EA9880D,The developement of these cars should be stopp...
1,000E6DE9E817,I am arguing against the policy change
2,0016926B079C,I think that students would benefit from learn...
3,00203C45FC55,Distance education is actually detrimental to ...
4,0029F4D19C3F,I think that's crazy! If kids don't have and o...


In [None]:
opinions.head()

Unnamed: 0,id,topic_id,text,type,effectiveness
0,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
1,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
2,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate
3,36a565e45db7,007ACE74B050,"though some say that life on Mars does exist, ...",Rebuttal,Ineffective
4,fb65fe816ba3,007ACE74B050,"It says in paragraph 7, on April 5, 1998, Mars...",Evidence,Adequate


In [None]:
# Attach each opinion to its topic's merged text via topic_id. Both frames have
# a 'text' column, so the result carries text_x (from merged_topics) and
# text_y (from opinions), as the info() output below shows.
dataset = pd.merge(merged_topics, opinions, on='topic_id')
# Keep only the two text columns and the opinion's type label
dataset = dataset.drop(['id', 'topic_id', 'effectiveness'], axis=1)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26743 entries, 0 to 26742
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text_x  26743 non-null  object
 1   text_y  26743 non-null  object
 2   type    26743 non-null  object
dtypes: object(3)
memory usage: 626.9+ KB


In [None]:
dataset.head()

Unnamed: 0,text_x,text_y,type
0,The developement of these cars should be stopp...,the driver will be alerted when they will need...,Claim
1,The developement of these cars should be stopp...,This is such a dangerous thing because we all ...,Evidence
2,The developement of these cars should be stopp...,Another thing that can go wrong with these car...,Claim
3,The developement of these cars should be stopp...,Every person with any kind of technological de...,Evidence
4,The developement of these cars should be stopp...,who to blame for the wreck if there were possi...,Claim


In [None]:
import nltk
import string

# Download necessary resources (comment this out if already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


import functools


@functools.lru_cache(maxsize=1)
def _text_resources():
  """Return (stop-word set, lemmatizer), loading them only on first use.

  preprocess_text is applied to every row of the dataset, so caching avoids
  re-reading the stop-word corpus on each call, and the frozenset gives
  constant-time membership tests instead of a list scan per token.
  """
  stop_words = frozenset(nltk.corpus.stopwords.words('english'))
  return stop_words, nltk.WordNetLemmatizer()


# Translation table that deletes every ASCII punctuation character in one pass
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
  """Normalize a piece of text for vectorization.

  Steps: lowercase, delete punctuation, tokenize with NLTK, drop English
  stop words, lemmatize each remaining token with WordNet, and join the
  tokens back into one space-separated string.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned text as a single space-separated string.
  """
  # Lowercase and strip punctuation
  text = text.lower().translate(_PUNCTUATION_TABLE)
  # Tokenize text (split into words)
  tokens = nltk.word_tokenize(text)
  stop_words, lemmatizer = _text_resources()
  # Remove stop words, then lemmatize what is left. The lemmatizer used to be
  # created but never applied, so tokens were returned un-lemmatized.
  tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  # Remove empty tokens and join
  return ' '.join(token for token in tokens if token)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# preprocess the text_x and text_y column
dataset['text_x'] = dataset['text_x'].apply(preprocess_text)
dataset['text_y'] = dataset['text_y'].apply(preprocess_text)
dataset.head()

Unnamed: 0,text_x,text_y,type
0,developement car stopped immediately many haza...,driver alerted need take driving responsibilit...,Claim
1,developement car stopped immediately many haza...,dangerous thing know whenever human get attent...,Evidence
2,developement car stopped immediately many haza...,another thing go wrong car type techological m...,Claim
3,developement car stopped immediately many haza...,every person kind technological device experie...,Evidence
4,developement car stopped immediately many haza...,blame wreck possibly sort technological malfun...,Claim


In [None]:
# Preprocessed text of each merged topic (one entry per topic_id)
topics_df = merged_topics['text'].apply(preprocess_text)
# Preprocessed text of each individual opinion
comments_df = opinions['text'] .apply(preprocess_text)

In [None]:
dataset.head()

Unnamed: 0,text_x,text_y,type
0,developement car stopped immediately many haza...,driver alerted need take driving responsibilit...,Claim
1,developement car stopped immediately many haza...,dangerous thing know whenever human get attent...,Evidence
2,developement car stopped immediately many haza...,another thing go wrong car type techological m...,Claim
3,developement car stopped immediately many haza...,every person kind technological device experie...,Evidence
4,developement car stopped immediately many haza...,blame wreck possibly sort technological malfun...,Claim


# Model

## Option 1: Tfidf Vectorizer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Features: the preprocessed opinion text (text_y)
x = dataset['text_y']
# Target: the preprocessed merged topic text (text_x), so every distinct topic
# string acts as its own class label
y = dataset['text_x']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform the test split
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

# Predict a topic string for every held-out opinion
y_pred = model.predict(x_test_tfidf)

# Fraction of held-out opinions whose predicted topic string matches exactly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# The preprocessed topic texts are stored in topics_df; `preprocessed_topics`
# is never defined in this notebook.
topics_df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4005 entries, 0 to 4004
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4005 non-null   object
dtypes: object(1)
memory usage: 31.4+ KB


## Option 2: BERT (bert-base-uncased)

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained bert-base-uncased tokenizer and encoder model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_sentence_embedding(sentence):
    """Embed *sentence* with the notebook-level BERT tokenizer and model.

    The text is tokenized (truncated to at most 128 tokens) and run through
    the model with gradient tracking disabled. The hidden state at the first
    token position, i.e. the [CLS] token, is returned as a NumPy array.
    """
    encoded = tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        model_output = model(**encoded)
    # First element of the model output holds the per-token hidden states;
    # position 0 along the token axis is the [CLS] token.
    hidden_states = model_output[0]
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

get_sentence_embedding(topics_df[0]).shape

(1, 768)

In [None]:
topics['text']

In [None]:
from tqdm import tqdm

# Embed every topic row and keep (topic_id, embedding) pairs so an embedding
# can be mapped back to its topic later. Each row is embedded from its own
# text; indexing topics.iloc[0] inside the loop would store the first topic's
# embedding for every entry.
topic_embeddings = [
    (topic_id, get_sentence_embedding(text))
    for topic_id, text in tqdm(zip(topics['topic_id'], topics['text']), total=len(topics))
]

100%|██████████| 4024/4024 [06:28<00:00, 10.36it/s]


In [None]:
# pickle topic_embeddings
import pickle
with open('/content/drive/MyDrive/new-mind/topic_embeddings.pkl', 'wb') as f:
    pickle.dump(topic_embeddings, f)

('007ACE74B050',
 array([[-5.14411449e-01, -2.58573820e-03, -8.96150395e-02,
         -2.39867926e-01, -4.14842516e-02, -6.71444118e-01,
          3.94392848e-01,  1.09256208e+00,  4.65742975e-01,
         -9.29346442e-01,  1.73650786e-01, -1.81320742e-01,
         -3.96660089e-01,  2.37740412e-01,  6.32460713e-01,
          1.77152753e-01, -3.11897606e-01,  8.18433642e-01,
          1.20577291e-01,  2.65999347e-01,  5.76491058e-02,
         -3.31549972e-01,  1.03003472e-01, -2.44283885e-01,
          7.05133155e-02, -2.02340290e-01,  1.87549833e-02,
         -3.66087019e-01, -7.64015689e-02, -5.44314869e-02,
         -3.51164758e-01, -4.58184257e-02, -9.67276514e-01,
         -5.58649540e-01,  1.11720763e-01, -2.64584869e-01,
         -1.81115896e-01,  3.79292697e-01, -2.93563828e-02,
         -1.35106206e-01, -2.57255465e-01,  1.05176367e-01,
         -1.85861945e-01, -2.23252743e-01, -1.29182026e-01,
          1.25824809e-01, -3.75820208e+00,  1.39303403e-02,
          4.83538881e-0

In [None]:
opinions.iloc[0].text

'I think that the face is a natural landform because there is no life on Mars that we have descovered yet '

In [None]:
# Look up the topic text that belongs to one example opinion.
# `comment` is set explicitly here so the cell does not depend on leftover
# notebook state, and the id is not stored under the builtin name `id`.
comment = opinions.iloc[0].text
topic_id = opinions[opinions['text'] == comment].topic_id.iloc[0]
topic = topics[topics['topic_id'] == topic_id].text.iloc[0]
print(topic)

On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform 


In [None]:
# generate a list of 100 random numbers less than the length of opinions
random_numbers = np.random.randint(0, len(opinions), 100)

In [None]:
# Evaluate BERT-based retrieval on the sampled opinions: a hit means the
# opinion's own topic is among the 10 topics with the most cosine-similar
# embedding.
correct = 0
incorrect = 0

# Stack the per-topic embeddings once so each opinion needs a single
# cosine_similarity call instead of one call per topic.
topic_matrix = np.vstack([topic_embedding for _, topic_embedding in topic_embeddings])
known_topic_ids = set(topics['topic_id'])

# Iterate over the random sample drawn in the previous cell rather than the
# first 100 rows, which mostly belong to the same few topics.
for i in random_numbers:
  # get the comment embedding
  comment = opinions.iloc[i]
  comment_embedding = get_sentence_embedding(comment.text)

  # similarity of this opinion to every topic, one score per topic
  similarities = cosine_similarity(comment_embedding, topic_matrix)[0]

  # argsort is ascending, so reverse it to put the most similar topics first
  most_similar_topic_index = np.argsort(similarities)[::-1][:10]
  most_similar_topic_ids = [topic_embeddings[index][0] for index in most_similar_topic_index]

  # skip opinions whose topic is missing from the topics table
  actual_topic_id = comment.topic_id
  if actual_topic_id not in known_topic_ids:
    continue

  # compare by topic id; texts are not unique per topic
  if actual_topic_id in most_similar_topic_ids:
    correct += 1
  else:
    incorrect += 1

print("Correct:", correct)
print("Incorrect:", incorrect)

Correct: 6
Incorrect: 93


## Sentence Similarity

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

sent_model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)


In [None]:
topics_emb = sent_model.encode(topics['text'].tolist())

In [None]:
type(topics_emb)
np.save("/content/drive/MyDrive/new-mind/topics_emb.npy", topics_emb)

In [None]:
# One topic_id per row of `topics`, in row order, so topic_ids[k] labels the
# k-th text passed to sent_model.encode and therefore row k of topics_emb.
# Reading the column directly avoids a full-table text search per row, which
# is quadratic and yields the first match's id whenever a text is duplicated.
topic_ids = topics['topic_id'].tolist()

len(topic_ids)

4024

In [None]:
topics_ids = np.array(topic_ids)
np.save("/content/drive/MyDrive/new-mind/topics_ids.npy", topics_ids)

In [None]:
from sentence_transformers.util import cos_sim

# Score one example opinion against every topic embedding. The unfinished
# `for i in opinions` header was a syntax error and is removed; the example
# row is chosen explicitly instead of relying on a leftover `comment`.
comment = opinions.iloc[0]
out = cos_sim(sent_model.encode(comment.text), topics_emb)

In [None]:
np.argsort(out[0])

tensor([3213, 1281, 2809,  ...,  870,  679,  626])

In [None]:
def calculate_similarity(your_sentence_embedding, sentence_embeddings, batch_size=100):
  """Dot-product score of one embedding against each row of a matrix.

  The rows of sentence_embeddings are processed batch_size at a time; the
  query is reshaped to a single row and multiplied with the transposed batch.
  Returns a flat Python list with one score per row, in row order.
  """
  batch_starts = range(0, len(sentence_embeddings), batch_size)
  per_batch_scores = (
      np.dot(
          your_sentence_embedding.reshape(1, -1),
          sentence_embeddings[start:start + batch_size].T,
      ).tolist()[0]
      for start in batch_starts
  )
  # Flatten the per-batch score lists into one list
  return [score for batch_scores in per_batch_scores for score in batch_scores]


In [None]:
# generate 3000 random numbers less than the length of opinions
random_numbers = np.random.randint(0, len(opinions), 3000)

In [None]:
# Count, over the sampled opinions, how often the opinion's own topic is ranked
# within the top 1 / 5 / 10 / 20 topics by similarity.
top1 = 0
top5 = 0
top10 = 0
top20 = 0
for i in tqdm(random_numbers):
  # get the comment embedding
  comment = opinions.iloc[i]
  comment_embedding = sent_model.encode(comment.text)

  # calculate sentence similarity for all topics
  similarities = calculate_similarity(comment_embedding, topics_emb)

  # indices of all topics, most similar first
  most_similar_topic_index = np.argsort(similarities)[::-1]
  # only the 20 best-ranked ids are ever inspected, so map just those
  top_topic_ids = [topic_ids[index] for index in most_similar_topic_index[:20]]

  # the opinion's real topic (named so the builtin `id` is not shadowed)
  actual_topic_id = comment.topic_id
  if actual_topic_id in top_topic_ids:
    top20 += 1
  if actual_topic_id in top_topic_ids[:10]:
    top10 += 1
  if actual_topic_id in top_topic_ids[:5]:
    top5 += 1
  if actual_topic_id in top_topic_ids[:1]:
    top1 += 1


top1, top5, top10, top20

100%|██████████| 3000/3000 [10:51<00:00,  4.60it/s]


(76, 202, 319, 504)

In [None]:
# Hit rates over the evaluated sample. The denominator is the sample size, so
# the rates stay correct if the number of sampled opinions changes.
n_samples = len(random_numbers)
top1 / n_samples, top5 / n_samples, top10 / n_samples, top20 / n_samples

(0.025333333333333333, 0.06733333333333333, 0.10633333333333334, 0.168)

In [None]:
most_similar_topic_index[0]

tensor([3585,  961,   36,  ..., 2554,    0,  540])

In [None]:
most_similar_topic_index, np.argsort(similarities)[:5]

(0, array([   0, 2674, 2675, 2676, 2677]))