In [None]:
import re
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 15.4 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 22.7 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 41.5 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 52.3 MB/s eta 0:00:0

In [None]:
def preprocess(text):
    """Normalize a tweet before it is tokenized.

    Steps, applied per space-separated token of the lowercased text:

    * a token that starts with ``@`` and is longer than one character becomes
      the placeholder ``@user``;
    * every other token has its non-word, non-whitespace characters removed,
      and is then replaced by ``http`` if what remains starts with ``http``.

    The mention check runs *before* punctuation is stripped. Stripping first
    deletes the leading ``@``, so the check could never match and raw user
    names were passed through to the model.

    Args:
        text: Raw tweet text.

    Returns:
        The normalized text, tokens re-joined with single spaces.
    """
    new_text = []
    for t in text.lower().split(" "):
        if t.startswith('@') and len(t) > 1:
            # Keep the placeholder intact; it must not go through the
            # punctuation filter or the '@' would be lost again.
            t = '@user'
        else:
            t = re.sub(r'[^\w\s]', '', t)  # Remove special characters
            # Checked after stripping, so "https://t.co/x" -> "httpstcox" -> "http".
            if t.startswith('http'):
                t = 'http'
        new_text.append(t)
    return " ".join(new_text)

# Experiment with different threshold values
# A tweet counts as a predicted match when its cosine similarity to the query
# is >= this value.
# NOTE(review): the evaluation cell assigns this name again with the same value;
# keep the two in sync or drop one of them.
cosine_similarity_threshold = 0.5  # Adjust as needed

In [None]:
# Load the tokenizer and the encoder for the checkpoint named by MODEL.
# from_pretrained fetches the files on first use, so this cell presumably needs
# network access or a warm local cache — TODO confirm for offline runs.
# NOTE(review): TFAutoModel is imported but not used anywhere in the visible cells.
from transformers import AutoTokenizer, AutoModel, TFAutoModel
MODEL = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

In [None]:
def get_embedding(text, verbose=True):
    """Embed a piece of text as a single vector using the loaded model.

    The text is normalized with ``preprocess``, tokenized, passed through
    ``model``, and the per-token vectors of the first model output are
    max-pooled over the token axis.

    Args:
        text: Raw text to embed.
        verbose: When True (the default, matching the previous behavior) the
            preprocessed text is printed. Pass False to silence it in loops.

    Returns:
        A 1-D numpy array: the element-wise maximum over all token vectors.
    """
    # Local import keeps this cell self-contained; torch is already required
    # by the return_tensors='pt' tokenizer output.
    import torch

    text = preprocess(text)
    if verbose:
        print("Preprocessed Text:", text)  # Print preprocessed text

    # Truncate over-long input instead of failing inside the model.
    # NOTE(review): 512 is the usual RoBERTa-base token limit — confirm if MODEL changes.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        features = model(**encoded_input)

    # features[0] is the first model output (presumably the last hidden state,
    # shape (1, seq_len, hidden) — TODO confirm); [0] selects the single input.
    token_embeddings = features[0][0].cpu().numpy()
    return np.max(token_embeddings, axis=0)  # Use max-pooling instead of mean

# Text that every tweet is compared against.
query = "The book was awesome"

# Candidate tweets to score against the query.
tweets = [
    "This is an interesting topic of discussion",
    "Looking forward to the weekend",
    "Enjoyed the concert last night",
    "The weather is perfect for a picnic",
    "Learning about machine learning techniques"
]

# One label per tweet, in the same order as `tweets`; the evaluation cell treats
# 1 as "matches the query" and 0 as "does not match".
# NOTE(review): every label is 0, so there are no positive examples — true
# positives and false negatives are always 0 and precision, recall and F1 all
# evaluate to 0 at any threshold. Add positive examples for a meaningful evaluation.
ground_truth = [0, 0, 0, 0, 0]

In [None]:
# Score every tweet against the query and evaluate the thresholded predictions.
cosine_similarity_threshold = 0.5

# Labels are matched to tweets by position, so the two lists must line up.
assert len(ground_truth) == len(tweets), "ground_truth needs exactly one label per tweet"

# The query embedding does not change between tweets: compute it once.
query_embedding = get_embedding(query)

# Keep similarities in a list (same order as `tweets`). A dict keyed by tweet
# text would collapse duplicate tweets and shift every later label.
similarities = []
for tweet in tweets:
    sim = 1 - cosine(query_embedding, get_embedding(tweet))
    similarities.append(sim)
    print(f"Similarity between query and '{tweet}': {sim}")  # Print similarity score

# A tweet is predicted as a match when its similarity reaches the threshold.
predictions = [sim >= cosine_similarity_threshold for sim in similarities]

true_positive = sum(1 for label, hit in zip(ground_truth, predictions) if hit and label == 1)
false_positive = sum(1 for label, hit in zip(ground_truth, predictions) if hit and label == 0)
false_negative = sum(1 for label, hit in zip(ground_truth, predictions) if not hit and label == 1)

# Define each metric as 0.0 when its denominator is zero, rather than adding an
# epsilon that slightly distorts every non-degenerate result.
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall = true_positive / actual_positive if actual_positive else 0.0
# Named `f1` (not `f1_score`) so the sklearn.metrics.f1_score import is not overwritten.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print("Cosine Similarity Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

NameError: ignored