<a href="https://colab.research.google.com/github/nkrj01/Tweets-analysis-with-Open-AI/blob/main/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Creating Ada and tf-idf embedding**
This notebook creates Ada embeddings using OpenAI and tf-idf embeddings using scikit-learn. The embeddings are saved as CSV files and used in a separate notebook for classification.

In [None]:
! pip install cohere
! pip install openai

In [None]:
import ast
import openai
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer

## **Helper functions**

In [None]:
from sklearn.externals._packaging.version import collections
def removeHyperlinks(col):
  """Return `col` with every hyperlink deleted.

  A hyperlink is any run of non-whitespace characters that starts with
  ``http://``, ``https://`` or ``www.``. Whitespace surrounding a removed
  link is left untouched.

  Args:
    col: a single string (one cell of the text column).

  Returns:
    The string with all matching links replaced by the empty string.
  """
  return re.sub(r'https?://\S+|www\.\S+', '', col)

def removeSpecialCharacters(col):
    """Keep only ASCII letters and single spaces in `col`.

    Every character that is neither an ASCII letter nor whitespace is removed
    anywhere in the string (digits and punctuation included). Runs of
    whitespace are then collapsed to a single space and whitespace is trimmed
    from BOTH ends, so a tweet that started with a removed link or symbol does
    not keep a stray leading space.

    Args:
        col: a single string (one cell of the text column).

    Returns:
        The cleaned string; may be empty.
    """
    letters_and_spaces = re.sub(r'[^A-Za-z\s]+', '', col)
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()


def similarity_ratio(a, b):
  """Return the difflib similarity ratio of two strings.

  Args:
    a: first string.
    b: second string.

  Returns:
    A float in [0, 1]; 1.0 means the two strings are identical.
  """
  # SequenceMatcher's first positional parameter is `isjunk`, not a sequence.
  # Passing the strings as the first two positionals would compare `b` with
  # the empty string, so `None` is passed explicitly for `isjunk`.
  return SequenceMatcher(None, a, b).ratio()

def removeDuplicates(df, threshold=0.95):
  """Drop rows whose "text" is too similar to an earlier, kept row.

  Every kept row i is compared with each row j whose index label is greater
  than i; j is dropped when similarity_ratio(text_i, text_j) > threshold.
  Rows already marked for dropping are neither used as a reference nor
  compared again. The scan is quadratic in the number of rows.

  Args:
    df: DataFrame with a "text" column and numeric, orderable index labels.
    threshold: similarity above which the later row counts as a duplicate.

  Returns:
    A new DataFrame without the duplicate rows; `df` itself is not modified.
  """
  # Pull labels and texts out once instead of rebuilding a Series per row
  # with iterrows() inside the double loop.
  labels = df.index.tolist()
  texts = df["text"].tolist()
  dropped = set()  # set -> O(1) membership tests in the inner loop

  for i, text_i in zip(labels, texts):
    # progress indicator, printed every 10th index label
    if i % 10 == 0 and i != 0:
      print(i)
    if i in dropped:
      # a dropped row is never used as the reference of a comparison
      continue
    for j, text_j in zip(labels, texts):
      if j > i and j not in dropped:
        if similarity_ratio(text_i, text_j) > threshold:
          dropped.add(j)

  return df.drop(list(dropped))


def getAdaEmbedding(train_text: list, model="text-embedding-ada-002", batch_size: int = 500) -> list:
  """Embed a list of strings with the OpenAI embedding endpoint, in batches.

  Args:
    train_text: strings to embed.
    model: name of the OpenAI embedding model.
    batch_size: maximum number of strings sent per request.

  Returns:
    One embedding per input string, in the order the API returned them
    within each batch (presumably input order - same assumption as before).
    An empty input yields an empty list without contacting the API.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  ada_embedding = []
  # Stepping with range() never yields an empty slice, so no request with
  # empty input is sent when len(train_text) is 0 or an exact multiple of
  # batch_size.
  for batch_start in range(0, len(train_text), batch_size):
    batch = train_text[batch_start:batch_start + batch_size]
    output = openai.Embedding.create(input=batch, model=model)
    ada_embedding.extend(item['embedding'] for item in output['data'])
  return ada_embedding

## **Data Import**

In [None]:
# Load the raw train and test splits.
# NOTE(review): the paths point into /content/drive - assumes Google Drive is
# already mounted in this Colab session; confirm before running.
df_train = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/OpenAI/train.csv")
df_test = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/OpenAI/test.csv")
# Last expression of the cell: displays (rows, columns) of the training frame.
df_train.shape

## **Cleaning and storing the cleaned text**

In [None]:
# text cleaning function
def clean_text(df_train, drop_duplicates=True):
  """Return a cleaned copy of a frame that has "text" and "keyword" columns.

  Steps, in order:
    1. cast "text" to the pandas string dtype,
    2. remove hyperlinks (removeHyperlinks),
    3. remove special characters (removeSpecialCharacters),
    4. optionally drop near-duplicate rows (removeDuplicates),
    5. fill missing keywords with " ",
    6. append ". <keyword>" to the text.

  Args:
    df_train: input DataFrame; it is NOT modified.
    drop_duplicates: when False, step 4 is skipped so every row is kept.

  Returns:
    A new, cleaned DataFrame.
  """
  # Work on a copy so the caller's frame is never mutated as a side effect.
  df_clean = df_train.copy()
  df_clean["text"] = (
      df_clean["text"]
      .astype("string")
      .apply(removeHyperlinks)
      .apply(removeSpecialCharacters)
  )
  if drop_duplicates:
    df_clean = removeDuplicates(df_clean)
  df_clean["keyword"] = df_clean["keyword"].fillna(" ")
  df_clean["text"] = df_clean["text"] + ". " + df_clean["keyword"] # join keyword and text column
  return df_clean

df_train_clean = clean_text(df_train)
# NOTE(review): the test split is cleaned WITHOUT de-duplication so that no
# test rows are lost; pass drop_duplicates=True if dropping is really intended.
df_test_clean = clean_text(df_test, drop_duplicates=False)

# Save the frames returned by clean_text. The saved "text" column therefore
# already ends with ". <keyword>".
df_train_clean.to_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/train_clean.csv')
df_test_clean.to_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/test_clean.csv')

## **Ada Embedding using Open AI**

In [None]:
# The API key is read from the Colab user-data store under the name 'openai',
# so it never appears in the notebook itself.
from google.colab import userdata
openai.api_key = userdata.get('openai')

# importing the cleaned text
# (re-read from disk; this rebinds the name df_train used by earlier cells)
df_train = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/train_clean.csv')
train_text = df_train["text"].to_list()

# Ada embedding
# One embedding list per row, stored as a new column of the frame.
ada_embedding = getAdaEmbedding(train_text)
df_train["ada_embedding"] = ada_embedding

# saving embedded vectors
# index=False: the DataFrame index is not written to the file.
df_train.to_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/train_ada_embedded.csv', index=False)

## **TfIdf encoder**

In [None]:
# importing the cleaned text
df_train = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/train_clean.csv')
df_train["keyword"] = df_train["keyword"].fillna(" ")

# Join keyword and text, but only on rows that do not already end with the
# ". <keyword>" suffix. clean_text performs the same join, so appending
# unconditionally would duplicate the keyword whenever the file on disk
# already carries it; with this guard the cell is also safe to re-run.
keyword_suffix = ". " + df_train["keyword"]
has_suffix = pd.Series(
    [isinstance(t, str) and t.endswith(s) for t, s in zip(df_train["text"], keyword_suffix)],
    index=df_train.index,
    dtype=bool,
)
df_train["text"] = df_train["text"].where(has_suffix, df_train["text"] + keyword_suffix)

# function for removing the stop words and lemmatizing, i.e., pre processing.
def pre_processing(text):
  """Tokenise, lower-case, filter and lemmatise a list of sentences.

  For each sentence the tokens produced by word_tokenize are lower-cased,
  English stop words and single punctuation characters are discarded, and
  the remaining tokens are lemmatised and re-joined with single spaces.

  Args:
    text: iterable of sentence strings.

  Returns:
    A list with one processed (lower-case) string per input sentence.
  """
  # Initialize the lemmatizer and stop words
  lemmatizer = WordNetLemmatizer()
  stop_words = set(stopwords.words('english'))
  punctuation = set(string.punctuation)

  processed_text = []
  for sentence in text:
    # NLTK's English stop-word list is lower-case, so tokens are lower-cased
    # before the membership test; otherwise "The", "And", ... slip through.
    # The lemmatizer is also fed the lower-case form.
    tokens = [token.lower() for token in word_tokenize(sentence)]
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token not in punctuation
    ]
    processed_text.append(' '.join(lemmatized_tokens))
  return processed_text

# text pre-processing
# Stop-word removal and lemmatisation of every row of the "text" column.
text = df_train["text"].to_list()
processed_text = pre_processing(text)

# tf-idf encoding
# The vectorizer is fitted on this corpus and transforms it in one step;
# toarray() turns the result into a dense array, so memory use grows with
# rows x vocabulary size. Each row is stored as a plain Python list.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
tfidf = tfidf_matrix.toarray().tolist()
df_train["tfidf"] = tfidf

# saving the encoded vectors
# index=False: the DataFrame index is not written to the file.
df_train.to_csv(r'/content/drive/MyDrive/Colab Notebooks/OpenAI/train_tfidf_encoded.csv', index=False)