## Assi #3 FastText

### 1. Load Yelp Dataset
### 2. Text Preprocessing
### 3. Modeling
### 4. Pretrained-FastText
### 5. Load Annoy
### 6. Extract Embeddings
### 7. Find Similar Words
### 8. Find Opposite Words
### 9. Pretrained vs Scratch Train Comparison
### 10. Extract CSV Table for Results

# Silvana Yacoub 20201091
# Maher Mohsen   20200415

In [1]:
  

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/Dataset_User_Agreement.pdf
/kaggle/input/yelp_academic_dataset_review.json
/kaggle/input/yelp_academic_dataset_checkin.json
/kaggle/input/yelp_academic_dataset_business.json
/kaggle/input/yelp_academic_dataset_tip.json
/kaggle/input/yelp_academic_dataset_user.json


## 1. Load Yelp Dataset

In [7]:
import json
import pandas as pd
data_file = open("/kaggle/input/yelp_academic_dataset_tip.json")
data = []
for line in data_file:
    data.append(json.loads(line))
tip_df = pd.DataFrame(data)
data_file.close()


In [8]:
tip_df

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0
...,...,...,...,...,...
908910,eYodOTF8pkqKPzHkcxZs-Q,3lHTewuKFt5IImbXJoFeDQ,Disappointed in one of your managers.,2021-09-11 19:18:57,0
908911,1uxtQAuJ2T5Xwa_wp7kUnA,OaGf0Dp56ARhQwIDT90w_g,Great food and service.,2021-10-30 11:54:36,0
908912,v48Spe6WEpqehsF2xQADpg,hYnMeAO77RGyTtIzUSKYzQ,Love their Cubans!!,2021-11-05 13:18:56,0
908913,ckqKGM2hl7I9Chp5IpAhkw,s2eyoTuJrcP7I_XyjdhUHQ,Great pizza great price,2021-11-20 16:11:44,0


In [9]:
# Select only the 'text' column and convert it to a Series
text_series = tip_df['text']

# Select a subset of rows (e.g., first 10000 rows)
text_subset_series = text_series.head(10000)

# Convert the Series to a DataFrame with only one column
text_subset_df = pd.DataFrame({'text': text_subset_series})

# Display the subset DataFrame
print(text_subset_df)


                                                   text
0                        Avengers time with the ladies.
1     They have lots of good deserts and tasty cuban...
2                It's open even when you think it isn't
3                             Very decent fried chicken
4                Appetizers.. platter special for lunch
...                                                 ...
9995         We love the nachos and jerk chicken wings!
9996           Mom, Dad, Sis, nephews, & us for brunch.
9997                                          Meh. Dry.
9998        Great latte. Extra hot, just how I like it!
9999  Late flight to Dallas for conference in Dr Wor...

[10000 rows x 1 columns]


## 2. Text Preprocessing

In [10]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Function to preprocess text
def preprocess_text(text):
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove leading/trailing white spaces and convert to lowercase
    text = text.strip().lower()
    
    # Tokenize the text
    tokens = nlp(text)
    
    # Lemmatize tokens and remove stopwords and non-alphabetic tokens
    tokens = [token.lemma_ for token in tokens if token.lemma_ not in STOP_WORDS and token.is_alpha]
    
    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the 'text' column
text_subset_df['preprocessed_text'] = text_subset_df['text'].apply(preprocess_text)

# Display the preprocessed text
print(text_subset_df['preprocessed_text'])


0                                       avenger time lady
1                    lot good desert tasty cuban sandwich
2                                              open think
3                                    decent fried chicken
4                         appetizer platter special lunch
                              ...                        
9995                        love nachos jerk chicken wing
9996                           mom dad sis nephews brunch
9997                                              meh dry
9998                         great latte extra hot I like
9999    late flight dallas conference dr worth bag fly...
Name: preprocessed_text, Length: 10000, dtype: object


## 3. Modeling

In [33]:
from gensim.models import FastText
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_data, valid_data = train_test_split(text_subset_df[['preprocessed_text']], test_size=0.2, random_state=42)

# Tokenize the text data using Gensim's simple_preprocess
train_corpus = [simple_preprocess(text) for text in train_data['preprocessed_text']]
valid_corpus = [simple_preprocess(text) for text in valid_data['preprocessed_text']]

# Train the FastText model
model = FastText(vector_size=100, window=10, min_count=1, workers=4)
model.build_vocab(corpus_iterable=train_corpus)
model.train(corpus_iterable=train_corpus, total_examples=len(train_corpus), epochs=1500)


(61875750, 69090000)

## 4. Pretrained-FastText

In [12]:
import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-et-vectors", filename="model.bin")
pretrained_model = fasttext.load_model(model_path)

model.bin:   0%|          | 0.00/7.24G [00:00<?, ?B/s]



In [14]:
import random
import string
from annoy import AnnoyIndex

random_words = ['apple', 'banana', 'carrot', 'dog', 'elephant', 'flower', 'guitar', 'house', 'internet', 'jungle', 'kite', 'lion', 'mountain', 'notebook', 'orange', 'penguin', 'queen', 'rabbit', 'sunshine', 'tree']


## 5. Load Annoy

In [15]:
from annoy import AnnoyIndex
import random

# Annoy Index Setup
def setup_annoy_index(pretrained_model):
    num_dims = pretrained_model.get_dimension()
    annoy_index = AnnoyIndex(num_dims, 'angular')
    for i, word in enumerate(pretrained_model.get_words()):
        vector = pretrained_model.get_word_vector(word)
        annoy_index.add_item(i, vector)
    annoy_index.build(50)  # 50 trees for approximation
    return annoy_index


## 6. Extract Embeddings

In [16]:
# Word Vector Retrieval Functions
def get_embedding_vectors_pretrained(pretrained_model, words):
    return [pretrained_model.get_word_vector(word) for word in words]

def get_embedding_vectors_model(model, words):
    return [model.wv[word] for word in words]

In [28]:
# Call the function to get embedding vectors using the pretrained model
embedding_vectors_pretrained = get_embedding_vectors_pretrained(pretrained_model, random_words)

# Call the function to get embedding vectors using the model
embedding_vectors_model = get_embedding_vectors_model(model, random_words)


In [38]:
embedding_vectors_pretrained[0]

array([-0.04744264,  0.00384141,  0.00681557,  0.04292644,  0.00821912,
        0.00470234, -0.00639371, -0.01219297,  0.03871919,  0.10926136,
        0.00069809, -0.01011722, -0.02791139,  0.04468519, -0.05492625,
        0.05739029,  0.00486601, -0.08057441,  0.03488096,  0.00156933,
        0.04467723,  0.01161048, -0.09750076,  0.05279666,  0.01010914,
       -0.0185289 ,  0.08211946,  0.03969616, -0.02790017,  0.04107035,
       -0.0087419 , -0.03521007,  0.05865506,  0.02087817,  0.04321015,
        0.00992097, -0.04947798,  0.01697828, -0.05086547,  0.00774659,
        0.00903939, -0.00128553, -0.05244497, -0.06282768,  0.03611057,
       -0.02365493, -0.02683609, -0.01017902, -0.00073479, -0.04777697,
        0.01084739,  0.05563303, -0.06064871,  0.0955272 ,  0.02082256,
        0.01266333, -0.11118156, -0.02926121, -0.0423243 ,  0.01441076,
        0.0282505 ,  0.02537795,  0.01983598, -0.0151052 , -0.00106145,
        0.02459156, -0.01237434, -0.03405061,  0.01224788, -0.01

In [39]:
embedding_vectors_model[0]

array([-1.7426789 ,  1.3343756 ,  1.4989388 , -3.3512259 ,  0.1364948 ,
        1.2998782 ,  1.5213771 ,  3.2781029 , -0.24570261,  2.0123072 ,
       -5.4301324 ,  0.5975554 , -2.4727957 ,  2.1775308 , -2.848344  ,
        3.012284  ,  1.3892561 , -5.005315  , -2.7365255 ,  1.6266829 ,
        2.9624965 , -3.1665933 ,  3.8534505 , -3.722627  , -0.17431489,
       -0.753417  , -0.37247497, -0.92289835,  2.2489753 , -0.07911886,
       -3.8998075 ,  3.303678  ,  3.94861   ,  1.9617566 ,  1.7799965 ,
       -2.0607963 , -0.35215387,  1.2693568 ,  3.9579241 , -1.6779844 ,
       -1.4867896 ,  1.3114182 ,  0.28707972, -1.4058387 , -0.9827716 ,
       -0.13998191, -2.3808253 ,  1.5548446 , -2.0455868 , -0.8498095 ,
        5.9399014 ,  4.671759  ,  4.89676   ,  2.291641  ,  3.0579643 ,
       -1.4225575 ,  2.3422408 , -3.9113786 ,  2.1370952 , -0.22201627,
       -5.6238914 , -0.33667004, -0.06739493,  1.9262583 , -0.5009882 ,
        4.112852  ,  1.1300583 ,  4.292957  , -0.60009676, -1.75

## 7. Find Similar Words

In [29]:
# Word Similarity Functions
def find_similar_words_pretrained(pretrained_model, word, topn=10):
    return pretrained_model.get_nearest_neighbors(word, k=topn)

def find_similar_words_model(model, word, topn=10):
    return model.wv.most_similar(word, topn=topn)


## 8. Find Opposite Words

In [30]:
# Word Opposite Functions
def find_opposite_words_pretrained(pretrained_model, annoy_index, word, topn=10):
    word_vector = pretrained_model.get_word_vector(word)
    similar_indices = annoy_index.get_nns_by_vector(-word_vector, topn, include_distances=True)
    opposite_words = [(pretrained_model.get_words()[idx], 1 - distance) for idx, distance in zip(similar_indices[0], similar_indices[1])]
    return opposite_words

def find_opposite_words_model(model, word, topn=10):
    opposite_words = model.wv.most_similar(negative=[word], topn=topn)
    return opposite_words

## 9. Pretrained vs Scratch Train Comparison

In [53]:
# Main Function
def compare_models(pretrained_model, model):
    
    # Setup Annoy Index
    annoy_index = setup_annoy_index(pretrained_model)
    
    # Find similar and opposite words for each random word using pretrained model
    similar_words_dict_pretrained = {}
    opposite_words_dict_pretrained = {}
    for word in random_words:
        similar_words_dict_pretrained[word] = find_similar_words_pretrained(pretrained_model, word)
        opposite_words_dict_pretrained[word] = find_opposite_words_pretrained(pretrained_model, annoy_index, word)
    
    # Find similar and opposite words for each random word using model
    similar_words_dict_model = {}
    opposite_words_dict_model = {}
    for word in random_words:
        similar_words_dict_model[word] = find_similar_words_model(model, word)
        opposite_words_dict_model[word] = find_opposite_words_model(model, word)
    
    # Print the results for both models
    for word in random_words:
        print(f"Word: {word}\n")
        
        print("Similar Words (Pretrained Model):")
        for similar_word, similarity_score in similar_words_dict_pretrained[word]:
            print(f"{similar_word}: {similarity_score}")
        print("\n")
        
        print("Opposite Words (Pretrained Model):")
        for opposite_word, opposite_score in opposite_words_dict_pretrained[word]:
            print(f"{opposite_word}: {opposite_score}")
        print("\n")
        
        print("Similar Words (Model):")
        for similar_word, similarity_score in similar_words_dict_model[word]:
            print(f"{similar_word}: {similarity_score}")
        print("\n")
        
        print("Opposite Words (Model):")
        for opposite_word, opposite_score in opposite_words_dict_model[word]:
            print(f"{opposite_word}: {opposite_score}")
        print("\n")
        print("-------------------------------------------------------------------")
    return similar_words_dict_pretrained, opposite_words_dict_pretrained, similar_words_dict_model, opposite_words_dict_model

In [54]:
# Call the main function
similar_words_dict_pretrained, opposite_words_dict_pretrained, similar_words_dict_model, opposite_words_dict_model = compare_models(pretrained_model, model)

Word: apple

Similar Words (Pretrained Model):
0.6745341420173645: Snapple
0.6637983918190002: appleyard
0.6263399124145508: apples
0.6080800890922546: applespoti
0.6041028499603271: applet
0.5897731184959412: Nipple
0.5882380604743958: applespot
0.5674155354499817: apple-lisa
0.5537776350975037: Rypple
0.5484605431556702: Apple


Opposite Words (Pretrained Model):
QB1: -0.15393579006195068
β0: -0.15540635585784912
80,7.: -0.16572999954223633
D-x: -0.1677711009979248
ÜPK: -0.17205655574798584
EÕSi: -0.17626047134399414
Zipfi: -0.17942500114440918
q0: -0.17958998680114746
ERHS: -0.18480980396270752
Ew: -0.18548107147216797


Similar Words (Model):
applebee: 0.7106369137763977
apply: 0.6901652216911316
scrapple: 0.6876152157783508
ripple: 0.636448860168457
pineapple: 0.558565616607666
appt: 0.515812337398529
triple: 0.4471432864665985
approval: 0.4225643575191498
applicable: 0.40602049231529236
ample: 0.3747911751270294


Opposite Words (Model):
gs: 0.3849387764930725
customize: 0.340331

## 10. Extract CSV Table for Results

In [161]:
import csv

def export_to_csv(pretrained_results, scratch_results, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write header row
        writer.writerow(["Word", "Pretrained Similar Word 1", "Pretrained Similarity 1",
                         "Pretrained Opposite Word 1", "Pretrained Opposite Similarity 1",
                         "Scratch Similar Word 1", "Scratch Similarity 1",
                         "Scratch Opposite Word 1", "Scratch Opposite Similarity 1"])

        # Write data rows
        for word in pretrained_results['similar_words']:
            pretrained_similar = pretrained_results['similar_words'][word]
            pretrained_opposite = pretrained_results['opposite_words'][word]
            scratch_similar =  scratch_results['similar_words'][word]
            
            scratch_opposite = scratch_results['opposite_words'][word]
            
            row = [word]

            for i in range(10):
                if i < len(pretrained_similar):
                    row.extend(pretrained_similar[i])
                else:
                    row.extend(["", ""])

                if i < len(pretrained_opposite):
                    row.extend(pretrained_opposite[i])
                else:
                    row.extend(["", ""])

                if i < len(scratch_similar):
                    row.extend(scratch_similar[i])
                else:
                    row.extend(["", ""])

                if i < len(scratch_opposite):
                    row.extend(scratch_opposite[i])
                else:
                    row.extend(["", ""])

                writer.writerow(row)

In [162]:
export_to_csv(pretrained_results, scratch_results, "word_embedding_comparison22.csv")


In [59]:
model.save('fast-text-v0.bin')