In [3]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import FastText
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load data
df = pd.read_json('/kaggle/input/yelpdata/yelp_academic_dataset_tip.json', lines=True)

# Select a subset of data
df = df[['text']].sample(frac=0.01)  # Adjust the fraction as per memory and processing power

# Preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')  # Using NLTK stopwords
    tokens = word_tokenize(text.lower())
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stopwords]
    return lemmatized

# Apply preprocessing
df['processed'] = df['text'].apply(preprocess)
corpus = df['processed'].tolist()

In [4]:
model = FastText(vector_size=100, window=3, min_count=1)  # Configurable parameters
model.build_vocab(corpus)
model.train(corpus, total_examples=len(corpus), epochs=2000)  # Adjust epochs as needed

(102803124, 116962000)

In [5]:
# Generate random words from the corpus for testing
random_words = np.random.choice([word for sublist in corpus for word in sublist], size=30, replace=False)

# Get similar and dissimilar words
results = {}
for word in random_words:
    similar = model.wv.most_similar(word, topn=10)
    dissimilar = model.wv.most_similar(negative=[word], topn=10)
    results[word] = {'similar': similar, 'dissimilar': dissimilar}

In [6]:
from huggingface_hub import hf_hub_download
import fasttext

pretrained_model = fasttext.load_model(hf_hub_download("facebook/fasttext-et-vectors", "model.bin"))

# Test with the same random words
pretrained_results = {}
for word in random_words:
    try:
        # Get similar words
        similar_pretrained = pretrained_model.get_nearest_neighbors(word, k=10)
        dissimilar_pretrained = pretrained_model.get_nearest_neighbors(word, k=len(pretrained_model.words))[-1:-11:-1]

        pretrained_results[word] = {
            'similar': [(neighbor[1], neighbor[0]) for neighbor in similar_pretrained],
            'dissimilar': [(neighbor[1], neighbor[0]) for neighbor in dissimilar_pretrained]
        }
    except KeyError:
        # This handles cases where the word is not in the model's vocabulary
        pretrained_results[word] = {'similar': 'Word not in pretrained model', 'dissimilar': 'Word not in pretrained model'}



In [7]:
results

{'place': {'similar': [('food', 0.629185140132904),
   ('service', 0.5960652828216553),
   ('sevice', 0.5614011883735657),
   ('great', 0.525833249092102),
   ('really', 0.5253211855888367),
   ('always', 0.5172836780548096),
   ('spot', 0.506657063961029),
   ('good', 0.4986167252063751),
   ('friendly', 0.4962637722492218),
   ('staff', 0.48061591386795044)],
  'dissimilar': [('pb', 0.40178194642066956),
   ('nfw', 0.38733232021331787),
   ('rodin', 0.38270050287246704),
   ('hwy', 0.36418068408966064),
   ('dd', 0.3550730347633362),
   ('j', 0.34447941184043884),
   ('hq', 0.34183189272880554),
   ('mt', 0.34010371565818787),
   ('kat', 0.3272410035133362),
   ('lpg', 0.3256540298461914)]},
 'server': {'similar': [('served', 0.5290517807006836),
   ('service', 0.5125400424003601),
   ('waiter', 0.5123371481895447),
   ('staff', 0.48701438307762146),
   ('ver', 0.48140811920166016),
   ('servant', 0.4524403214454651),
   ('servicio', 0.4290315806865692),
   ('place', 0.42364129424095

In [8]:
pretrained_results

{'place': {'similar': [('Anyplace', 0.7997679710388184),
   ('Replace', 0.782713770866394),
   ('Surplace', 0.7820640802383423),
   ('workplace', 0.7792938351631165),
   ('showplace', 0.7744765281677246),
   ('3rdplace', 0.7685235738754272),
   ('placed', 0.7530255913734436),
   ('.replace', 0.749077320098877),
   ('o.replace', 0.7484595775604248),
   ('replace', 0.743986189365387)],
  'dissimilar': [('D-22', -0.4109508693218231),
   ('a-Ha', -0.393137127161026),
   ('UAs', -0.38015085458755493),
   ('gadf', -0.3758648931980133),
   ('Giba', -0.3662368357181549),
   ('pop6', -0.3641536235809326),
   ('李勣', -0.36254453659057617),
   ('Tiv', -0.3616906702518463),
   ('5748', -0.35804688930511475),
   ('丹陽郡', -0.3558458685874939)]},
 'server': {'similar': [('X-server', 0.8163670897483826),
   ('DNS-server', 0.7785198092460632),
   ('DHCP-server', 0.77684086561203),
   ('Mapserver', 0.7720630168914795),
   ('Labaserver', 0.7535282373428345),
   ('webserver', 0.7394537329673767),
   ('laise