In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import time
import multiprocessing as mp

In [2]:
# Preprocessing steps
# Fetch the NLTK resources this notebook relies on: the stop-word lists
# (read into `stop_words` below), the Punkt tokenizer data, and WordNet
# (backing WordNetLemmatizer). Each call prints its download status.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the data
# The TSV carries no header row, so the four column names are supplied here;
# only the 'sentence' column is kept for the rest of the notebook.
df = pd.read_csv(
    "in_domain_train.tsv",
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)[['sentence']]

# Sample the data
# A fixed random_state keeps the drawn subset identical between runs; the
# index is renumbered from 0 so later positional access is straightforward.
sample_size = 100
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [4]:
# Shared NLP resources: a WordNet lemmatizer and the English stop-word set.
# NOTE(review): neither name is referenced by any other cell visible in this
# notebook (preprocess_text neither lemmatizes nor removes stop words) —
# confirm whether they were meant to be applied during preprocessing.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [5]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a sentence for matching and vectorization.

    The text is lower-cased and every character that is neither alphanumeric
    nor whitespace is removed. Removed characters are not replaced by a
    space, so "test-case" becomes "testcase". Whitespace is kept as-is.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    lowered = text.lower()
    kept_chars = filter(lambda ch: ch.isalnum() or ch.isspace(), lowered)
    return ''.join(kept_chars)

In [6]:
# Clean every sampled sentence; the column is overwritten with the normalized text.
df['sentence'] = df['sentence'].map(preprocess_text)

In [7]:
# Split data into train, validation, and test sets
# 30% of the sentences are held out first, then that hold-out is cut in half,
# which gives a 70/15/15 train/validation/test split. Both calls use a fixed
# random_state so the partition is the same on every run.
X_train, X_temp = train_test_split(df['sentence'], test_size=0.3, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

In [8]:
# Report how many sentences ended up in each split.
for split_name, split_data in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"Shape of {split_name}:", split_data.shape)

Shape of X_train: (70,)
Shape of X_val: (15,)
Shape of X_test: (15,)


In [9]:
# TF-IDF Vectorizer
# The vectorizer is fitted on the training split only and then applied
# unchanged to the validation and test splits.
# NOTE(review): with max_features=10 and no stop-word filtering, the output
# recorded further down lists the vocabulary as 'and', 'in', 'is', 'it', ... —
# almost entirely function words. Consider a stop_words= setting if content
# keywords are the goal.
tfidf_vectorizer = TfidfVectorizer(max_features=10)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
# Count Vectorizer
# Raw term counts with the same 10-feature cap as the TF-IDF vectorizer.
# Fitted on the training split only; validation and test are transformed with
# that fitted vocabulary. These count matrices are the input to LDA below.
count_vectorizer = CountVectorizer(max_features=10)
X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count = count_vectorizer.transform(X_val)
X_test_count = count_vectorizer.transform(X_test)

In [11]:
# LDA for topic modeling
# A 2-component LDA model is fitted on the training count matrix (seeded for
# repeatability); the fitted model is then applied to the validation and test
# count matrices without refitting.
lda = LatentDirichletAllocation(n_components=2, random_state=42)
X_train_lda = lda.fit_transform(X_train_count)
X_val_lda = lda.transform(X_val_count)
X_test_lda = lda.transform(X_test_count)

In [12]:
import multiprocessing as mp
import time

In [13]:
# Define a function to check if any of the search terms are in the article
def contains_search_term(article, search_terms):
    """Tell whether ``article`` contains at least one of ``search_terms``.

    Membership is tested with the ``in`` operator, so for string articles
    this is a case-sensitive substring match (``"mary"`` is found inside
    ``"summary"``).

    Args:
        article: The text to inspect.
        search_terms: Iterable of terms to look for.

    Returns:
        True as soon as one term is found, False if none is (including when
        ``search_terms`` is empty).
    """
    for candidate in search_terms:
        if candidate in article:
            return True
    return False


In [14]:
# Function to search for specific words in articles
def search_words(articles, search_terms):
    """Select the articles that mention at least one search term.

    Matching uses the ``in`` operator on each article, i.e. a case-sensitive
    substring test for string articles.

    Args:
        articles: Iterable of article texts.
        search_terms: Terms to look for in each article.

    Returns:
        A list of the matching articles, in their original order.
    """
    return [
        text
        for text in articles
        if any(candidate in text for candidate in search_terms)
    ]

In [15]:
# Example search terms
# The sentences were lower-cased during preprocessing and matching is
# case-sensitive, so the terms are lower-cased as well; a capitalised 'Mary'
# could never match any sentence.
search_terms = [term.lower() for term in ['Mary', 'beer']]

# Search for articles containing the search terms
search_results = search_words(df['sentence'], search_terms)

In [16]:
# How many sentences matched the search terms.
print(f"Number of search results: {len(search_results)}")

Number of search results: 1


In [17]:
# Extract hot keywords from search results using TF-IDF
# search_results_tfidf is the TF-IDF matrix of the matched sentences, produced
# by the vectorizer already fitted on the training split (no refit here).
# NOTE(review): hot_keywords is the vectorizer's complete feature list, not a
# subset ranked by the scores in search_results_tfidf — confirm this is the
# intended meaning of "hot keywords".
search_results_tfidf = tfidf_vectorizer.transform(search_results)
hot_keywords = tfidf_vectorizer.get_feature_names_out()

In [18]:
# Display results
# One print call; sep="\n" puts the heading and the list on separate lines.
print("Search Results:", search_results, sep="\n")

Search Results:
['i ordered if john drink his beer']


In [14]:
# Function to search for specific words in articles using a worker pool
def search_words_parallel(articles, search_terms):
    """Return the articles containing at least one search term, checked concurrently.

    A thread pool is used instead of a process pool. A process pool needs the
    worker function to be importable by the child processes, which is not the
    case for a function defined in a notebook cell when processes are
    spawned (the default on Windows); the call then never completes. Threads
    share the notebook's namespace, so ``contains_search_term`` can be called
    directly.

    Args:
        articles: Iterable of article texts. It is materialised into a list
            first, so one-shot iterators are handled correctly.
        search_terms: Terms to look for in each article.

    Returns:
        A list of the matching articles, in their original order. An empty
        input yields an empty list without starting a pool.
    """
    # Local import: `import multiprocessing` alone does not guarantee that the
    # `multiprocessing.pool` submodule is loaded.
    from multiprocessing.pool import ThreadPool

    # The input is walked twice (to build the tasks and to pair each article
    # with its flag); a generator would be exhausted after the first pass.
    articles = list(articles)
    if not articles:
        return []

    tasks = [(article, search_terms) for article in articles]
    with ThreadPool(mp.cpu_count()) as pool:
        matches = pool.starmap(contains_search_term, tasks)
    return [article for article, matched in zip(articles, matches) if matched]

In [15]:
# Example search terms
# Lower-case, to match the lower-cased sentences produced by preprocess_text.
search_terms = ['mary']

# Search for articles containing the search terms
# The call is bracketed by time.time() readings so the elapsed wall-clock
# time can be reported alongside the number of matches.
start_time = time.time()
search_results = search_words_parallel(df['sentence'], search_terms)
end_time = time.time()

print(f"Number of search results: {len(search_results)} (Time taken: {end_time - start_time:.2f}s)")

In [19]:
# Extract hot keywords from search results using TF-IDF
# search_results_tfidf is the TF-IDF matrix of the matched sentences, produced
# by the vectorizer already fitted on the training split (no refit here).
# NOTE(review): hot_keywords_tfidf is the vectorizer's complete feature list;
# it is not filtered or ranked by the scores in search_results_tfidf — confirm
# this is the intended meaning of "hot keywords".
search_results_tfidf = tfidf_vectorizer.transform(search_results)
hot_keywords_tfidf = tfidf_vectorizer.get_feature_names_out()

In [20]:
# Display results
# Each print call emits its arguments on separate lines via sep="\n".
print("Search Results:", search_results, sep="\n")

# Feature names first, then the dense TF-IDF matrix of the matched sentences
# (one row per sentence, one column per feature name).
print("\nHot Keywords (TF-IDF):", hot_keywords_tfidf, search_results_tfidf.toarray(), sep="\n")

Search Results:
['i ordered if john drink his beer']

Hot Keywords (TF-IDF):
['and' 'in' 'is' 'it' 'john' 'of' 'that' 'the' 'to' 'you']
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [21]:
# Example of using gensim for Word2Vec
# Sentences are already cleaned, so whitespace splitting is enough to tokenize.
tokenized_articles = [article.split() for article in df['sentence']]

# min_count=1 keeps every token in the vocabulary (the corpus is tiny).
# seed=42 matches the random_state used elsewhere in this notebook, and
# workers=1 removes the thread-scheduling jitter that makes multi-worker
# training non-repeatable; on a corpus this small there is no speed cost.
word2vec_model = Word2Vec(
    sentences=tokenized_articles,
    vector_size=100,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

In [22]:
# Extract hot keywords from search results using Word2Vec
def get_word2vec_keywords(model, search_results, top_n=10):
    """Return up to ``top_n`` distinct in-vocabulary words from the search results.

    Words are ranked by the L2 norm of their embedding, largest first, with
    ties broken alphabetically so the result is deterministic.

    The embedding itself cannot be the sort key: comparing two vectors yields
    an element-wise array, and ``sorted`` raises ``ValueError`` on its
    ambiguous truth value as soon as two or more words are compared.

    Args:
        model: Object exposing ``model.wv``, which must support
            ``word in model.wv`` and ``model.wv[word]`` (e.g. a trained
            gensim ``Word2Vec`` model).
        search_results: Iterable of article strings; each is split on whitespace.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` unique words; empty if nothing is in the
        model's vocabulary.
    """
    candidates = {
        word
        for article in search_results
        for word in article.split()
        if word in model.wv
    }
    ranked = sorted(
        candidates,
        # Negated norm gives descending order while the word itself breaks ties ascending.
        key=lambda word: (-float(np.linalg.norm(model.wv[word])), word),
    )
    return ranked[:top_n]

In [None]:
# Collect Word2Vec-based keywords from the current search results (default top_n).
hot_keywords_word2vec = get_word2vec_keywords(word2vec_model, search_results)

In [None]:
# Compare TF-IDF and Word2Vec
def compare_techniques(tfidf_keywords, word2vec_keywords, search_terms):
    """Print how closely each technique's keywords relate to the search terms.

    TF-IDF: the keywords and the search terms are each joined into one
    pseudo-document, vectorized with the notebook-level ``tfidf_vectorizer``,
    and compared by cosine similarity.

    Word2Vec: the mean pairwise similarity, under the notebook-level
    ``word2vec_model``, between every in-vocabulary keyword and every
    in-vocabulary search term. Out-of-vocabulary words are skipped rather
    than raising ``KeyError``; if no comparable pair remains, ``nan`` is
    reported.

    Args:
        tfidf_keywords: Iterable of keyword strings from the TF-IDF step.
        word2vec_keywords: Iterable of keyword strings from the Word2Vec step.
        search_terms: Iterable of search term strings.

    Returns:
        None. The results are printed.
    """
    keywords_vec = tfidf_vectorizer.transform([' '.join(tfidf_keywords)])
    terms_vec = tfidf_vectorizer.transform([' '.join(search_terms)])
    tfidf_similarity = cosine_similarity(keywords_vec, terms_vec)

    wv = word2vec_model.wv
    # Both sides must be in the vocabulary for wv.similarity to be defined.
    known_terms = [term for term in search_terms if term in wv]
    known_keywords = [word for word in word2vec_keywords if word in wv]
    pair_scores = [
        wv.similarity(word, term)
        for word in known_keywords
        for term in known_terms
    ]
    # Averaging an empty list would emit a RuntimeWarning; report nan explicitly.
    word2vec_similarity = np.mean(pair_scores) if pair_scores else float('nan')

    print("\nTF-IDF Hot Keywords:")
    print(tfidf_keywords)
    print("TF-IDF Similarity to Search Terms:", tfidf_similarity[0][0])

    print("\nWord2Vec Hot Keywords:")
    print(word2vec_keywords)
    print("Word2Vec Similarity to Search Terms:", word2vec_similarity)

# Run the comparison for the keywords and search terms computed above.
compare_techniques(hot_keywords_tfidf, hot_keywords_word2vec, search_terms)