## Vectorize words

In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
import spacy
from gensim.models import Word2Vec
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/caixinyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caixinyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
json_file_path = 'data/raw/450_all.json'

In [95]:
# combine the text together
with open(json_file_path, 'r') as file:
    json_data = json.load(file)
    subject_data = [entry['detail']['subject'] for entry in json_data]
    content_data = [entry['detail']['content'] for entry in json_data]
    post_data = [f"{subject} {content}" for subject, content in zip(subject_data, content_data)]
    answer_data = [answer['content'] for entry in json_data if entry['type'] == 'question' for answer in entry['answers']]
data = post_data + answer_data

# lowercase and tokenize the sentence
lower_sentence = [sentence.lower() for sentence in data]
tokenized_data = [
    [token for token in nltk.word_tokenize(sentence) if token.isalpha()]
    for sentence in lower_sentence
]
print(tokenized_data)



In [96]:
# Term-frequency vectorizer method with spacy stop-word list

en = spacy.load('en_core_web_sm')
spacy_stop_words = list(en.Defaults.stop_words)
vectorizer = TfidfVectorizer(stop_words=spacy_stop_words)
tfidf_matrix = vectorizer.fit_transform(lower_sentence)

print(lower_sentence)

# Get feature names (words) with high TF-IDF scores
feature_names = vectorizer.get_feature_names_out()

# Words to be removed based on TF-IDF threshold
tfidf_threshold = 0.5  # Adjust as needed
stop_words = set([feature_names[i] for i, score in enumerate(tfidf_matrix.sum(axis=0).tolist()[0]) if score < tfidf_threshold])
stop_words.update(spacy_stop_words)





In [5]:
custom_stop_words = {"need", "problem", "time", "like"}
stop_words.update(custom_stop_words)
print(stop_words)
print(len(stop_words))
filtered_data = [[word for word in sentence if word not in stop_words] for sentence in tokenized_data]


{'denote', 'mistakes', 'tool', 'capital', '184', 'introduced', 'folders4', 'coursecode', 'iperf3', 'with', 'clarifying', 'datagram', 'operate', 'nth', 'attempted', 'mostly', 'fact', 'difs', 'test_results', 'webex', 'realize', 'calls', '255', 'worth', '56355', 'emb', 'mix', 'possibly', 'essence', 'occurring', 'somebody', 'instruction', 'paper', 'recent', 'porfessor', 'x3', 'correspond', 'viazoomgagan', 'cwnd2', 'merry', 'increased', 'p24', 'programmatically', 'themselves', 'unale', 'dst', 'receving', '1k', 'had', 'bucket', 'instructors', 'bzero', 'seq', 'pacific', 'acked', '127', 'tricks', 'quit', 'htmlfollow', 'under', '2fkstgcobulb14t3', 'ai_flags', 'userguide', 'practices', 'grow', 'plan', 'area', 'stored', 'thetcp', 'unanswered', 'boxes', 'for', 'fixing', 'a8', 'discription', 'describe', 'decides', 'tcom370', 'pthreadsstudent', 'ensures', 'cvf', 'pick', 'cfvjwdrrnhc3cw9zz1pmevzxr0jfut09meeting', 'speed', '1t', 'hubs', 'europe', 'exponentially', 'clientandserver', 'fail_no_user', 'sn

In [101]:
# build the word2vec model
print(filtered_data)
print(len(filtered_data))
print([len(f) for f in filtered_data])
model = Word2Vec(filtered_data, vector_size=512, min_count=3)
model.save('word2vec_model_450.bin')

799
[62, 4, 16, 18, 16, 35, 63, 11, 22, 57, 11, 19, 53, 10, 12, 11, 12, 12, 20, 17, 9, 15, 23, 13, 12, 15, 16, 3, 12, 7, 21, 28, 12, 37, 10, 15, 30, 7, 12, 10, 12, 18, 6, 6, 4, 4, 16, 16, 12, 9, 16, 6, 11, 10, 12, 46, 36, 27, 2, 30, 25, 8, 17, 10, 11, 83, 10, 11, 18, 12, 13, 28, 13, 17, 2, 25, 2, 9, 15, 47, 36, 21, 20, 48, 14, 34, 34, 16, 8, 11, 12, 19, 13, 11, 90, 24, 8, 5, 29, 14, 45, 23, 12, 11, 9, 25, 19, 17, 35, 24, 22, 14, 11, 34, 11, 26, 17, 15, 72, 10, 19, 8, 14, 31, 30, 9, 14, 8, 15, 12, 17, 8, 14, 14, 45, 9, 13, 53, 25, 9, 64, 38, 107, 21, 18, 36, 46, 29, 29, 30, 28, 21, 10, 7, 5, 17, 21, 51, 19, 14, 19, 19, 25, 8, 23, 8, 22, 57, 61, 27, 10, 25, 22, 12, 19, 9, 9, 23, 22, 21, 8, 14, 18, 21, 28, 36, 10, 16, 10, 13, 29, 82, 19, 4, 32, 26, 24, 79, 29, 19, 14, 20, 18, 23, 8, 21, 16, 46, 7, 17, 10, 7, 16, 10, 12, 13, 28, 27, 23, 6, 21, 18, 19, 21, 10, 26, 13, 6, 23, 88, 12, 19, 29, 18, 20, 13, 34, 27, 5, 23, 44, 21, 57, 42, 11, 10, 20, 30, 49, 9, 26, 15, 8, 49, 26, 46, 9, 30, 24, 5

## Test vectorized model

In [99]:
def print_vector(model, word):
    word_vector = model.wv[word]
    print(f'Vector for {word}: ')
    print(word_vector)


def print_similar_words(model, word, num):
    similar_words = model.wv.most_similar(word, topn=num)
    print(f'Top {num} similar words to {word}: ')
    print(similar_words)


def print_similarity(model, word1, word2):
    cosine_similarity = model.wv.similarity(word1, word2)
    print(f'Similarity between {word1} and {word2}: ')
    print(cosine_similarity)


In [100]:
# Load the trained Word2Vec model
m = Word2Vec.load('word2vec_model_450.bin')

print_similarity(m, "tcp", "udp")
print_similarity(m, "student", "udp")
print_similar_words(m, 'udp', 10)
print_similar_words(m, 'network', 10)

Similarity between tcp and udp: 
0.9977243
Similarity between student and udp: 
0.98415184
Top 10 similar words to udp: 
[('client', 0.997833788394928), ('tcp', 0.9977242350578308), ('send', 0.9976084232330322), ('serverm', 0.9975574016571045), ('question', 0.9974934458732605), ('address', 0.9974839091300964), ('number', 0.9974711537361145), ('server', 0.9974404573440552), ('data', 0.9973480701446533), ('message', 0.9973389506340027)]
Top 10 similar words to network: 
[('server', 0.9979001879692078), ('tcp', 0.9978952407836914), ('client', 0.9978930354118347), ('use', 0.9978609681129456), ('question', 0.9977902770042419), ('address', 0.9977751970291138), ('case', 0.9975879192352295), ('course', 0.9975672364234924), ('send', 0.997564971446991), ('data', 0.997564971446991)]


## Clustering

In [16]:
# Load your trained Word2Vec model
model = Word2Vec.load("word2vec_model_450.bin")

# Get word vectors and corresponding words
word_vectors = model.wv.vectors
words = model.wv.index_to_key

print(len(words))

1018


In [8]:
# Number of clusters
# num_clusters = 10
num_clusters = len(words) // 10

In [9]:
# Apply k-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=10)
clusters = kmeans.fit_predict(word_vectors)

word_cluster_mapping = dict(zip(words, clusters))
cluster_sums = {cluster_num: np.zeros_like(word_vectors[0]) for cluster_num in range(num_clusters)}
cluster_counts = {cluster_num: 0 for cluster_num in range(num_clusters)}

In [10]:
# Accumulate the sum of vectors and count for each cluster
for word, cluster in word_cluster_mapping.items():
    cluster_sums[cluster] += model.wv[word]
    cluster_counts[cluster] += 1

# Calculate the average vector for each cluster
cluster_averages = {cluster_num: cluster_sum / cluster_counts[cluster_num] for cluster_num, cluster_sum in cluster_sums.items()}

In [24]:
# Write  each cluster into cluster.txt
file_name = "clusters.txt"
with open(file_name, "w") as file:
    for cluster_num in range(num_clusters):
        cluster_words = [word for word, cluster in word_cluster_mapping.items() if cluster == cluster_num]
        # file.write(f"Cluster {cluster_num + 1}: {cluster_averages[cluster_num]} \n{cluster_words}\n")
        file.write(f"Cluster {cluster_num}: {cluster_words}\n")

## Customized doc2vec

In [37]:
# Tokenize the paragraph into words
def tokenize_paragraph(paragraph):
    return [
        [token for token in nltk.word_tokenize(sentence.lower()) if token.isalpha()]
        for sentence in nltk.sent_tokenize(paragraph)
    ]

In [21]:
# Function to find the closest cluster for a word
def find_closest_cluster(word):
    try:
        word_vector = model.wv[word]
        return kmeans.predict(word_vector.reshape(1, -1))[0]
    except KeyError:
        return -1

In [39]:
# Turn the words into their closest cluster
def doc2cluster(token):
    token_list = []
    for sentence in token:
        sentence_list = []
        for word in sentence:
            cluster_assignment = find_closest_cluster(word)
            if cluster_assignment != -1:
                sentence_list.append(cluster_assignment)
        token_list.append(sentence_list)
    return token_list

In [42]:
# Wrap the above process into a function
def build_histogram(token_sentence, num_c):
    if len(token_sentence) == 0:
        return
    histogram = [0]*num_c
    for word_cluster in token_sentence:
        histogram[word_cluster] += 1
    histogram_normalize = [count / len(token_sentence) for count in histogram]
    return histogram_normalize

In [40]:
# test the sample paragraph
sample_paragraph = "number of UDP sockets allowed to set up at serverM. As I was programming the serverM, I did not notice the identical config and wrote a socket for each other server. However, question @373 suggested one UDP socket is sufficient for all the three servers. May I keep the three UDP socket format?"

tokenized_sample_paragraph = tokenize_paragraph(sample_paragraph)
sample_paragraph_cluster = doc2cluster(tokenized_sample_paragraph)
print(sample_paragraph_cluster)
for sample_sentence_cluster in sample_paragraph_cluster:
    print(build_histogram(sample_sentence_cluster,num_clusters))

[[6, 70, 5, 90, 79, 18], [49, 18, 61, 63, 3, 15], [89, 70, 3, 44], [70, 3, 99]]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0

In [74]:
with open(json_file_path, 'r') as file:
    json_data = json.load(file)
    subject_data = [entry['detail']['subject'] for entry in json_data if entry['type'] == 'question']
    content_data = [entry['detail']['content'] for entry in json_data if entry['type'] == 'question']
    question_data = [f"{subject}. {content}" for subject, content in zip(subject_data, content_data)]
print(len(question_data))

file_name = "histogram.txt"
question_list = []
with open(file_name, "w") as file:
    for i, question in enumerate(question_data):
        question_clusters = doc2cluster(tokenize_paragraph(question))
        for j, question_cluster in enumerate(question_clusters):
            if len(question_cluster) != 0:
                question_list.append({"histo":build_histogram(question_cluster,num_clusters), "loc_q":i, "loc_s": j})

    for q in question_list:
        file.write(f"histogram: {q['histo']}\nlocation: {q['loc_q']}-{q['loc_s']}\n\n")

370


In [89]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_sentence(sentence_list, input_histo, num):
    input_histogram_array = np.array([input_histo])
    similarities = cosine_similarity([input_histo], [s['histo'] for s in sentence_list])
    similarities_list = similarities[0].tolist()
    sorted_histograms = sorted(zip(sentence_list, similarities_list), key=lambda x: x[1], reverse=True)
    closest_histograms = sorted_histograms[:num]
    return [f"{c[0]['loc_s']} in {c[0]['loc_q']} "for c in closest_histograms]
    # return closest_histograms

In [90]:
sample_question = "number of UDP sockets allowed to set up at serverM"
tokenized_sample_question = tokenize_paragraph(sample_question)
sample_question_cluster = doc2cluster(tokenized_sample_question)
sample_question_histo = build_histogram(sample_question_cluster[0],num_clusters)
find_similar_sentence(question_list, sample_question_histo,2)

['0 in 257 ', '1 in 228 ']

In [92]:
print(tokenize_paragraph(question_data[257])[0])

['number', 'of', 'udp', 'sockets', 'allowed', 'to', 'set', 'upat', 'serverm']
