# 1. Word Frequency

In [3]:
import csv
import re
import warnings
from collections import Counter

import gensim
import jieba
import pandas as pd

# Suppress DeprecationWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read the CSV file containing the text data into a DataFrame.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present.
df = pd.read_csv('/Users/Lynn/Downloads/red/merged.csv', encoding="utf-8-sig")

def sent_to_words_lower(sentences):
    """Yield a lower-cased, cleaned copy of every string in *sentences*.

    Cleaning steps, applied to each sentence in this order:

    1. lower-case the text;
    2. delete URLs (``http`` followed by non-whitespace characters);
    3. delete HTML tags (``<...>``);
    4. delete every whitespace-delimited token that contains ``@``
       (e-mail addresses, @mentions) plus one trailing whitespace character;
    5. collapse each run of whitespace, newlines included, into one space;
    6. replace every character that is neither a word character nor
       whitespace with a space.

    Step 6 runs after step 5, so the result may still contain consecutive
    or trailing spaces.

    Args:
        sentences: iterable of ``str``.

    Yields:
        str: the cleaned sentence, one per input item, in input order.
    """
    # Raw string literals: '\S', '\s' and '\w' are invalid escape sequences
    # in ordinary string literals and trigger a SyntaxWarning on Python 3.12+.
    # 'http\S+' already covers 'https...' links, so one alternative suffices.
    url_re = re.compile(r'http\S+')
    html_tag_re = re.compile(r'<[^>]+>')
    at_token_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')
    punctuation_re = re.compile(r'[^\w\s]')

    for sent in sentences:
        sent = sent.lower()
        sent = url_re.sub('', sent)
        sent = html_tag_re.sub('', sent)
        # Removes every '@' in the text, so no separate '@\w+' pass is needed
        # afterwards: it could never match.
        sent = at_token_re.sub('', sent)
        sent = whitespace_re.sub(' ', sent)
        sent = punctuation_re.sub(' ', sent)
        yield sent

# Take the 'note_desc' column as plain Python strings, then run every entry
# through sent_to_words_lower and collect the generator's output in a list.
raw_note_desc = df['note_desc'].astype(str).values.tolist()
note_desc_lower = list(sent_to_words_lower(raw_note_desc))

# define function to tokenize Chinese text using jieba, keeping only Chinese tokens
def tokenize(text):
    """Segment each document with jieba and keep only its Chinese tokens.

    No stop-word filtering is performed here; tokens are dropped only when
    they are not made up of Chinese characters (whitespace, digits, Latin
    letters and mixed tokens are all discarded).

    Args:
        text: iterable of ``str`` documents.

    Returns:
        list[list[str]]: one list of tokens per document, in input order.
        A document with no Chinese tokens yields an empty list.
    """
    # load user-defined dictionary for jieba (path is relative to the
    # current working directory)
    jieba.load_userdict("dict.txt")

    def is_chinese(word):
        """Return True when every character of *word* is in U+4E00..U+9FA5."""
        # Check character by character: comparing the whole token against
        # the range bounds would be a lexicographic string comparison that
        # is decided by the leading character alone.
        return bool(word) and all('\u4e00' <= ch <= '\u9fa5' for ch in word)

    def seg_depart(sentence):
        """Return the Chinese tokens jieba produces for *sentence*."""
        return [word for word in jieba.cut(sentence.strip()) if is_chinese(word)]

    return [seg_depart(doc) for doc in text]

# Apply tokenize to the cleaned note descriptions: one token list per note.
# NOTE(review): tokenize does not remove stop words, despite what earlier
# comments suggested; the printed frequencies still include words such as '的'.
note_desc_tokenized = tokenize(note_desc_lower)

# build bigram and trigram phrase models and apply them to the tokenized documents
def generate_ngrams(texts):
    """Merge frequently co-occurring tokens of *texts* into phrase tokens.

    A gensim ``Phrases`` model is trained on *texts*, and a second one on
    the output of the first, so merged pairs can be merged once more. Both
    models use ``min_count=10`` and ``threshold=10``. Merged tokens show up
    joined by an underscore in the results (e.g. ``女_玩家``).

    Args:
        texts: list of token lists. It is iterated several times, so it
            must be a re-iterable sequence rather than a one-shot generator.

    Returns:
        list: one transformed token list per input document, in input order.
    """
    phrase_settings = {'min_count': 10, 'threshold': 10}

    # Second-level model is trained on the bigram-transformed corpus.
    bigram_model = gensim.models.Phrases(texts, **phrase_settings)
    trigram_model = gensim.models.Phrases(bigram_model[texts], **phrase_settings)

    # Phraser wrappers used to apply the trained models.
    apply_bigrams = gensim.models.phrases.Phraser(bigram_model)
    apply_trigrams = gensim.models.phrases.Phraser(trigram_model)

    merged_docs = []
    for tokens in texts:
        merged_docs.append(apply_trigrams[apply_bigrams[tokens]])
    return merged_docs

# apply generate_ngrams function to note_desc_tokenized list to generate bigrams and trigrams
note_desc_ngram = generate_ngrams(note_desc_tokenized)

# count how often every term occurs across all documents
term_freq = Counter(term for doc in note_desc_ngram for term in doc)

# (term, count) pairs ordered from most to least frequent; terms with equal
# counts keep the order in which they were first seen. The list is only
# printed here; nothing is written to disk in this step.
sorted_term_freq = term_freq.most_common()
print(sorted_term_freq)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g3/whtrsvys0vg0n49n23hq5v180000gp/T/jieba.cache
Loading model cost 0.621 seconds.
Prefix dict has been built successfully.


[('话题', 7145), ('的', 5180), ('我', 4116), ('王者荣耀', 3407), ('了', 2124), ('就', 1257), ('王者', 1215), ('他', 1203), ('是', 1157), ('你', 922), ('说', 782), ('玩', 758), ('游戏', 742), ('和', 679), ('也', 677), ('在', 661), ('都', 587), ('一个', 562), ('打', 487), ('皮肤', 472), ('英雄', 463), ('自己', 442), ('辅助', 416), ('可以', 392), ('玩家', 384), ('有', 376), ('女_玩家', 374), ('然后', 373), ('真的', 371), ('就是', 367), ('这', 367), ('还', 352), ('人', 330), ('女生', 328), ('啊', 314), ('打野', 310), ('被', 309), ('吗', 305), ('这个', 302), ('队友', 300), ('她', 297), ('吧', 292), ('对面', 288), ('但是', 282), ('不', 280), ('我们', 280), ('男', 266), ('射手', 261), ('把', 259), ('没', 245), ('因为', 245), ('还是', 236), ('会', 235), ('什么', 227), ('很', 221), ('瑶', 219), ('的_时候', 218), ('都是', 213), ('又', 213), ('没有', 209), ('跟', 209), ('怎么', 207), ('看', 204), ('觉得', 202), ('给', 194), ('上', 194), ('他们', 193), ('话题_王者荣耀_上分', 189), ('去', 184), ('小乔', 184), ('法师', 181), ('直接', 181), ('不是', 177), ('要', 176), ('喜欢', 176), ('好', 175), ('技能', 175), ('一直', 174), 

In [4]:
# Write the frequency table to M_FREQ.csv with an empty 'label' column.
# Only terms that occur at least 10 times and are longer than one character
# are kept.
# The encoding is given explicitly: the terms are Chinese, and the platform's
# default encoding is not guaranteed to be able to represent them.
with open('M_FREQ.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['term', 'freq', 'label'])
    writer.writerows(
        [term, freq, '']  # blank label column
        for term, freq in sorted_term_freq
        if freq >= 10 and len(term) > 1
    )


In [15]:
import pandas as pd
import jieba
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import warnings

# Suppress DeprecationWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Register the user-defined word list with jieba before any segmentation
# (path is relative to the current working directory).
jieba.load_userdict("dict.txt")

# Load the csv file; its 'tokenz' column is the text processed below.
df = pd.read_csv('/Users/Lynn/Downloads/red/PP_merged.csv')

def sent_to_words_lower(sentences):
    """Yield a lower-cased, cleaned copy of every string in *sentences*.

    Cleaning steps, applied to each sentence in this order:

    1. lower-case the text;
    2. delete URLs (``http`` followed by non-whitespace characters);
    3. delete HTML tags (``<...>``);
    4. delete every whitespace-delimited token that contains ``@``
       (e-mail addresses, @mentions) plus one trailing whitespace character;
    5. collapse each run of whitespace, newlines included, into one space;
    6. replace every character that is neither a word character nor
       whitespace with a space.

    Step 6 runs after step 5, so the result may still contain consecutive
    or trailing spaces.

    Args:
        sentences: iterable of ``str``.

    Yields:
        str: the cleaned sentence, one per input item, in input order.
    """
    # This cell's import list does not include `re`; importing it here keeps
    # the function usable when the cell is run in a fresh kernel.
    import re

    # Raw string literals: '\S', '\s' and '\w' are invalid escape sequences
    # in ordinary string literals and trigger a SyntaxWarning on Python 3.12+.
    # 'http\S+' already covers 'https...' links, so one alternative suffices.
    url_re = re.compile(r'http\S+')
    html_tag_re = re.compile(r'<[^>]+>')
    at_token_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')
    punctuation_re = re.compile(r'[^\w\s]')

    for sent in sentences:
        sent = sent.lower()
        sent = url_re.sub('', sent)
        sent = html_tag_re.sub('', sent)
        # Removes every '@' in the text, so no separate '@\w+' pass is needed
        # afterwards: it could never match.
        sent = at_token_re.sub('', sent)
        sent = whitespace_re.sub(' ', sent)
        sent = punctuation_re.sub(' ', sent)
        yield sent
        
# Define a function to remove stopwords and perform word segmentation using Jieba
_STOPWORDS_CACHE = {}


def _load_stopwords(path='stopwords.txt'):
    """Return the stop words listed in *path* (one per line) as a set.

    The file is read the first time a given path is requested and the
    result is reused afterwards, so the per-sentence calls made by
    ``preprocess`` do not re-open it.
    """
    if path not in _STOPWORDS_CACHE:
        # `with` closes the file handle as soon as the words are read.
        with open(path, 'r', encoding='utf-8') as fh:
            _STOPWORDS_CACHE[path] = {line.strip() for line in fh}
    return _STOPWORDS_CACHE[path]


def preprocess(sentence):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: the text to segment.

    Returns:
        str: the remaining tokens joined by single spaces; an empty string
        when nothing remains.

    Raises:
        OSError: if ``stopwords.txt`` cannot be opened on first use.
    """
    stopwords = _load_stopwords()
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        # skip stop words and whitespace-only tokens
        if word not in stopwords and word.strip()
    ]
    return ' '.join(kept).strip()

# Preprocess the messages in the dataframe
# Missing values are replaced by empty strings first; astype(str) alone would
# turn them into the literal text 'nan', which would then be treated as a word.
df['tokenz'] = df['tokenz'].fillna('').astype(str)
df['content1'] = list(map(preprocess, sent_to_words_lower(df['tokenz'].tolist())))

# Tokenize the preprocessed messages and build bigram and trigram models
# split() without a separator returns [] for an empty message, whereas
# split(' ') returns [''] and would feed an empty-string token to the models.
result_fenci = [sent.split() for sent in df['content1'].tolist()]
bigram = Phrases(result_fenci, min_count=10, threshold=10)
# The trigram model is trained on the bigram-transformed corpus.
trigram = Phrases(bigram[result_fenci], min_count=10, threshold=10)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)
ngram_fenci = [trigram_mod[bigram_mod[doc]] for doc in result_fenci]

# Train a Word2Vec model on the tokenized messages
# (100-dimensional vectors, context window of 5, words seen fewer than
# 5 times are ignored, 4 worker threads)
model = Word2Vec(ngram_fenci, vector_size=100, window=5, min_count=5, workers=4)


In [25]:
# Keyword whose nearest neighbours are looked up in the trained model
keyword = '瑶妹'
word_vectors = model.wv
similar_words = word_vectors.most_similar(keyword, topn=1000)

# For every neighbour, look up its corpus count and its similarity to the
# keyword, print both, and collect [word, frequency, similarity] rows in
# `output`. Nothing is written to disk in this cell.
# NOTE(review): `score` from most_similar is not used; the similarity is
# computed again for each word via similarity().
output = []
for word, score in similar_words:
    frequency = word_vectors.get_vecattr(word, "count")
    similarity = word_vectors.similarity(keyword, word)
    print(f"{word}: {similarity}, frequency: {frequency}")
    output.append([word, frequency, similarity])


荣耀: 0.9994925260543823, frequency: 118
不能: 0.9994712471961975, frequency: 124
女_玩家: 0.9994580149650574, frequency: 438
皮肤: 0.9994460940361023, frequency: 490
不会: 0.9994437098503113, frequency: 120
这是: 0.9994397163391113, frequency: 60
都是: 0.9994369745254517, frequency: 233
峡谷: 0.9994359016418457, frequency: 136
段位: 0.9994359016418457, frequency: 91
玩家: 0.9994328022003174, frequency: 489
实力: 0.9994248747825623, frequency: 62
需要: 0.9994230270385742, frequency: 52
不想: 0.9994224309921265, frequency: 71
传说: 0.9994185566902161, frequency: 43
朋友: 0.9994171261787415, frequency: 163
女孩子: 0.9994118809700012, frequency: 230
一下: 0.9994110465049744, frequency: 154
英雄: 0.9994086623191833, frequency: 602
都有: 0.9994077086448669, frequency: 46
单排: 0.999400794506073, frequency: 73
主页: 0.9993976950645447, frequency: 67
的是: 0.9993942379951477, frequency: 75
有点: 0.9993941187858582, frequency: 94
不知道: 0.9993923306465149, frequency: 115
妹妹: 0.9993922114372253, frequency: 48
双排: 0.9993860125541687, frequency:

In [None]:
# Turn the collected [word, frequency, similarity] rows into a DataFrame and
# save it as F.csv without the index column.
output_columns = ['word', 'frequency', 'similarity']
output_df = pd.DataFrame(data=output, columns=output_columns)
output_df.to_csv(path_or_buf='F.csv', index=False)