In [1]:
import re
import operator
from collections import Counter
from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '16g')
SparkContext.setSystemProperty('spark.executor.extraJavaOptions', '-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps')
sc = SparkContext("local[*]", 'dcard')

In [2]:
from pyspark.sql import SparkSession
import sys
my_spark = SparkSession \
    .builder \
    .appName("dcard") \
    .config("spark.mongodb.input.uri", "mongodb://192.168.2.12:27017/dcard.talk_posts") \
    .config("spark.mongodb.output.uri", "mongodb://192.168.2.12:27017/dcard.talk_posts") \
    .getOrCreate()

In [3]:
df = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [4]:
from pyspark.accumulators import AccumulatorParam

In [5]:
df.printSchema()

root
 |-- _id: integer (nullable = true)
 |-- anonymousDepartment: boolean (nullable = true)
 |-- anonymousSchool: boolean (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- content: string (nullable = true)
 |-- createdAt: string (nullable = true)
 |-- department: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- forumAlias: string (nullable = true)
 |-- forumId: string (nullable = true)
 |-- forumName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hidden: boolean (nullable = true)
 |-- hiddenByAuthor: boolean (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- media: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |-- pinned: boolean (nullable = true)
 |-- replyId: integer (nullable = true)
 |-- replyTitle: string (nullable = true)
 |-- reportReason: string (nullable = true)
 |-- school: string (nullable = true)
 |-- tags: array (nullable = true

In [6]:
df.groupBy("school").count().show()

+-----------+-----+
|     school|count|
+-----------+-----+
|       新堡大學|    9|
| 國立高雄海洋科技大學|   42|
|        屏科大|  446|
|      亞當森大學|    1|
|       格魯斯特|    1|
| 國立臺南護理專科學校|   22|
|     香港浸會大學|    1|
|    love♏的♎|    1|
|         七七|    1|
|   Ladycaca|    1|
|       伊比爾喬|    1|
|      理論型情聖|    1|
|        OwO|    1|
|💸挖壕溝的女子嘟🔨|    1|
|         米米|    1|
|     142小女紙|    2|
|        紅心K|    1|
|       新鮮な肝|    1|
|       米蘭大學|    2|
|  倫敦大學伯貝克學院|    1|
+-----------+-----+
only showing top 20 rows



In [7]:
df.take(5)

[Row(_id=6150, anonymousDepartment=False, anonymousSchool=False, commentCount=19, content='好酷噢可以按讚XD', createdAt='2014-04-10T08:16:24.673Z', department='資訊工程學系', excerpt='好酷噢可以按讚XD', forumAlias='talk', forumId='255fd275-fec2-49d2-8e46-2e1557ffaeb0', forumName='閒聊', gender='M', hidden=False, hiddenByAuthor=False, likeCount=82, media=[], pinned=False, replyId=None, replyTitle='null', reportReason='', school='淡江大學', tags=[], title='新功能耶！', updatedAt='2014-04-10T08:16:24.673Z', withNickname=None),
 Row(_id=6151, anonymousDepartment=False, anonymousSchool=False, commentCount=60, content='大家快來給點建議吧 : )', createdAt='2014-04-10T09:15:11.945Z', department=' ', excerpt='大家快來給點建議吧 : )', forumAlias='talk', forumId='255fd275-fec2-49d2-8e46-2e1557ffaeb0', forumName='閒聊', gender='D', hidden=False, hiddenByAuthor=False, likeCount=18, media=[], pinned=False, replyId=None, replyTitle='null', reportReason='', school='狄卡', tags=[], title='新版建議', updatedAt='2014-04-10T09:15:11.945Z', withNickname=None),
 R

In [8]:
content = df.select('content')
print(content.rdd.getNumPartitions())
content_rdd = content.rdd.repartition(16).cache()


3


In [9]:
content_rdd.getNumPartitions()


16

In [10]:
lineLengths = content_rdd.map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)
print(totalLength)

81525


In [11]:
def remove_url_and_punctuation(sentence):
    # remove url
    if 'http' in sentence:
        sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)

    # remove punctuation
    text_list = re.split('\W+', sentence)
    return list(filter(None, text_list))

def to_ngrams(unigrams, length):
    return Counter(zip(*[unigrams[i:] for i in range(length)]))

In [12]:
test = '🤔剛剛要回文時已經找不到了\n總之，內容就是跟姊姊一起刮卡然後中了205萬，於是發上來的照片我怎麼看都沒中😂。\n\n祝大家新年快樂，也希望大家在刮彩卷之類的東西能夠好好看清楚喔🙌，以免遇到以為中獎一家人很high，結果被網友說沒中的哭哭戲碼。\nhttps://i.imgur.com/Svbg4BF.jpg\n沒有截文章 截到圖片而已\n\n大家加油喔👊'

In [13]:
remove_url_and_punctuation(test)

['剛剛要回文時已經找不到了',
 '總之',
 '內容就是跟姊姊一起刮卡然後中了205萬',
 '於是發上來的照片我怎麼看都沒中',
 '祝大家新年快樂',
 '也希望大家在刮彩卷之類的東西能夠好好看清楚喔',
 '以免遇到以為中獎一家人很high',
 '結果被網友說沒中的哭哭戲碼',
 '沒有截文章',
 '截到圖片而已',
 '大家加油喔']

In [14]:
result = content_rdd.map(lambda a: remove_url_and_punctuation(a['content'])).collect()

In [15]:
result[:10]

[['呃', '忘了當初選的選項', '想要知道的話可以顯示嗎', '我猜線在是不行', '要改了系統才能吧'],
 ['有時候難得遇到認識的人其實很高興', '不過因為太久沒看到', '不知道要說什麼', '所以就快步離開了', '大家都有這樣的經驗嗎'],
 ['感覺淡江的同學好多阿', '抽到好多都是淡江的', '好友也都是'],
 ['學校即將舉行耶誕舞會',
  '我已經向六個女生邀請她們當我舞伴',
  '但是都被拒絕',
  '我很難過',
  '我無法再承受任何打擊',
  '我心理系同學對我說',
  '不要難過',
  '遇到任何困難',
  '可以隨時找她談談',
  '我很感動',
  '於是我問她',
  '你可以當我舞伴嗎',
  '她對我說',
  '你現在累積到7個了',
  '我還要去找第八個嗎'],
 ['摩立特集團聲請破產保護前',
  '公司內部的高薪顧問為何不用波特著名的',
  '競爭五力分析法',
  '來自救',
  '是因為策略無效',
  '還是因為世界已經改變',
  '但公司卻未能調適',
  '波特一九七九年在哈佛商業評論發表的文章中指出',
  '產業內的競爭情勢',
  '繫於五種基本力量',
  '新進者的威脅',
  '替代性產品的威脅',
  '供應商的議價能力',
  '顧客的議價能力及產業內部的競爭對手',
  '而由合力來決定產業最終的獲利潛力',
  '富比世雜誌指出',
  '他的理論植基於',
  '寡占',
  '理論',
  '企業的策略目標是為了擊敗對手',
  '而非對客戶提供加值',
  '企業要想持續獲得超額利潤',
  '必須避免競爭',
  '並找出結構性的障礙',
  '來保護既有的超額利潤',
  '富比世指出',
  '然而企業的競爭並非戰爭',
  '未必要對手失敗才能造就本身的成功',
  '就如同藝術表演一樣',
  '藝術家可以各自尋找自己的觀眾',
  '至於波特最重視的結構性障礙',
  '近廿年來由於全球化及網路風',
  '已經吹垮了大部分障礙',
  '市場改由客戶當家做主',
  '富比世強調',
  '今天企業已不再靠擊敗對手',
  '並以結構性障礙來保護自己免於競爭',
  '而必須不斷創新來為客戶提供加值',
  '找到新方法來討客戶歡心

In [16]:
# compute unigram and bigram count
import time

start = time.time()

unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

result = content_rdd.map(lambda a: remove_url_and_punctuation(a['content'])).collect()

for post in result:
    for line in post:
        unigram_counter.update(line)
        bigram_counter.update(to_ngrams(line, 2))
        trigram_counter.update(to_ngrams(line, 3))

end = time.time()
print(end - start)

86.7028558254242


In [17]:
print(unigram_counter.most_common(10))
print(bigram_counter.most_common(10))
print(trigram_counter.most_common(10))


[('的', 555099), ('我', 355158), ('是', 336459), ('不', 258394), ('一', 244268), ('有', 243204), ('了', 175677), ('人', 147487), ('在', 140291), ('到', 127480)]
[(('沒', '有'), 44665), (('什', '麼'), 42757), (('可', '以'), 40622), (('一', '個'), 38825), (('自', '己'), 38156), (('知', '道'), 37185), (('大', '家'), 36739), (('覺', '得'), 35108), (('因', '為'), 34636), (('真', '的'), 33222)]
[(('不', '知', '道'), 18889), (('的', '時', '候'), 15378), (('有', '沒', '有'), 13913), (('_', '_', '_'), 13161), (('為', '什', '麼'), 9674), (('自', '己', '的'), 8391), (('哈', '哈', '哈'), 7241), (('我', '覺', '得'), 6488), (('是', '不', '是'), 6298), (('真', '的', '很'), 6167)]


In [18]:
def one_to_three_grams(line):
    print(line)
    result = (Counter(line), to_ngrams(line, 2), to_ngrams(line, 3))
    print(len(result[0]), len(result[1]), len(result[2]))
    return result
#     return (Counter(line), to_ngrams(line, 2), to_ngrams(line, 3))

In [19]:
print(content_rdd.top(1))
remove_url_and_punctuation(content_rdd.top(1)[0]['content'])

[Row(content='🤔🤔\n聽說今年在屏東某地的潮X高中\n全國繁星第一 (110人)\n但只有46個人上國立\n難道這就是所謂有學校就讀的概念嗎?\n\n還有據說繁星進大學的 都蠻優秀的\n是這樣嗎？')]


['聽說今年在屏東某地的潮X高中',
 '全國繁星第一',
 '110人',
 '但只有46個人上國立',
 '難道這就是所謂有學校就讀的概念嗎',
 '還有據說繁星進大學的',
 '都蠻優秀的',
 '是這樣嗎']

In [20]:
lineLengths = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)
print(totalLength)

16168912


In [21]:
# import time
# sec = int(round(time.time()))
# print(sec)
# sample_rdd = content_rdd.sample(False, 0.01, sec)

In [22]:
# result = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda line: one_to_three_grams(line)).reduce(lambda a, b: tuple(map(operator.add, a, b)))

# result[0].most_common(10)

In [23]:
# result_one_grams = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda s: Counter(s)).reduce(lambda a,b: a + b)

# result_one_grams.most_common(10)

In [24]:
# # Utilizing spark Accumulator to calculate n-grams.
# list_data = content_rdd.flatMap(lambda a: remove_url(a['content'])).collect()
# result_list = sc.parallelize(list_data)
# result_list.top(10)

# class CounterAccumulatorParam(AccumulatorParam):
#     def zero(self, initialValue):
#         return initialValue

#     def addInPlace(self, v1, v2):
#         v1 += v2
#         return v1

# # Then, create an Accumulator of this type:
# one_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())
# two_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())
# three_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())

# def one_to_three_grams_accum(line):
#     one_gram_accum.add(Counter(line))
#     two_gram_accum.add(to_ngrams(line, 2))
#     three_gram_accum.add(to_ngrams(line, 3))

# result_list.foreach(lambda line: one_to_three_grams_accum(line))

# one_gram_accum.value.most_common(10)

# two_gram_accum.value.most_common(10)

# three_gram_accum.value.most_common(10)

 # Good-Turing Smoothing Language Model

In [25]:
V1 = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda s: len(s)).reduce(lambda a, b: a + b)
V2 = V1 ** 2
k = 10

In [26]:
from math import log10
# compute N1, N2, N3...
unigram_Nr = Counter(unigram_counter.values())
bigram_Nr = Counter(bigram_counter.values())
# compute N0
unigram_Nr[0] = V1 - len(unigram_counter)
bigram_Nr[0] = V2 - len(bigram_counter)
print(unigram_Nr[0], bigram_Nr[0])

16161360 261433714466032


In [27]:
# compute r
unigram_r = [(i+1) * unigram_Nr[i+1] / unigram_Nr[i] for i in range(k)]
bigram_r = [(i+1) * bigram_Nr[i+1] / bigram_Nr[i] for i in range(k)]
print(unigram_r)
print(bigram_r)

[6.732106703891256e-05, 1.0202205882352942, 1.6486486486486487, 3.2918032786885245, 3.545816733067729, 5.595505617977528, 5.271084337349397, 7.424, 8.53448275862069, 9.272727272727273]
[1.4023478216985474e-09, 0.6473060735746179, 1.5708928180147987, 2.515120789274621, 3.5128493908057745, 4.4934421451471875, 5.45295777388597, 6.3883094985324425, 7.3410495034018926, 8.641418983700863]


In [28]:
# compute normalize factor
# compute N
unigram_N = sum(unigram_counter.values())
bigram_N = sum(bigram_counter.values())
print(unigram_N, bigram_N)

16168912 14297453


In [29]:
# compute new probability sum
unigram_N_ = unigram_N + k * unigram_Nr[k]
bigram_N_ = bigram_N + k * bigram_Nr[k]
print(unigram_N_, bigram_N_)

16169932 14387583


In [30]:
# normalize factor: N/N’
unigram_norm_factor = unigram_N / unigram_N_
bigram_norm_factor = bigram_N / bigram_N_
print(unigram_norm_factor, bigram_norm_factor)

0.9999369199573629 0.9937355704568307


In [31]:
# Estimating P(w) and P(w’|w)
def prob_1word(unigram):
    count = unigram_counter[unigram]
    r = unigram_r[count] if count < k else count
    return log10(r / unigram_N_)
def prob_2words(text_front, text_rear):
    count = bigram_counter[text_front, text_rear]
    r = bigram_r[count] if count < k else count
    return log10(r / bigram_N_)
def prob_word_by_word(text_front, text_rear):
    return prob_2words(text_front, text_rear) - prob_1word(text_front)
def prob_words(words):
    return prob_1word(words[0]) + sum(prob_word_by_word(words[i-1], words[i]) for i in range(1, len(words)))
def prob_text(text):
    return prob_words(text.lower().split())

In [66]:
print(prob_1word('清'))
print(prob_2words('清','華'))
print(prob_word_by_word('我','很'))

-3.3656623830213768
-5.24417398964687
-1.8612119932022284


In [33]:
unicount_log = {k: log10(v) for k, v in unigram_counter.items()}
bicount_log = {k: log10(v) for k, v in bigram_counter.items()}
unigram_r_log = [log10(r) for r in unigram_r]
bigram_r_log = [log10(r) for r in bigram_r]
unigram_N_log = log10(unigram_N_)
bigram_N_log = log10(bigram_N_)


def prob_1word(unigram):
    count = unigram_counter[unigram]
    r = unigram_r_log[count] if count < k else unicount_log[unigram]
    return r - unigram_N_log
def prob_2words(text_front, text_rear):
    count = bigram_counter[text_front, text_rear]
    r = bigram_r_log[count] if count < k else bicount_log[text_front, text_rear]
    return r - bigram_N_log

In [67]:
print(prob_1word(u'清'))
print(prob_1word(u'華'))
print(prob_2words(u'清', u'華'))
print(prob_word_by_word(u'清',u'華'))

-3.3656623830213768
-3.7733416869432848
-5.24417398964687
-1.8785116066254934


In [76]:
import math
# N_unigram_corpus = math.log2(float(sum(unigram_counter.values())))
# N_bigram_corpus = math.log2(float(sum(bigram_counter.values())))
def pmi(words):
    word1 = words[0]
    word2 = words[1]
    # Good-Turing Estimation 將次數小於k的字做一些調整 目標不要讓沒出現過的字 機率為0
    count_word1 = unigram_counter[word1]
    r_word1 = unigram_r_log[count_word1] if count_word1 < k else unicount_log[word1]
    count_word2 = unigram_counter[word2]
    r_word2 = unigram_r_log[count_word2] if count_word2 < k else unicount_log[word2]
    count_word1_and_word2 = bigram_counter[(word1, word2)]
    r_word1_and_word2 = bigram_r_log[count_word1_and_word2] if count_word1_and_word2 < k else bicount_log[(word1, word2)]

    # mutual information algorithm
    prob_word1 = r_word1 - unigram_N_log
    prob_word2 = r_word2 -  unigram_N_log
    prob_word1_word2 = r_word1_and_word2 -  bigram_N_log
    return prob_word1_word2 - (prob_word1+prob_word2)

In [77]:
print(pmi((u'聰', u'思')))
print(pmi((u'很',u'開')))
print(pmi((u'開', u'心')))
print(pmi((u'吃', u'飯')))
print(pmi((u'我', u'弟')))

-8.291553511361858
0.7102270735757452
1.5498605236084844
2.577196006409019
0.6475047860715808


In [89]:
import operator

def word_segmentation(sentence):
    # input : 忘了當初選的選項
    words_list = to_words(sentence, 2)
    max_probability_dict = find_max_prob(to_prob_dict(words_list))
    return seperate_sentence(sentence, max_probability_dict)


def to_words(unigrams, length):
    return list(zip(*[unigrams[i:] for i in range(length)]))


def to_prob_dict(words_list):
#   input : [('忘', '了'), ('了', '當'), ('當', '初'), ('初', '選'), ('選', '的'), ('的', '選'), ('選', '項')]
    result_sentence = {}
    for word in words_list:
        result_sentence[word] = pmi(word)
    return result_sentence


def find_max_prob(probability_dict):
    threshold = 0.5
    sorted_prob = sorted(probability_dict.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_prob)  # Can see every candidate words prob.
    result_list = []
    for candidate in sorted_prob:
        prob = candidate[1]
        words_tuple = candidate[0]
        if prob > threshold:
            result_list.append(''.join(map(str, words_tuple)))  # ('選', '項') => 選項
    return result_list


def seperate_sentence(orginal_sentence, max_probability_dict):
    segment_word = orginal_sentence
    for candidate in max_probability_dict:
        insert_word = " "+candidate+" "
        segment_word = segment_word.replace(candidate, insert_word)
    return segment_word

In [90]:
test = word_segmentation(u'我已經向六個女生邀請她們當我舞伴')
print(test)

[(('邀', '請'), 2.7377469706616395), (('已', '經'), 2.616439494768189), (('舞', '伴'), 2.372406293195432), (('女', '生'), 1.9376363810795398), (('她', '們'), 1.1372047738830324), (('個', '女'), 0.9856825965816585), (('六', '個'), 0.9582880721092568), (('我', '已'), 0.6326799837011312), (('請', '她'), 0.4579079118711089), (('當', '我'), 0.41783700297367954), (('們', '當'), 0.23328336551896012), (('生', '邀'), -0.09065874418841524), (('向', '六'), -0.18860153138347435), (('經', '向'), -0.26773190071805963), (('我', '舞'), -0.996158227432633)]
我 已經 向 六個  女生  邀請  她們 當我 舞伴 


In [91]:
total_lines = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).collect()

In [93]:
for sentence in total_lines[40:50]:
    print(sentence)
    print(word_segmentation(sentence))
    print()

供應商的議價能力
[(('能', '力'), 1.6685670230815473), (('供', '應'), 1.5496545873803456), (('議', '價'), 0.7948979770876985), (('應', '商'), 0.5847347903222362), (('的', '議'), 0.05783033568889451), (('商', '的'), -0.009587630864118957), (('價', '能'), -0.8891515584214131)]
 供應 商的 議價  能力 

顧客的議價能力及產業內部的競爭對手
[(('競', '爭'), 3.6490206048028764), (('顧', '客'), 2.276043214517113), (('產', '業'), 2.1451360753349187), (('能', '力'), 1.6685670230815473), (('內', '部'), 1.4001566132006324), (('議', '價'), 0.7948979770876985), (('的', '競'), 0.6567804575461791), (('爭', '對'), 0.6426456929905022), (('及', '產'), 0.6111592116394231), (('力', '及'), 0.5450525776318056), (('對', '手'), 0.47596881563711513), (('部', '的'), 0.2987359486950636), (('業', '內'), 0.2658487676083485), (('的', '議'), 0.05783033568889451), (('客', '的'), -0.08960679760762069), (('價', '能'), -0.8891515584214131)]
 顧客 的 議價  能力 及 產業  內部 的 競爭 對手

而由合力來決定產業最終的獲利潛力
[(('決', '定'), 2.633020547529234), (('獲', '利'), 2.300566157151798), (('潛', '力'), 2.210778917179651), (('產', '業'), 2.1

In [49]:
def partial_match(word, counter):
    new_counter = Counter()
    for key, value in counter.most_common():
        if all(k1 == k2 or k2 is None for k1, k2 in zip(key, word)):
            new_counter[key] = value
    return new_counter


In [50]:
one_match = partial_match((u'沒', None), bigram_counter)
two_match = partial_match((u'沒', u'有', None), trigram_counter)

In [51]:
one_match.most_common(10)

[(('沒', '有'), 44665),
 (('沒', '什'), 2285),
 (('沒', '想'), 2201),
 (('沒', '辦'), 2084),
 (('沒', '人'), 1754),
 (('沒', '看'), 1645),
 (('沒', '關'), 1577),
 (('沒', '事'), 1472),
 (('沒', '錯'), 1344),
 (('沒', '多'), 864)]

In [52]:
two_match.most_common(10)

[(('沒', '有', '人'), 4607),
 (('沒', '有', '什'), 1272),
 (('沒', '有', '推'), 838),
 (('沒', '有', '這'), 832),
 (('沒', '有', '很'), 806),
 (('沒', '有', '要'), 713),
 (('沒', '有', '一'), 705),
 (('沒', '有', '想'), 627),
 (('沒', '有', '任'), 504),
 (('沒', '有', '看'), 494)]