In [1]:
import re
import operator
from collections import Counter
from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '16g')
SparkContext.setSystemProperty('spark.executor.extraJavaOptions', '-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps')
sc = SparkContext("local[*]", 'dcard')

In [2]:
from pyspark.sql import SparkSession
import sys
my_spark = SparkSession \
    .builder \
    .appName("dcard") \
    .config("spark.mongodb.input.uri", "mongodb://192.168.2.12:27017/dcard.talk_posts") \
    .config("spark.mongodb.output.uri", "mongodb://192.168.2.12:27017/dcard.talk_posts") \
    .getOrCreate()

In [3]:
df = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [4]:
from pyspark.accumulators import AccumulatorParam

In [5]:
df.printSchema()

root
 |-- _id: integer (nullable = true)
 |-- anonymousDepartment: boolean (nullable = true)
 |-- anonymousSchool: boolean (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- content: string (nullable = true)
 |-- createdAt: string (nullable = true)
 |-- department: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- forumAlias: string (nullable = true)
 |-- forumId: string (nullable = true)
 |-- forumName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hidden: boolean (nullable = true)
 |-- hiddenByAuthor: boolean (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- media: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |-- pinned: boolean (nullable = true)
 |-- replyId: integer (nullable = true)
 |-- replyTitle: string (nullable = true)
 |-- reportReason: string (nullable = true)
 |-- school: string (nullable = true)
 |-- tags: array (nullable = true

In [6]:
df.groupBy("school").count().show()

+-----------+-----+
|     school|count|
+-----------+-----+
|       新堡大學|    9|
| 國立高雄海洋科技大學|   34|
|        屏科大|  436|
|      亞當森大學|    1|
|       格魯斯特|    1|
| 國立臺南護理專科學校|   21|
|     香港浸會大學|    1|
|    love♏的♎|    1|
|         七七|    1|
|       森野小街|    1|
|   Ladycaca|    1|
|       伊比爾喬|    1|
|      理論型情聖|    1|
|        OwO|    1|
|         小隻|    1|
|💸挖壕溝的女子嘟🔨|    1|
|         米米|    1|
|     142小女紙|    2|
|        紅心K|    1|
|       新鮮な肝|    1|
+-----------+-----+
only showing top 20 rows



In [7]:
df.take(5)

[Row(_id=6150, anonymousDepartment=False, anonymousSchool=False, commentCount=19, content='好酷噢可以按讚XD', createdAt='2014-04-10T08:16:24.673Z', department='資訊工程學系', excerpt='好酷噢可以按讚XD', forumAlias='talk', forumId='255fd275-fec2-49d2-8e46-2e1557ffaeb0', forumName='閒聊', gender='M', hidden=False, hiddenByAuthor=False, likeCount=82, media=[], pinned=False, replyId=None, replyTitle='null', reportReason='', school='淡江大學', tags=[], title='新功能耶！', updatedAt='2014-04-10T08:16:24.673Z', withNickname=None),
 Row(_id=6151, anonymousDepartment=False, anonymousSchool=False, commentCount=60, content='大家快來給點建議吧 : )', createdAt='2014-04-10T09:15:11.945Z', department=' ', excerpt='大家快來給點建議吧 : )', forumAlias='talk', forumId='255fd275-fec2-49d2-8e46-2e1557ffaeb0', forumName='閒聊', gender='D', hidden=False, hiddenByAuthor=False, likeCount=18, media=[], pinned=False, replyId=None, replyTitle='null', reportReason='', school='狄卡', tags=[], title='新版建議', updatedAt='2014-04-10T09:15:11.945Z', withNickname=None),
 R

In [8]:
content = df.select('content')
print(content.rdd.getNumPartitions())
content_rdd = content.rdd.repartition(16).cache()


3


In [11]:
content_rdd.getNumPartitions()


16

In [10]:
lineLengths = content_rdd.map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)
print(lineLengths)
print(totalLength)

PythonRDD[30] at RDD at PythonRDD.scala:48
79631


In [19]:
def remove_url(sentence):
    # remove url
    if 'http' in sentence:
        sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)

    # remove punctuation
    text_list = re.split('\W+', sentence)
    return list(filter(None, text_list))

def to_ngrams(unigrams, length):
    return Counter(zip(*[unigrams[i:] for i in range(length)]))

In [11]:
test = '🤔剛剛要回文時已經找不到了\n總之，內容就是跟姊姊一起刮卡然後中了205萬，於是發上來的照片我怎麼看都沒中😂。\n\n祝大家新年快樂，也希望大家在刮彩卷之類的東西能夠好好看清楚喔🙌，以免遇到以為中獎一家人很high，結果被網友說沒中的哭哭戲碼。\nhttps://i.imgur.com/Svbg4BF.jpg\n沒有截文章 截到圖片而已\n\n大家加油喔👊'

In [12]:
remove_url(test)

['剛剛要回文時已經找不到了',
 '總之',
 '內容就是跟姊姊一起刮卡然後中了205萬',
 '於是發上來的照片我怎麼看都沒中',
 '祝大家新年快樂',
 '也希望大家在刮彩卷之類的東西能夠好好看清楚喔',
 '以免遇到以為中獎一家人很high',
 '結果被網友說沒中的哭哭戲碼',
 '沒有截文章',
 '截到圖片而已',
 '大家加油喔']

In [13]:
result = content_rdd.map(lambda a: remove_url(a['content'])).collect()

In [14]:
result[:10]

[['呃', '忘了當初選的選項', '想要知道的話可以顯示嗎', '我猜線在是不行', '要改了系統才能吧'],
 ['有時候難得遇到認識的人其實很高興', '不過因為太久沒看到', '不知道要說什麼', '所以就快步離開了', '大家都有這樣的經驗嗎'],
 ['感覺淡江的同學好多阿', '抽到好多都是淡江的', '好友也都是'],
 ['學校即將舉行耶誕舞會',
  '我已經向六個女生邀請她們當我舞伴',
  '但是都被拒絕',
  '我很難過',
  '我無法再承受任何打擊',
  '我心理系同學對我說',
  '不要難過',
  '遇到任何困難',
  '可以隨時找她談談',
  '我很感動',
  '於是我問她',
  '你可以當我舞伴嗎',
  '她對我說',
  '你現在累積到7個了',
  '我還要去找第八個嗎'],
 ['摩立特集團聲請破產保護前',
  '公司內部的高薪顧問為何不用波特著名的',
  '競爭五力分析法',
  '來自救',
  '是因為策略無效',
  '還是因為世界已經改變',
  '但公司卻未能調適',
  '波特一九七九年在哈佛商業評論發表的文章中指出',
  '產業內的競爭情勢',
  '繫於五種基本力量',
  '新進者的威脅',
  '替代性產品的威脅',
  '供應商的議價能力',
  '顧客的議價能力及產業內部的競爭對手',
  '而由合力來決定產業最終的獲利潛力',
  '富比世雜誌指出',
  '他的理論植基於',
  '寡占',
  '理論',
  '企業的策略目標是為了擊敗對手',
  '而非對客戶提供加值',
  '企業要想持續獲得超額利潤',
  '必須避免競爭',
  '並找出結構性的障礙',
  '來保護既有的超額利潤',
  '富比世指出',
  '然而企業的競爭並非戰爭',
  '未必要對手失敗才能造就本身的成功',
  '就如同藝術表演一樣',
  '藝術家可以各自尋找自己的觀眾',
  '至於波特最重視的結構性障礙',
  '近廿年來由於全球化及網路風',
  '已經吹垮了大部分障礙',
  '市場改由客戶當家做主',
  '富比世強調',
  '今天企業已不再靠擊敗對手',
  '並以結構性障礙來保護自己免於競爭',
  '而必須不斷創新來為客戶提供加值',
  '找到新方法來討客戶歡心

In [15]:
# compute unigram and bigram count
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for post in result:
    for line in post:
        unigram_counter.update(line)
        bigram_counter.update(to_ngrams(line, 2))
        trigram_counter.update(to_ngrams(line, 3))

In [16]:
print(unigram_counter.most_common(10))
print(bigram_counter.most_common(10))
print(trigram_counter.most_common(10))


[('的', 543687), ('我', 347812), ('是', 329005), ('不', 252695), ('一', 239275), ('有', 237782), ('了', 172307), ('人', 143985), ('在', 137319), ('到', 124767)]
[(('沒', '有'), 43618), (('什', '麼'), 41772), (('可', '以'), 39754), (('一', '個'), 37991), (('自', '己'), 37321), (('知', '道'), 36272), (('大', '家'), 35983), (('覺', '得'), 34301), (('因', '為'), 33830), (('真', '的'), 32435)]
[(('不', '知', '道'), 18466), (('的', '時', '候'), 15026), (('有', '沒', '有'), 13611), (('_', '_', '_'), 13133), (('為', '什', '麼'), 9418), (('自', '己', '的'), 8194), (('哈', '哈', '哈'), 7108), (('我', '覺', '得'), 6325), (('是', '不', '是'), 6131), (('真', '的', '很'), 6026)]


In [18]:
def remove_url_and_punctuation(sentence):
    # remove url
    if 'http' in sentence:
        sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)

    # remove punctuation
    text_list = re.split('\W+', sentence)
    return list(filter(None, text_list))

def one_to_three_grams(line):
    print(line)
    result = (Counter(line), to_ngrams(line, 2), to_ngrams(line, 3))
    print(len(result[0]), len(result[1]), len(result[2]))
    return result
#     return (Counter(line), to_ngrams(line, 2), to_ngrams(line, 3))

def to_ngrams(unigrams, length):
    return Counter(zip(*[unigrams[i:] for i in range(length)]))

In [21]:
print(content_rdd.top(1))
remove_url_and_punctuation(content_rdd.top(1)[0]['content'])

[Row(content='🤔🤔\n聽說今年在屏東某地的潮X高中\n全國繁星第一 (110人)\n但只有46個人上國立\n難道這就是所謂有學校就讀的概念嗎?\n\n還有據說繁星進大學的 都蠻優秀的\n是這樣嗎？')]


['聽說今年在屏東某地的潮X高中',
 '全國繁星第一',
 '110人',
 '但只有46個人上國立',
 '難道這就是所謂有學校就讀的概念嗎',
 '還有據說繁星進大學的',
 '都蠻優秀的',
 '是這樣嗎']

In [22]:
lineLengths = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)

In [23]:
totalLength

15835832

In [119]:
import time
sec = int(round(time.time()))
print(sec)
sample_rdd = content_rdd.sample(False, 0.01, sec)

1491633105


In [43]:
result = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda line: one_to_three_grams(line)).reduce(lambda a, b: tuple(map(operator.add, a, b)))


In [45]:
result
result[0].most_common(10)

[('的', 4723),
 ('我', 3009),
 ('是', 2828),
 ('有', 2171),
 ('不', 2104),
 ('一', 2043),
 ('了', 1436),
 ('人', 1197),
 ('到', 1137),
 ('在', 1130)]

In [48]:
result_one_grams = content_rdd.flatMap(lambda s: remove_url_and_punctuation(s['content'])).map(lambda s: Counter(s)).reduce(lambda a,b: a + b)

In [73]:
result_one_grams.most_common(10)

[('的', 543687),
 ('我', 347812),
 ('是', 329005),
 ('不', 252695),
 ('一', 239275),
 ('有', 237782),
 ('了', 172307),
 ('人', 143985),
 ('在', 137319),
 ('到', 124767)]

In [134]:
list_data = content_rdd.flatMap(lambda a: remove_url(a['content'])).collect()
result_list = sc.parallelize(list_data)
result_list.top(10)

['𡘙尛', '𠨕雥𤬑𠌕𣣓𥴒𦏱𦓈𨷮𤳅𡬜', 'ﾟﾟ', 'ﾟﾟ', 'ﾟﾟ', 'ﾟﾟ', 'ﾟﾛﾟ', 'ﾟﾉД', 'ﾟﾉД', 'ﾟﾉ']

In [118]:
class CounterAccumulatorParam(AccumulatorParam):
    def zero(self, initialValue):
        return initialValue

    def addInPlace(self, v1, v2):
        v1 += v2
        return v1

# Then, create an Accumulator of this type:
one_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())
two_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())
three_gram_accum = sc.accumulator(Counter(), CounterAccumulatorParam())

In [128]:
def one_to_three_grams_accum(line):
    one_gram_accum.add(Counter(line))
    two_gram_accum.add(to_ngrams(line, 2))
    three_gram_accum.add(to_ngrams(line, 3))

In [None]:
result_list.foreach(lambda line: one_to_three_grams_accum(line))

In [132]:
one_gram_accum.value.most_common(10)

Counter({'不': 3,
         '久': 1,
         '了': 3,
         '人': 1,
         '什': 1,
         '以': 2,
         '候': 1,
         '其': 1,
         '初': 1,
         '到': 2,
         '可': 1,
         '吧': 1,
         '呃': 1,
         '嗎': 2,
         '因': 1,
         '在': 1,
         '大': 1,
         '太': 1,
         '家': 1,
         '實': 1,
         '就': 1,
         '很': 1,
         '得': 1,
         '忘': 1,
         '快': 1,
         '想': 1,
         '我': 1,
         '所': 1,
         '才': 1,
         '改': 1,
         '是': 1,
         '時': 1,
         '有': 2,
         '樣': 1,
         '步': 1,
         '沒': 1,
         '為': 1,
         '猜': 1,
         '當': 1,
         '的': 4,
         '看': 1,
         '知': 2,
         '示': 1,
         '系': 1,
         '統': 1,
         '經': 1,
         '線': 1,
         '能': 1,
         '興': 1,
         '行': 1,
         '要': 3,
         '話': 1,
         '認': 1,
         '說': 1,
         '識': 1,
         '這': 1,
         '遇': 1,
         '過': 1,
         '道': 

In [133]:
two_gram_accum.value.most_common(10)

Counter({('不', '知'): 1,
         ('不', '行'): 1,
         ('不', '過'): 1,
         ('久', '沒'): 1,
         ('了', '當'): 1,
         ('了', '系'): 1,
         ('人', '其'): 1,
         ('什', '麼'): 1,
         ('以', '就'): 1,
         ('以', '顯'): 1,
         ('候', '難'): 1,
         ('其', '實'): 1,
         ('初', '選'): 1,
         ('到', '認'): 1,
         ('可', '以'): 1,
         ('因', '為'): 1,
         ('在', '是'): 1,
         ('大', '家'): 1,
         ('太', '久'): 1,
         ('家', '都'): 1,
         ('實', '很'): 1,
         ('就', '快'): 1,
         ('很', '高'): 1,
         ('得', '遇'): 1,
         ('忘', '了'): 1,
         ('快', '步'): 1,
         ('想', '要'): 1,
         ('我', '猜'): 1,
         ('所', '以'): 1,
         ('才', '能'): 1,
         ('改', '了'): 1,
         ('是', '不'): 1,
         ('時', '候'): 1,
         ('有', '時'): 1,
         ('有', '這'): 1,
         ('樣', '的'): 1,
         ('步', '離'): 1,
         ('沒', '看'): 1,
         ('為', '太'): 1,
         ('猜', '線'): 1,
         ('當', '初'): 1,
         ('的', '

In [131]:
three_gram_accum.value.most_common(10)

Counter({('不', '知', '道'): 1,
         ('不', '過', '因'): 1,
         ('久', '沒', '看'): 1,
         ('了', '當', '初'): 1,
         ('了', '系', '統'): 1,
         ('人', '其', '實'): 1,
         ('以', '就', '快'): 1,
         ('以', '顯', '示'): 1,
         ('候', '難', '得'): 1,
         ('其', '實', '很'): 1,
         ('初', '選', '的'): 1,
         ('到', '認', '識'): 1,
         ('可', '以', '顯'): 1,
         ('因', '為', '太'): 1,
         ('在', '是', '不'): 1,
         ('大', '家', '都'): 1,
         ('太', '久', '沒'): 1,
         ('家', '都', '有'): 1,
         ('實', '很', '高'): 1,
         ('就', '快', '步'): 1,
         ('很', '高', '興'): 1,
         ('得', '遇', '到'): 1,
         ('忘', '了', '當'): 1,
         ('快', '步', '離'): 1,
         ('想', '要', '知'): 1,
         ('我', '猜', '線'): 1,
         ('所', '以', '就'): 1,
         ('才', '能', '吧'): 1,
         ('改', '了', '系'): 1,
         ('是', '不', '行'): 1,
         ('時', '候', '難'): 1,
         ('有', '時', '候'): 1,
         ('有', '這', '樣'): 1,
         ('樣', '的', '經'): 1,
         ('步',

 # Good-Turing Smoothing Language Model

In [20]:
V1 = content_rdd.flatMap(lambda s: remove_url(s['content'])).map(lambda s: len(s)).reduce(lambda a, b: a + b)
V2 = V1 ** 2
k = 10

In [21]:
from math import log10
# compute N1, N2, N3...
unigram_Nr = Counter(unigram_counter.values())
bigram_Nr = Counter(bigram_counter.values())
# compute N0
unigram_Nr[0] = V1 - len(unigram_counter)
bigram_Nr[0] = V2 - len(bigram_counter)
print(unigram_Nr[0], bigram_Nr[0])

15828297 250773574342249


In [22]:
# compute r
unigram_r = [(i+1) * unigram_Nr[i+1] / unigram_Nr[i] for i in range(k)]
bigram_r = [(i+1) * bigram_Nr[i+1] / bigram_Nr[i] for i in range(k)]
print(unigram_r)
print(bigram_r)

[6.848494187340558e-05, 1.0313653136531364, 1.6207513416815742, 3.2450331125827816, 3.693877551020408, 5.602209944751381, 5.053254437869822, 7.737704918032787, 8.694915254237289, 8.421052631578947]
[1.450778858794238e-09, 0.6462644681254587, 1.5686069359736647, 2.5184727260895734, 3.5188103973954834, 4.449608987773984, 5.516758255359176, 6.305923738928325, 7.4514663691424925, 8.501299451342767]


In [23]:
# compute normalize factor
# compute N
unigram_N = sum(unigram_counter.values())
bigram_N = sum(bigram_counter.values())
print(unigram_N, bigram_N)

15835832 14001451


In [24]:
# compute new probability sum
unigram_N_ = unigram_N + k * unigram_Nr[k]
bigram_N_ = bigram_N + k * bigram_Nr[k]
print(unigram_N_, bigram_N_)

15836792 14089771


In [25]:
# normalize factor: N/N’
unigram_norm_factor = unigram_N / unigram_N_
bigram_norm_factor = bigram_N / bigram_N_
print(unigram_norm_factor, bigram_norm_factor)

0.9999393816626498 0.9937316227495819


In [26]:
# Estimating P(w) and P(w’|w)
def prob_1word(unigram):
    count = unigram_counter[unigram]
    r = unigram_r[count] if count < k else count
    return log10(r / unigram_N_)
def prob_2words(text_front, text_rear):
    count = bigram_counter[text_front, text_rear]
    r = bigram_r[count] if count < k else count
    return log10(r / bigram_N_)
def prob_word_by_word(text_front, text_rear):
    return prob_2words(text_front, text_rear) - prob_1word(text_front)
def prob_words(words):
    return prob_1word(words[0]) + sum(prob_word_by_word(words[i-1], words[i]) for i in range(1, len(words)))
def prob_text(text):
    return prob_words(text.lower().split())

In [27]:
unicount_log = {k: log10(v) for k, v in unigram_counter.items()}
bicount_log = {k: log10(v) for k, v in bigram_counter.items()}
unigram_r_log = [log10(r) for r in unigram_r]
bigram_r_log = [log10(r) for r in bigram_r]
unigram_N_log = log10(unigram_N_)
bigram_N_log = log10(bigram_N_)


def prob_1word(unigram):
    count = unigram_counter[unigram]
    r = unigram_r_log[count] if count < k else unicount_log[unigram]
    return r - unigram_N_log
def prob_2words(text_front, text_rear):
    count = bigram_counter[text_front, text_rear]
    r = bigram_r_log[count] if count < k else bicount_log[text_front, text_rear]
    return r - bigram_N_log

In [28]:
print(prob_1word(u'清'))
print(prob_1word(u'華'))
print(prob_2words(u'清', u'華'))
print(prob_word_by_word(u'清',u'華'))

-3.368181373497382
-3.769753514982285
-5.2350900822270905
-1.8669087087297087


In [29]:
print(prob_1word(u'我'))
print(prob_2words(u'哈', u'哈'))


-1.658322651054391
-2.971194328527625


In [30]:
print(prob_text(u'哈 哈'))

-2.971194328527625


In [31]:
bigram_counter.most_common(10)

[(('沒', '有'), 43618),
 (('什', '麼'), 41772),
 (('可', '以'), 39754),
 (('一', '個'), 37991),
 (('自', '己'), 37321),
 (('知', '道'), 36272),
 (('大', '家'), 35983),
 (('覺', '得'), 34301),
 (('因', '為'), 33830),
 (('真', '的'), 32435)]

In [56]:
def partial_match(word, counter):
    new_counter = Counter()
    for key, value in counter.most_common():
        if all(k1 == k2 or k2 is None for k1, k2 in zip(key, word)):
            new_counter[key] = value
    return new_counter


In [61]:
one_match = partial_match((u'沒', None), bigram_counter)
two_match = partial_match((u'沒', u'有', None), trigram_counter)

In [60]:
one_match.most_common(10)

[(('沒', '有'), 43618),
 (('沒', '什'), 2213),
 (('沒', '想'), 2161),
 (('沒', '辦'), 2031),
 (('沒', '人'), 1717),
 (('沒', '看'), 1613),
 (('沒', '關'), 1554),
 (('沒', '事'), 1443),
 (('沒', '錯'), 1299),
 (('沒', '多'), 854)]

In [63]:
two_match.most_common(10)

[(('沒', '有', '人'), 4516),
 (('沒', '有', '什'), 1251),
 (('沒', '有', '推'), 818),
 (('沒', '有', '這'), 814),
 (('沒', '有', '很'), 787),
 (('沒', '有', '要'), 689),
 (('沒', '有', '一'), 687),
 (('沒', '有', '想'), 617),
 (('沒', '有', '任'), 484),
 (('沒', '有', '看'), 483)]