In [1]:
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
import pickle
from collections import Counter
import re
import pandas as pd

# Most frequent words

In [2]:
spacy.load('en')
parser = English()

### 分词：“I am a pig" -> ["I", "am", "a", "pig"]
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

### 找词根
def get_lemma(word):
    # denied -> deny
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
### 没有意义的助词    
en_stop = set(nltk.corpus.stopwords.words('english'))

### 
def prepare_text_for_lda(text):
    tokens = nltk.word_tokenize(text) #分词#
    tokens = [token.lower() for token in tokens if token.isalpha()] #删除非单词并且全部小写#
    tokens = [token for token in tokens if token not in en_stop]  #删除无意义助词#
    tokens = [get_lemma(token) for token in tokens] #拿词根#
    return tokens

In [3]:
### 数词出现的次数
def get_frequency(file_name):
    with open(file_name, 'rb') as file:
        data = pickle.load(file)
    articles = data['articles']
    print(len(articles))
    frequency = Counter()
    # Counter(["a", "a", "b"]) -> {"a":2, "b":1}
    for url in articles:
        tokens = prepare_text_for_lda(articles[url][2]) ##读取url.content
        tokens = set(tokens) ##一个词文章出现很多次也只算一次，set(["a", "a", "b"]) -> ["a", "b"]
        frequency += Counter(tokens)
    return frequency

In [4]:
file_name1 = 'coindesk.data'
frequency1 = get_frequency(file_name1)

4168


In [5]:
file_name2 = 'bitcoinmagazine.data'
frequency2 = get_frequency(file_name2)

966


In [6]:
file_name3 = 'bitcoinnews.data'
frequency3 = get_frequency(file_name3)

4392


In [7]:
file_name4 = 'bitcoinist.data'
frequency4 = get_frequency(file_name4)

4600


In [8]:
file_name5 = 'blockonomi.data'
frequency5 = get_frequency(file_name5)

460


In [9]:
frequency_overall = frequency1 + frequency2 + frequency3 + frequency4 + frequency5

In [10]:
frequency_overall.most_common(300)

[('also', 10430),
 ('cryptocurrency', 10078),
 ('bitcoin', 8767),
 ('say', 8354),
 ('new', 8229),
 ('one', 7997),
 ('company', 7316),
 ('market', 7186),
 ('exchange', 7076),
 ('blockchain', 7028),
 ('use', 7000),
 ('make', 6874),
 ('time', 6694),
 ('crypto', 6625),
 ('accord', 6468),
 ('first', 6232),
 ('would', 5773),
 ('cryptocurrencies', 5760),
 ('platform', 5752),
 ('include', 5670),
 ('technology', 5477),
 ('like', 5445),
 ('last', 5411),
 ('take', 5409),
 ('report', 5216),
 ('year', 5195),
 ('however', 5108),
 ('world', 5051),
 ('financial', 5047),
 ('trading', 5027),
 ('many', 5020),
 ('state', 4963),
 ('may', 4931),
 ('user', 4928),
 ('digital', 4896),
 ('see', 4880),
 ('provide', 4858),
 ('could', 4853),
 ('add', 4819),
 ('come', 4750),
 ('million', 4705),
 ('read', 4698),
 ('currency', 4683),
 ('investor', 4632),
 ('business', 4571),
 ('price', 4549),
 ('well', 4513),
 ('announce', 4492),
 ('month', 4487),
 ('token', 4459),
 ('people', 4372),
 ('allow', 4341),
 ('create', 433

In [11]:
words = [item[0] for item in frequency_overall.most_common(300)]
freqs = [item[1] for item in frequency_overall.most_common(300)]

In [12]:
# Save data to a csv file
d = {'word': words, 'frequency': freqs}
df = pd.DataFrame(data=d)
csv_file_name = 'word_stat.csv'
df.to_csv(csv_file_name, index=False)

# Frequency of keyword candidates

In [13]:
### 读取keyword
file_name = 'keywords.txt'
freq_keyword = dict()
with open(file_name, 'rb') as file:
    for line in file:
        keyword = line.strip().decode('UTF-8').lower()
        freq_keyword[keyword] = 0

In [14]:
def get_frequency2(file_name):
    
    freq_keyword = dict()
    with open('keywords.txt', 'rb') as file:
        for line in file:
            keyword = line.strip().decode('UTF-8').lower()
            freq_keyword[keyword] = 0
            
    with open(file_name, 'rb') as file:
        data = pickle.load(file)
    articles = data['articles']
    print(len(articles))
    for url in articles:
        content = articles[url][2].lower()
        for keyword in freq_keyword:
            if re.search(keyword, articles[url][2]):
                freq_keyword[keyword] += 1
    return Counter(freq_keyword)

In [15]:
file_name1 = 'coindesk.data'
freq_keyword1 = get_frequency2(file_name1)

4168


In [16]:
file_name2 = 'bitcoinmagazine.data'
freq_keyword2 = get_frequency2(file_name2)

966


In [17]:
file_name3 = 'bitcoinnews.data'
freq_keyword3 = get_frequency2(file_name3)

4392


In [18]:
file_name4 = 'bitcoinist.data'
freq_keyword4 = get_frequency2(file_name4)

4600


In [19]:
file_name5 = 'blockonomi.data'
freq_keyword5 = get_frequency2(file_name5)

460


In [20]:
freq_keyword = freq_keyword1 + freq_keyword2 + freq_keyword3 + freq_keyword4 + freq_keyword5

In [21]:
### 有个小问题：second会和sec匹配，如果出现了second也算作出现了sec
freq_keyword.most_common()

[('crypto', 11933),
 ('sto', 10130),
 ('cryptocurrency', 9927),
 ('market', 7666),
 ('sec', 7385),
 ('blockchain', 6760),
 ('bitcoin', 6021),
 ('ether', 4872),
 ('trading', 4764),
 ('token', 4490),
 ('payment', 3181),
 ('community', 2835),
 ('security', 2784),
 ('wallet', 2455),
 ('decentralized', 2371),
 ('regulation', 2081),
 ('cryptocurrency exchange', 2055),
 ('usa', 1643),
 ('ethereum', 1385),
 ('hack', 1356),
 ('banking', 1311),
 ('bear', 1300),
 ('bull', 1260),
 ('smart contract', 1250),
 ('investing', 950),
 ('ico', 906),
 ('scam', 865),
 ('retail', 837),
 ('altcoin', 724),
 ('compliance', 711),
 ('hash', 701),
 ('nodes', 584),
 ('litecoin', 516),
 ('neo', 439),
 ('society', 367),
 ('supply chain', 339),
 ('bubble', 335),
 ('ripple', 278),
 ('decentralized exchange', 254),
 ('forks', 246),
 ('new cryptocurrency', 240),
 ('cryptocurrency wallet', 218),
 ('eos', 187),
 ('cryptocurrency miner', 162),
 ('reddit', 101),
 ('dapps', 100),
 ('cryptocurrency market cap', 84),
 ('doge', 

## 大写单词要单独处理

In [22]:
re.search('sec', 'second')

<_sre.SRE_Match object; span=(0, 3), match='sec'>

In [23]:
freq_capital = {
    'ICO': 0,
    'STO': 0,
    'SEC': 0,
    'NEO': 0,
    'EOS': 0,
    'IDEX': 0,
    'FCK': 0
}

In [24]:
file_name = 'coindesk.data'
with open(file_name, 'rb') as file:
    data = pickle.load(file)
articles = data['articles']

for url in articles:
    content = articles[url][2]
    for keyword in freq_capital:
        if re.search(keyword, articles[url][2]):
            freq_capital[keyword] += 1

In [25]:
file_name = 'bitcoinmagazine.data'
with open(file_name, 'rb') as file:
    data = pickle.load(file)
articles = data['articles']

for url in articles:
    content = articles[url][2]
    for keyword in freq_capital:
        if re.search(keyword, articles[url][2]):
            freq_capital[keyword] += 1

In [26]:
file_name = 'bitcoinnews.data'
with open(file_name, 'rb') as file:
    data = pickle.load(file)
articles = data['articles']

for url in articles:
    content = articles[url][2]
    for keyword in freq_capital:
        if re.search(keyword, articles[url][2]):
            freq_capital[keyword] += 1

In [27]:
file_name = 'bitcoinist.data'
with open(file_name, 'rb') as file:
    data = pickle.load(file)
articles = data['articles']

for url in articles:
    content = articles[url][2]
    for keyword in freq_capital:
        if re.search(keyword, articles[url][2]):
            freq_capital[keyword] += 1

In [29]:
file_name = 'blockonomi.data'
with open(file_name, 'rb') as file:
    data = pickle.load(file)
articles = data['articles']

for url in articles:
    content = articles[url][2]
    for keyword in freq_capital:
        if re.search(keyword, articles[url][2]):
            freq_capital[keyword] += 1

In [30]:
freq_keyword = dict(freq_keyword)

In [31]:
del freq_keyword['ico']
del freq_keyword['sto']
del freq_keyword['sec']
del freq_keyword['neo'] 
del freq_keyword['eos'] 
del freq_keyword['idex'] 

In [32]:
### 删除原先小写的ico，sto等，加入大写的ICO，STO。。。
freq_keyword = Counter(freq_keyword)
freq_keyword += freq_capital

In [33]:
words = [item[0] for item in freq_keyword.most_common()]
freqs = [item[1] for item in freq_keyword.most_common()]

In [34]:
# Save data to a csv file
d = {'word': words, 'frequency': freqs}
df = pd.DataFrame(data=d)
csv_file_name = 'keyword_stat.csv'
df.to_csv(csv_file_name, index=False)