In [46]:
import json
import glob
import os
import datetime
import re
import pandas as pd
from slack_export import SlackExport, normalize_links
import subprocess

In [2]:
data_folder = './ODS_dump_Mar_10_2017'
ods = SlackExport(data_folder)

In [3]:
df_msg = pd.DataFrame.from_records(ods.messages)
df_msg.fillna(0).groupby(['type', 'subtype'])['dt'].count()

type     subtype          
message  0                    301495
         bot_add                  11
         bot_message            2582
         bot_remove                6
         channel_archive          27
         channel_join          61056
         channel_leave          2564
         channel_name             39
         channel_purpose          83
         channel_topic           141
         channel_unarchive         1
         file_comment            396
         file_mention             53
         file_share             3189
         me_message                8
         pinned_item             380
         reminder_add              8
         reply_broadcast         184
         sh_room_created           1
         tombstone                33
Name: dt, dtype: int64

In [4]:
df_msg.subtype.unique()

array([u'channel_join', nan, u'channel_leave', u'file_share',
       u'channel_topic', u'file_comment', u'pinned_item',
       u'reply_broadcast', u'file_mention', u'channel_purpose',
       u'channel_name', u'bot_add', u'bot_remove', u'me_message',
       u'channel_archive', u'bot_message', u'reminder_add',
       u'sh_room_created', u'tombstone', u'channel_unarchive'], dtype=object)

In [5]:
# %load slack_export.py
import json
import glob
import os
import datetime
import re


def _read_json_dict(filename, key='id'):
    with open(filename) as fin:
        records = json.load(fin)
        json_dict = {
            record[key]: record
            for record in records
        }
    return json_dict


class SlackExport(object):
    def __init__(self, export_path):
        self.load_export(export_path)

    def load_export(self, export_path):
        self.channels = _read_json_dict(os.path.join(export_path, 'channels.json'))
        self.users = _read_json_dict(os.path.join(export_path, 'users.json'))
        self.messages = []
        for channel_id, channel in self.channels.iteritems():
            messages_glob = os.path.join(export_path, channel['name'], '*.json')
            for messages_filename in glob.glob(messages_glob):
                with open(messages_filename) as f_messages:
                    for record in json.load(f_messages):
                        if 'ts' in record:
                            record['ts'] = float(record['ts'])
                            record['dt'] = datetime.datetime.fromtimestamp(record['ts'])
                        record['channel'] = channel_id
                        self.messages.append(record)

re_slack_link = re.compile(r'(?P<all><(?P<id>[^\|]*)(\|(?P<title>[^>]*))?>)')

def _extract_slack_link_id(m):
    return m.group('id')

def normalize_links(text):
    return re_slack_link.sub(_extract_slack_link_id, text)

In [6]:
class WelcomeExport(object):
    def __init__(self, export_path):
        self.load_export(export_path)

    def load_export(self, export_path):
        self.users = _read_json_dict(os.path.join(export_path, 'users.json'))
        self.messages = []
        messages_glob = os.path.join(export_path, 'welcome', '*.json')
        for messages_filename in glob.glob(messages_glob):
            with open(messages_filename) as f_messages:
                for record in json.load(f_messages):
                    if 'ts' in record:
                        record['ts'] = float(record['ts'])
                        record['dt'] = datetime.datetime.fromtimestamp(record['ts'])
                    record['channel'] = 'welcome'
                    self.messages.append(record)


In [7]:
ods = WelcomeExport(data_folder)

In [8]:
df_msg = pd.DataFrame.from_records(ods.messages)
df_msg.fillna(0).groupby(['type', 'subtype'])['dt'].count()

type     subtype        
message  0                  3214
         channel_join       2987
         channel_leave        95
         channel_purpose       1
         channel_topic         1
         file_share            4
         reply_broadcast       1
         tombstone             2
Name: dt, dtype: int64

In [9]:
print df_msg[df_msg.subtype.isnull()].text[4]

Всем привет. Меня зовут Артём, я закончил Иркутский политех по специальности радиотехника. Сейчас я аспирант, занимаюсь обработкой сигналов атмосферных радаров. Недавно заинтересовался машинным обучением, прошел курс Andrew Ng, сейчас прохожу курс от Яндекса и МФТИ на курсере. Поражает количество практических задач, которые позволяет решать data science, поэтому я здесь.


Вытащим сообщения-представления

In [10]:
k = 0
hi_messages = []
hi_tokens=(u'все', u'привет',u'добр',u'шалом',u'салют',u'здрав',u'хай',u'я',u'ребят'u'коллег')
for _, i in df_msg[df_msg.subtype.isnull()].iterrows():
    if i.text.lower().startswith(hi_tokens) and len(i.text) > 100:
        k+=1
        hi_messages.append((i.username, i.text))

In [1]:
for i in hi_messages:
    print i[1]
    print '='*80

In [12]:
k, len(df_msg[df_msg.subtype.isnull()].text)

(612, 3214)

файлы для томиты

In [83]:
def create_messages(count):
    if not os.path.exists('for_tomita/messages'):
        os.makedirs('for_tomita/messages')
    for n, i in zip(xrange(len(hi_messages)), hi_messages):
        if n < count:
            f = open('for_tomita/messages/mes_'+str(n), 'w')
            f.write(hi_messages[n][1].encode("utf-8"))
            f.close()

In [85]:
create_messages(10)

запуск томита парсера

In [86]:
tomita_path = '/tmp/tomita-parser/build/bin/tomita-parser '
bash_command = 'cd for_tomita/ && '+ tomita_path + 'config.proto'

In [87]:
subprocess.Popen(bash_command, shell=True)

<subprocess.Popen at 0x7f09f99c2550>

факты в файле facts.xml

Перевод xml в dataframe

In [142]:
#TODO перевод в dataframe

In [95]:
import xml.etree.ElementTree as ET
from lxml import etree
import pandas as pd

xml_data = 'for_tomita/facts.xml'


In [140]:
def xml2df(xml_data):
    tree = ET.parse(xml_data)
    root = tree.getroot()
    print root
    all_records = []
    headers = []
    for i, child in enumerate(root):
        print child
        record = []
        for subchild in child:
            if subchild.tag == 'facts':
                for fact in subchild.tag:
                    print fact
                    record.append(fact.val)
                    if fact not in headers:
                        headers.append(fact)
        all_records.append(record)
    return pd.DataFrame(all_records, columns=headers)

In [141]:
df = xml2df(xml_data)

<Element 'fdo_objects' at 0x7f0a1d5273d0>
<Element 'document' at 0x7f0a0d1c23d0>


IndexError: string index out of range

In [107]:
df

Unnamed: 0,facts,Leads
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


In [34]:
from lxml import objectify
import pandas as pd

In [36]:
path = 'for_tomita/facts.xml'
xml = objectify.parse(open(path))
root = xml.getroot()
root.getchildren()[0].getchildren()
df = pd.DataFrame(columns=('id', 'name'))

for i in range(0,3):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['id', 'name'], [obj[0].text, obj[1].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)

In [37]:
df

Unnamed: 0,id,name
0,,
1,,
2,,
