|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# call e.g. with: |
| 4 | +# bin/seperate.py data/dump/own/messages_*.txt.gz |
| 5 | +# to run this script on all messages |
| 6 | + |
| 7 | +import sys |
| 8 | +import json |
| 9 | +import gzip |
| 10 | + |
def _open_text(path, compressed, writing=False):
    """Open *path* as UTF-8 text, going through gzip when *compressed* is true.

    Output files keep the modes of the original script: gzip outputs are
    truncated ("wt") while plain outputs are appended to ("a").
    """
    if compressed:
        return gzip.open(path, mode="wt" if writing else "rt", encoding="utf-8")
    return open(path, "a" if writing else "r", encoding="utf-8")


def separate(filename):
    """Split a line-delimited JSON message dump into per-category files.

    Each non-blank line of *filename* is parsed as one JSON object. Objects
    without a 'text' key are ignored; the others are copied verbatim to:

    * ``<stem>.image``  - 'images' is a non-empty list
    * ``<stem>.conv1``  - no images, 'links_count' == 0, 'mentions_count' == 1
    * ``<stem>.conv0``  - no images, 'links_count' == 0, 'mentions_count' == 0
    * ``<stem>.puret``  - as conv0 and additionally 'hashtags_count' == 0

    ``<stem>`` is the input path without its ".gz" suffix (if any) and without
    one further extension such as ".txt". If the input ends in ".gz" it is
    read through gzip and every output gets a ".gz" suffix and is written
    compressed.

    Args:
        filename: path of the dump to split, plain or gzip-compressed.

    Raises:
        OSError: if the input or an output file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    print("separating " + filename)

    gz = filename.endswith(".gz")

    # Derive the output stem from the base name only, so dots in directory
    # names (e.g. "./dump" or "data.v1/") cannot corrupt the result.
    directory, base = os.path.split(filename)
    if gz:
        base = base[:-len(".gz")]
    stem = os.path.join(directory, os.path.splitext(base)[0])
    suffix = ".gz" if gz else ""

    file_image = stem + ".image" + suffix
    file_conv0 = stem + ".conv0" + suffix
    file_conv1 = stem + ".conv1" + suffix
    file_puret = stem + ".puret" + suffix

    # NOTE: for an unzipped input the outputs are opened in append mode, so
    # running the script twice on the same plain file duplicates its output.
    with \
            _open_text(filename, gz) as text_in, \
            _open_text(file_image, gz, writing=True) as image_out, \
            _open_text(file_conv0, gz, writing=True) as conv0_out, \
            _open_text(file_conv1, gz, writing=True) as conv1_out, \
            _open_text(file_puret, gz, writing=True) as puret_out:

        for line in text_in:
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue

            record = json.loads(line)
            if not isinstance(record, dict) or 'text' not in record:
                continue

            # A missing 'images' key is treated like an empty image list.
            if record.get('images'):
                image_out.write(line)
                continue

            # Only link-free messages are considered for the text categories.
            if record.get('links_count') != 0:
                continue

            mentions_count = record.get('mentions_count')
            if mentions_count == 1:
                conv1_out.write(line)
            elif mentions_count == 0:
                conv0_out.write(line)
                # "Pure text": no links, no mentions and no hashtags.
                if record.get('hashtags_count') == 0:
                    puret_out.write(line)
| 52 | + |
# Script entry point: treat every command-line argument as a dump file to
# split. Guarded so that importing this module neither processes files nor
# mutates sys.argv.
if __name__ == "__main__":
    for f in sys.argv[1:]:
        separate(f)
| 55 | + |
0 commit comments