Skip to content

Commit 5f48476

Browse files
committed
added python file which prepares dump files for machine learning
1 parent ddfa2b5 commit 5f48476

File tree

1 file changed

+55
-0
lines changed

1 file changed

+55
-0
lines changed

bin/seperate.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
3+
# call e.g. with:
4+
# bin/seperate.py data/dump/own/messages_*.txt.gz
5+
# to run this script on all messages
6+
7+
import gzip
import json
import os
import sys
10+
11+
def separate(filename):
    """Split a message dump into per-category files next to the input.

    The input is read line by line, one JSON object per line. It is treated
    as gzip-compressed when its name ends in ``.gz``; the outputs are then
    gzip-compressed as well. Every selected line is copied unchanged into
    the output file(s) chosen by ``_categories``:

    * ``image`` - messages with at least one image
    * ``conv1`` - no images, no links, exactly one mention
    * ``conv0`` - no images, no links, no mentions
    * ``puret`` - like ``conv0`` and additionally no hashtags

    Output names are derived by ``_derived_path``; for example
    ``messages.txt.gz`` yields ``messages.image.gz`` and ``messages.txt``
    yields ``messages.image``.

    Blank lines and JSON values that are not objects with a ``text`` key
    are skipped.

    Args:
        filename: path of the dump file to split.

    Raises:
        OSError: if the input or one of the outputs cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    print("separating " + filename)

    compressed = filename.endswith(".gz")
    opener = gzip.open if compressed else open
    # NOTE(review): as before, plain-text outputs are opened in append mode
    # while gzip outputs are truncated, so re-running on an uncompressed
    # dump appends duplicates. Confirm whether appending is intended.
    out_mode = "wt" if compressed else "at"

    with opener(filename, "rt", encoding="utf-8") as text_in, \
            opener(_derived_path(filename, "image"), out_mode, encoding="utf-8") as image_out, \
            opener(_derived_path(filename, "conv0"), out_mode, encoding="utf-8") as conv0_out, \
            opener(_derived_path(filename, "conv1"), out_mode, encoding="utf-8") as conv1_out, \
            opener(_derived_path(filename, "puret"), out_mode, encoding="utf-8") as puret_out:

        sinks = {
            "image": image_out,
            "conv0": conv0_out,
            "conv1": conv1_out,
            "puret": puret_out,
        }
        for line in text_in:
            # A stray blank line (e.g. at the end of a dump) is not a record.
            if not line.strip():
                continue
            for label in _categories(json.loads(line)):
                sinks[label].write(line)


def _derived_path(filename, label):
    """Return the output path for *label* derived from the input *filename*.

    The last data extension is replaced by *label* and a trailing ``.gz`` is
    kept. Only the final path component is inspected, so dots in directory
    names do not affect the result, and a name without any extension simply
    gets ``.<label>`` appended.
    """
    compressed = filename.endswith(".gz")
    base = filename[:-len(".gz")] if compressed else filename
    root, _extension = os.path.splitext(base)
    return root + "." + label + (".gz" if compressed else "")


def _categories(message):
    """Return the list of output labels a decoded JSON *message* belongs to.

    NOTE(review): the nesting of the original conditions was reconstructed;
    this assumes "no images" covers both a missing and an empty ``images``
    entry, and that the hashtag check applies only to messages without
    mentions. Confirm against the intended data sets.
    """
    if not isinstance(message, dict) or "text" not in message:
        return []
    if message.get("images"):
        return ["image"]
    # Conversation/pure-text sets only contain link-free messages that carry
    # an explicit mention count.
    if message.get("links_count") != 0 or "mentions_count" not in message:
        return []
    mentions = message["mentions_count"]
    if mentions == 1:
        return ["conv1"]
    if mentions == 0:
        labels = ["conv0"]
        if message.get("hashtags_count") == 0:
            labels.append("puret")
        return labels
    return []
52+
53+
# Run only when executed as a script, so importing this module has no side
# effects; sys.argv is read without being modified.
if __name__ == "__main__":
    for f in sys.argv[1:]:
        separate(f)
55+

0 commit comments

Comments
 (0)