In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import nltk
import re
import csv

In [2]:
# global variables
liveticker_folderpath = r'../data/livetickers_2020to25_2021'
coronoacovid_postings_path = r'../data/coronacovid_postings_year20'
selected_months = [r'01_20', r'02_20', r'03_20', r'04_20', r'05_20', r'06_20', r'07_20', r'08_20', r'09_20', r'10_20', r'11_20', r'12_20']
liwc_anger_path = r'../data/liwc_german_2007_anger'

output_df = r'../data/preprocessing/combined_postings.pickle'
output_text_file = r'../data/preprocessing/text'
output_text_pid_file = r'../data/preprocessing/text_pid'

Read Livetickers:

In [3]:
all_files = glob.glob(os.path.join(liveticker_folderpath, '*'))
# remove empty files
all_files = list(filter(lambda f: os.stat(f).st_size > 0, all_files))
# filter for postings files
all_files_postings = list(filter(lambda k: 'postings' in k, all_files))
#df = pd.concat((pd.read_table(f, error_bad_lines=False, warn_bad_lines=False, engine='python') for f in all_files_postings))
colNames = ["oid","rid","pid","ppid","cd","cn","o","vp","vn","hl","tx"]
df = pd.concat((pd.read_csv(filename, sep='\t', names=colNames, skiprows=1, quoting=csv.QUOTE_NONE, encoding='utf-8') for filename in all_files_postings))
#low_memory=False

#for filename in all_files_postings: 
#    print(filename)
#    df = pd.read_csv(filename, sep='\t', names=colNames, quoting=csv.QUOTE_NONE, encoding='utf-8')
#    print(df.columns)

In [4]:
df.columns

Index(['oid', 'rid', 'pid', 'ppid', 'cd', 'cn', 'o', 'vp', 'vn', 'hl', 'tx'], dtype='object')

In [5]:
df['pid'].isna().values.any()

False

In [6]:
df['allText'] = df['hl'].fillna('') + df['tx'].fillna('')
df['allText'].isna().values.any()

False

In [7]:
len(df['allText'])

9214323

In [8]:
len(df['pid'])

9214323

In [9]:
len(df['allText']) == len(df['pid'])

True

In [10]:
df.to_pickle(output_df)

In [11]:
df['allText'].to_csv(output_text_file, header=False, index=False)

In [12]:
df['pid'].to_csv(output_text_pid_file, header=False, index=False)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9214323 entries, 0 to 37425
Data columns (total 12 columns):
 #   Column   Dtype  
---  ------   -----  
 0   oid      int64  
 1   rid      int64  
 2   pid      int64  
 3   ppid     float64
 4   cd       object 
 5   cn       object 
 6   o        object 
 7   vp       int64  
 8   vn       int64  
 9   hl       object 
 10  tx       object 
 11  allText  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 913.9+ MB


In [14]:
df.head()

Unnamed: 0,oid,rid,pid,ppid,cd,cn,o,vp,vn,hl,tx,allText
0,2000122865130,1000213648,1064710786,,2020-12-31T20:04:57.643+01:00,unmo,,4,0,Herr Bundespräsident,nach Artikel 70 B-VG können Sie den Bundeskanz...,Herr Bundespräsidentnach Artikel 70 B-VG könne...
1,2000122865130,1000213648,1064724063,1064711000.0,2021-01-01T06:00:14.35+01:00,Art10 EMRK,,2,0,,Der Flash wird noch eine weitere Amtszeit dafü...,Der Flash wird noch eine weitere Amtszeit dafü...
2,2000122865130,1000213648,1064712838,1064711000.0,2020-12-31T20:57:02.017+01:00,it's my life,,2,0,,Das ist wahrscheinlich sogar unserem HBP zu pe...,Das ist wahrscheinlich sogar unserem HBP zu pe...
3,2000122865130,1000213648,1064708391,,2020-12-31T19:09:04.76+01:00,it's my life,,3,0,,Ausnahmegesetze für unseren Ausnahmenkanzler ...,Ausnahmegesetze für unseren Ausnahmenkanzler ...
4,2000122865130,1000213648,1064707195,,2020-12-31T18:43:57.973+01:00,sumac trebla,,3,0,Anschober will...,Anschober kann aber nicht.,Anschober will...Anschober kann aber nicht.


In [15]:
all_text_files = []
for month in selected_months:
    all_text_files.extend(glob.glob(os.path.join(os.path.join(coronoacovid_postings_path, month), '*_text')))

df_postings_year20 = pd.concat((pd.read_csv(filename, sep='\t', names=['title', 'text'], skiprows=1, quoting=csv.QUOTE_NONE, encoding='utf-8') for filename in all_text_files))
df_postings_year20.head()

Unnamed: 0,title,text
0,,Katastrophales Hygienebewusstsein + massive Be...
0,,"Sie sind wahrscheinlich auch jemand, der sich ..."
1,Tja,"Dann sind ""Leute wie ich"" offenbar in der Mehr..."
2,,"Na, seien wir froh, dass über Seuchenpräventio..."
3,,Deswegen schaut's heute auch so aus auf der Welt.


In [16]:
df_postings_year20['allText'] = df_postings_year20['title'].fillna('') + df_postings_year20['text'].fillna('')
df_postings_year20['allText'].to_csv(r'../data/preprocessing/text_year20', header=False, index=False)

In [17]:
all_metadata_files = []
for month in selected_months:
    all_metadata_files.extend(glob.glob(os.path.join(os.path.join(coronoacovid_postings_path, month), '*_metadata')))

meta_data_colnames = ['postid', 'parentid', 'communityidentityid', 'communityname', 'timestamp', 'followers', 'ratings_pos', 'ratings_neg']
df_postings_metadata_year20 = pd.concat((pd.read_csv(filename, sep='\t', names=meta_data_colnames, skiprows=1, quoting=csv.QUOTE_NONE, encoding='utf-8') for filename in all_metadata_files))
df_postings_metadata_year20.head()

Unnamed: 0,postid,parentid,communityidentityid,communityname,timestamp,followers,ratings_pos,ratings_neg
0,1049061634,,552163.0,Zornica,1579516469,12.0,0,2
0,1049075891,1049075000.0,170029.0,Plus Lucis,1579545033,49.0,21,1
1,1049076693,1049076000.0,603017.0,Robin55,1579546947,7.0,0,19
2,1049079664,1049077000.0,21668.0,Bobostandard,1579552601,0.0,6,0
3,1049078739,1049077000.0,,,1579550827,,0,0


In [18]:
df_postings_metadata_year20.to_csv(r'../data/preprocessing/metadata_year20', index=False)