In [1]:
import itertools
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Root directory that every data path in this notebook is resolved against.
# Set the ANTICIPATIO_REPO environment variable to point at a different
# checkout; when it is unset the previously hardcoded location is used.
repo_path = Path(os.environ.get('ANTICIPATIO_REPO', '/home/krajda/anticipatio/'))

In [2]:
def open_fn(f):
    """Read one CSV into a DataFrame, falling back to an empty frame on failure.

    Parameters
    ----------
    f : path or file-like
        Passed straight to ``pd.read_csv`` (python parsing engine).

    Returns
    -------
    pd.DataFrame
        The parsed file, or an empty ``DataFrame`` when reading or parsing
        raises, so one bad file does not abort a bulk ``pd.concat``.
    """
    try:
        return pd.read_csv(f, engine='python')
    except Exception:
        # Best-effort load: unreadable / empty / malformed files are skipped.
        # Only Exception is caught so KeyboardInterrupt and SystemExit still
        # propagate and a long-running load can be interrupted.
        return pd.DataFrame()

# Discover every CSV beneath the two source directories (first directory first).
kol_csvs = (repo_path / 'data/futurists_kol/data').rglob('*csv')
rossdawson_csvs = (repo_path / 'data/futurists_rossdawson/data').rglob('*csv')
files = [*kol_csvs, *rossdawson_csvs]

# Each stripped line of the order file is used below as a file-name lookup key.
with open(repo_path / 'data/model_input_files_order.txt', 'tr') as f:
    fpaths = [line.strip() for line in f]

# Key the discovered paths by bare file name, then select them in the
# order given by the order file.
files_shuffled = pd.Series(files, index=[p.name for p in files])
files = files_shuffled.loc[fpaths].to_list()

# Load all files (unreadable ones contribute an empty frame) and stack them.
tweets = pd.concat(open_fn(path) for path in files)

tweets.columns = ['index', 'user', 'timestamp', 'url', 'txt']
tweets = tweets.reset_index(drop=True)

# Normalise column contents: text as str, user without '@' / surrounding
# whitespace and lower-cased, timestamp parsed to datetime.
tweets = tweets.assign(
    txt=tweets['txt'].astype(str),
    user=tweets['user'].str.replace('@', '').str.strip().str.lower(),
    timestamp=pd.to_datetime(tweets['timestamp']),
)

# Drop the stored 'index' column, keep the first row for every
# (timestamp, txt) pair, and renumber the rows.
tweets = (
    tweets
    .drop(columns=['index'])
    .drop_duplicates(subset=['timestamp', 'txt'])
    .reset_index(drop=True)
)

n_users = tweets['user'].nunique()
n_texts = tweets['txt'].nunique()
print('Unique users:{}\nUnique texts:{}'.format(n_users, n_texts))

docs = tweets['txt'].tolist()
print('Unique posts (timestamp+text): {}'.format(len(docs)))

Unique users:400
Unique texts:1200003
Unique posts (timestamp+text): 1458018


# TEXT CLEANING

In [3]:
import html
import re
from tqdm import tqdm

# Patterns stripped from the tweet text; the cleaning loop applies them in
# list order.

# URLS
url_pattern = re.compile(
    r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,"
    r"}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|("
    r"?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
)

# EMAILS
email_pattern = re.compile(r"\S*@\S*\..\S*")

# HANDLES (an @name at the start of the text or right after whitespace)
handle_pattern = re.compile(
    r"(?<=\s)(@[\w\-\.]+)(?=[\:\,\.\!\?\s]?)|^(@[\w\-\.]+)(?=[\:\,\.\!\?\s]?)"
)

regexes = [url_pattern, email_pattern, handle_pattern]

# Keep the raw text in `original_text` and always clean from it. On the first
# run this copies `txt`; on a re-run of this cell `txt` already holds cleaned
# text, so the preserved raw column is reused rather than being overwritten
# with cleaned output.
if 'original_text' not in tweets.columns:
    tweets['original_text'] = tweets['txt']
docs = tweets['original_text'].tolist()

print('Cleaning tweets... html unescape')
docs = [html.unescape(t) for t in tqdm(docs)]

# enumerate gives the pattern's position directly instead of searching the
# list for it on every iteration.
for regex_no, regex in enumerate(regexes):
    print('Cleaning tweets... removing regex #', regex_no)
    docs = [regex.sub('', t) for t in tqdm(docs)]

# Once the handle has been removed a retweet prefix 'RT @user: ...' reads
# 'RT : ...'; drop those four leading characters.
print('Cleaning tweets... removing RTs')
docs = [t[4:] if t.startswith('RT :') else t for t in tqdm(docs)]

tweets['txt'] = docs

Cleaning tweets... html unescape


  0%|          | 0/1458018 [00:00<?, ?it/s]

100%|██████████| 1458018/1458018 [00:00<00:00, 1489959.58it/s]


Cleaning tweets... removing regex # 0


100%|██████████| 1458018/1458018 [00:04<00:00, 304415.50it/s]


Cleaning tweets... removing regex # 1


100%|██████████| 1458018/1458018 [00:15<00:00, 96110.76it/s] 


Cleaning tweets... removing regex # 2


100%|██████████| 1458018/1458018 [00:13<00:00, 106689.20it/s]


Cleaning tweets... removing RTs


100%|██████████| 1458018/1458018 [00:00<00:00, 1999878.59it/s]


In [4]:
# Persist the cleaned frame (raw text kept in `original_text`) as a pickle.
tweets.to_pickle(repo_path / 'data/final.pkl')