In [4]:
import os
import time
import pandas as pd
import preprocessor as p

In [8]:
# Directory holding the raw CSV export and its cached HDF5 copy
data_dir = os.path.realpath("../data")
hdf_path = os.path.join(data_dir, "tweets.h5")
csv_path = os.path.join(data_dir, "tweets.csv")

if os.path.exists(hdf_path):
    # Reuse the HDF5 copy written by an earlier run
    tweets = pd.read_hdf(hdf_path, key="tweets")
else:
    # First run: parse the CSV once and cache it as HDF5 for later runs
    # (original author's note: HDF is expected to cope well with large data).
    tweets = pd.read_csv(csv_path)
    # `key` is passed by keyword: positional use is deprecated in pandas 2.1+
    # and keyword-only from pandas 3.0.
    tweets.to_hdf(hdf_path, key="tweets")

# Bounding box the analysis treats as France; bounds are exclusive.
LAT_MIN, LAT_MAX = 42.33278, 51.08917
LON_MIN, LON_MAX = -4.795556, 8.230556

# Drop rows missing either coordinate (previously only longitude was checked,
# leaving null latitudes to be discarded implicitly by the comparison below).
tweets = tweets.dropna(subset=["longitude", "latitude"])

# Constrain the area to France
in_france = (
    (tweets["latitude"] > LAT_MIN)
    & (tweets["latitude"] < LAT_MAX)
    & (tweets["longitude"] > LON_MIN)
    & (tweets["longitude"] < LON_MAX)
)
# .copy() detaches the result from the unfiltered frame so that later column
# assignments do not raise SettingWithCopyWarning.
tweets = tweets.loc[in_france].copy()

# Restrict the preprocessor to URL handling so clean() only strips URLs
p.set_options(p.OPT.URL)

# Build a new frame instead of assigning into / mutating the filtered one:
# this avoids SettingWithCopyWarning and keeps the cell safe to reason about.
# NOTE(review): p.clean is applied to every value as-is; presumably 'text'
# has no missing values -- confirm against the data.
tweets = (
    tweets
    # Remove URLs from the tweets
    .assign(text=tweets["text"].apply(p.clean))
    # Index by tweet id for easy matching; duplicate ids are not checked
    .set_index("id", verify_integrity=False)
)

In [11]:
def min_max_scale(values):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(values - min) / (max - min)``. When every value is identical the
        range is zero; the result is then all zeros instead of the NaN column
        a 0/0 division would produce.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Degenerate range: avoid 0/0 and map every value to 0
        return values - lowest
    return (values - lowest) / span


# Normalize the coordinates to [0, 1]
# TODO: test whether normalization speeds up the algorithm
tweets = tweets.assign(
    longitude=min_max_scale(tweets["longitude"]),
    latitude=min_max_scale(tweets["latitude"]),
)

# Transform to a numpy array with columns (createdAt, longitude, latitude)
data = tweets.loc[:, ["createdAt", "longitude", "latitude"]].values