In [1]:
import os
import time
import warnings

import pandas as pd
import preprocessor as p
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Set data directory and the location of the HDF5 cache
data_dir = os.path.realpath("../data")
hdf_path = os.path.join(data_dir, "tweets.h5")

# Bounding box used to constrain the tweets to France (degrees)
LAT_MIN, LAT_MAX = 42.33278, 51.08917
LON_MIN, LON_MAX = -4.795556, 8.230556

if os.path.exists(hdf_path):
    # Read tweets from the HDF5 cache
    tweets = pd.read_hdf(hdf_path, key="tweets")
else:
    # First run: parse the CSV once and cache it as HDF5 for later runs.
    # `key` is passed by keyword because recent pandas versions deprecate
    # passing it positionally to `to_hdf`.
    tweets = pd.read_csv(os.path.join(data_dir, "tweets.csv"))
    tweets.to_hdf(hdf_path, key="tweets")

# Remove rows with a null longitude or latitude
tweets = tweets.dropna(subset=["longitude", "latitude"])

# Constrain the area to France (strict inequalities on every edge).
# `.copy()` makes the result an independent frame, so the column assignment
# below cannot trigger SettingWithCopyWarning or write to a temporary.
tweets = tweets.loc[
    (tweets["latitude"] > LAT_MIN)
    & (tweets["latitude"] < LAT_MAX)
    & (tweets["longitude"] > LON_MIN)
    & (tweets["longitude"] < LON_MAX)
].copy()

# Set preprocessor options so that `p.clean` targets URLs
p.set_options(p.OPT.URL)

# Remove URLs from the tweets
tweets["text"] = tweets["text"].apply(p.clean)

# Set index to id for easy matching (duplicates are deliberately not checked)
tweets = tweets.set_index("id", verify_integrity=False)

# Convert long and lat to a numpy array with columns (longitude, latitude)
COORDS = tweets[["longitude", "latitude"]].to_numpy()

In [3]:
# Timing the MiniBatchKMeans

def time_minibatch(batch_size, n_clusters=10, n_init=10, random_state=None,
                   coords=None):
    """Time one MiniBatchKMeans fit and report the result as a string.

    Parameters
    ----------
    batch_size : int
        Mini-batch size forwarded to ``MiniBatchKMeans``.
    n_clusters : int, default 10
        Number of clusters forwarded to ``MiniBatchKMeans``.
    n_init : int, default 10
        Number of initialisations forwarded to ``MiniBatchKMeans``.
    random_state : int or None, default None
        Seed forwarded to ``MiniBatchKMeans``. ``None`` keeps the original
        unseeded behaviour; pass an integer for repeatable runs.
    coords : array-like or None, default None
        Data to fit. ``None`` falls back to the module-level ``COORDS``.

    Returns
    -------
    str
        ``"batch_size: <batch_size>, time: <elapsed seconds>"``.
    """
    data = COORDS if coords is None else coords
    mini_batch = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=n_init,
        random_state=random_state,
    )
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    # Only the fit itself is timed, not the estimator construction.
    start = time.perf_counter()
    mini_batch.fit(data)
    elapsed = time.perf_counter() - start
    return "batch_size: {}, time: {}".format(batch_size, elapsed)

In [4]:
# Batch sizes to benchmark
BATCH_SIZES = [10, 20, 30, 40]

# Ignore UserWarnings only while the benchmark runs; the previous warning
# filters are restored when the `with` block exits, so later cells still
# see their warnings.
# NOTE(review): the first timing in a fresh kernel may include one-off
# warm-up cost rather than a real batch-size effect -- confirm by running a
# throwaway fit before the loop.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    for batch_size in BATCH_SIZES:
        print(time_minibatch(batch_size))

batch_size: 10, time: 2.2949771881103516
batch_size: 20, time: 0.1600053310394287
batch_size: 30, time: 0.20699334144592285
batch_size: 40, time: 0.17800283432006836
