In [1]:
import os
import time

import pandas as pd
import preprocessor as p
from pandas_geojson import to_geojson, write_geojson
from sklearn.cluster import KMeans

In [2]:
# Resolve the data directory and the two on-disk forms of the tweet dump.
data_dir = os.path.realpath("../data")
csv_path = os.path.join(data_dir, "tweets.csv")
hdf_path = os.path.join(data_dir, "tweets.h5")

# Load from the HDF5 cache when it exists; otherwise parse the CSV once and
# write the cache so later runs skip the CSV parse.
if os.path.exists(hdf_path):
    tweets = pd.read_hdf(hdf_path, key="tweets")
else:
    tweets = pd.read_csv(csv_path)
    # `key` is passed by keyword: recent pandas deprecates passing it positionally.
    tweets.to_hdf(hdf_path, key="tweets")

# Drop rows that are missing either coordinate.
tweets = tweets.dropna(subset=["longitude", "latitude"])

# Constrain the area to a latitude/longitude bounding box around France
# (bounds are exclusive). The explicit .copy() makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# loaded data (avoids SettingWithCopyWarning).
in_france = (
    (tweets["latitude"] > 42.33278)
    & (tweets["latitude"] < 51.08917)
    & (tweets["longitude"] > -4.795556)
    & (tweets["longitude"] < 8.230556)
)
tweets = tweets.loc[in_france].copy()

# Configure the preprocessor with the URL option only, then clean every tweet text.
p.set_options(p.OPT.URL)
tweets["text"] = tweets["text"].apply(p.clean)

# Index by tweet id for easy matching; uniqueness of ids is deliberately not checked.
tweets = tweets.set_index("id", verify_integrity=False)

In [3]:
# Two-column coordinate array: column 0 is longitude, column 1 is latitude.
COORDS = tweets[["longitude", "latitude"]].to_numpy()

# Time the clustering with a monotonic, high-resolution clock; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
# n_init is pinned explicitly because the library default changed between
# scikit-learn releases (10 -> 'auto'); 10 is the older default.
# NOTE(review): confirm 10 restarts is what the original run used.
kmeans = KMeans(n_clusters=100, n_init=10, random_state=0).fit(COORDS)
end = time.perf_counter()
print("KMeans took {} seconds".format(end - start))

# Attach each tweet's cluster label; labels_ is in the same row order as COORDS,
# which was taken from `tweets` just above.
tweets["cluster"] = kmeans.labels_

KMeans took 1.1309916973114014 seconds


In [4]:
# `to_geojson` / `write_geojson` come from pandas_geojson, imported in the
# notebook's import cell with the other dependencies.

# Convert the clustered tweets to GeoJSON, naming the latitude/longitude
# columns and passing `cluster` as the only property.
geojson = to_geojson(tweets, lat="latitude", lon="longitude", properties=["cluster"])

# Save the result alongside the input data.
geojson_path = os.path.join(data_dir, "tweets.geojson")
write_geojson(geojson, geojson_path)