In [None]:
import os
import folium
import pandas as pd
import preprocessor as p
import matplotlib.pyplot as plt

# Directory holding the raw CSV export and its HDF5 cache
data_dir = os.path.realpath("../data")
csv_path = os.path.join(data_dir, "tweets.csv")
hdf_path = os.path.join(data_dir, "tweets.h5")

# Load the tweets, preferring the HDF5 cache when it already exists
if os.path.exists(hdf_path):
    # Read tweets from the HDF5 cache
    tweets = pd.read_hdf(hdf_path, key="tweets")
else:
    # First run: parse the CSV once and cache it as HDF5 for later runs.
    # `key` is passed by keyword because recent pandas releases deprecate
    # positional arguments to `to_hdf` other than the path.
    tweets = pd.read_csv(csv_path)
    tweets.to_hdf(hdf_path, key="tweets")

# Remove rows where either coordinate is missing
tweets = tweets.dropna(subset=["longitude", "latitude"])

# Constrain the area to France (strict bounds on latitude and longitude)
tweets = tweets.loc[
    (tweets.latitude > 42.33278)
    & (tweets.latitude < 51.08917)
    & (tweets.longitude > -4.795556)
    & (tweets.longitude < 8.230556)
]

# Set index to id for easy matching.
# verify_integrity=False is required because of duplicate ids.
# Reassign instead of using inplace=True so the filtered frame is not mutated.
tweets = tweets.set_index("id", verify_integrity=False)

print(f"Size of full dataset: {len(tweets)}")

In [None]:
# Extremes of the tweet coordinates
lon_min, lon_max = tweets.longitude.min(), tweets.longitude.max()
lat_min, lat_max = tweets.latitude.min(), tweets.latitude.max()

# Bounding box, ordered as (min lon, max lon, min lat, max lat)
bbox = (lon_min, lon_max, lat_min, lat_max)

# Midpoint of the bounding box, ordered as (latitude, longitude)
center = (
    lat_min + (lat_max - lat_min) / 2,
    lon_min + (lon_max - lon_min) / 2,
)

In [None]:
# Map points with folium, centred on the bounding-box midpoint.
# NOTE: the name `map` shadows Python's built-in `map`; it is kept because the
# following cell displays the object under this name.
map = folium.Map(location=[center[0], center[1]], zoom_start=6)

# Walk the two coordinate columns directly rather than using `iterrows()`,
# which builds a full Series for every row only to read two values from it.
for lat, lon in zip(tweets['latitude'], tweets['longitude']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.5,
    ).add_to(map)

In [None]:
# Display the folium map built in the previous cell (rendered as the cell's output)
map

In [None]:
# Static scatter plot of the tweet coordinates, framed by the bounding box
fig, ax = plt.subplots(figsize=(8, 7))
ax.scatter(tweets.longitude, tweets.latitude, zorder=1, alpha=0.2, c="y", s=10)
ax.set_xlim(bbox[0], bbox[1])
ax.set_ylim(bbox[2], bbox[3])

# Label the figure so it can be read on its own
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Tweet locations")

# Show the figure explicitly so the cell does not echo a return-value repr
plt.show()

In [None]:
# Preview the first three rows of the dataset
tweets.head(n=3)

In [None]:
# Earliest and latest creation values in the dataset
# NOTE(review): if `createdAt` is stored as text, min/max compare strings
# lexicographically -- confirm the column format sorts chronologically.
created_at = tweets['createdAt']
min_date, max_date = created_at.min(), created_at.max()
print('Min date: ', min_date)
print('Max date: ', max_date)

# Time span covered by the data, computed from the two extremes
start_ts = pd.Timestamp(min_date)
end_ts = pd.Timestamp(max_date)
duration = end_ts - start_ts
print('Duration: ', duration)

In [None]:
# Restrict the tweet preprocessor to the URL option
p.set_options(p.OPT.URL)

# Run every tweet's text through the preprocessor to strip URLs
tweets['text'] = tweets['text'].apply(p.clean)

In [None]:
# Remove rows where either the longitude or the latitude is null
tweets = tweets.dropna(subset=['longitude', 'latitude'])