# Create Subset of the 5000 most popular tweets

In [1]:
import h5py
import numpy as np    
import matplotlib.pyplot as plt
import pandas as pd
import os
from PIL import Image


######## Very basic access to the dataset - let's see what we are working with! #######
# Open the raw file read-only: this notebook only reads from it, and 'r' rules out
# accidentally modifying the source dataset.
raw_dataset = h5py.File('C:/Users/Admin/Documents/GitHub/Team_Project_ComputerVision/datasets/climatevisions_2019.h5', 'r')
image_directory = 'C:\\Users\\Admin\\Documents\\Dataset_TeamProject'
# Listing the folder fails early (with an OSError) if the image directory is not available.
contents = os.listdir(image_directory)

# Access the 'upper' data - we only have tweet data
for item in raw_dataset.keys():
    print("Items: " + item)

# Access the actual subgroups with data for us - different info we can look at - mostly things provided in Excel by Katharina
# Plain indexing is used instead of require_group(): require_group() would create the
# group if it did not exist, whereas indexing raises a KeyError on a missing group.
for item in raw_dataset['tweet_data'].keys():
    print(item)


Items: tweet_data
author_id
created_at
edit_history_tweet_ids
entities_annotations
entities_cashtags
entities_hashtags
entities_mentions
entities_urls
geo_coord_data
geo_coord_type
geo_placeid
img_name
img_size
in_reply_to_user_id
lang
like_count
media_keys
possibly_sensitive
quote_count
referenced_tweets
reply_count
retweet_count
source
text
tweet_id
withheld_copyright
withheld_countrycode


In [2]:
# Work with the 'tweet_data' group only; everything below reads from it.
dataset = raw_dataset['tweet_data']

# Read every dataset of the group fully into memory ([:]), keyed by its name,
# so that each dataset becomes one column.
data_dict = {name: dataset[name][:] for name in dataset.keys()}

# Build the DataFrame from the column dictionary and preview the first rows.
df = pd.DataFrame(data_dict)
df.head()


Unnamed: 0,author_id,created_at,edit_history_tweet_ids,entities_annotations,entities_cashtags,entities_hashtags,entities_mentions,entities_urls,geo_coord_data,geo_coord_type,...,possibly_sensitive,quote_count,referenced_tweets,reply_count,retweet_count,source,text,tweet_id,withheld_copyright,withheld_countrycode
0,b'24931983',b'2019-01-01T23:59:01.000Z',b'1080252082781855744',"b'start:30, end:31, probability:0.4413, type:P...",b':NA',b'NA',b'NA',"b'start1:66, start2:90, end1:89, end2:113, url...",b'NA',b'NA',...,True,0,b'NA',0,0,b'CoSchedule',b'Kids Win the Right to Sue the US Government ...,b'1080252082781855744',b'NA',b'NA'
1,b'2602968048',b'2019-01-01T23:57:02.000Z',b'1080251584729223173',b'NA',b':NA',b'NA',b'NA',"b'start:278, end:301, url:https://t.co/is0NHYP...",b'NA',b'NA',...,True,0,b'NA',0,0,b'Twitter Web Client',b'Wat zou de eenvoudigste verklaring zijn dat ...,b'1080251584729223173',b'NA',b'NA'
2,b'734730931848634368',b'2019-01-01T23:55:49.000Z',b'1080251277693595649',b'NA',b':NA',b'NA',b'NA',"b'start1:55, start2:260, end1:78, end2:283, ur...",b'NA',b'NA',...,True,0,b'NA',0,0,b'Twitter for iPad',"b""Here's what's on the radar for climate chang...",b'1080251277693595649',b'NA',b'NA'
3,b'1242955585',b'2019-01-01T23:53:00.000Z',b'1080250568583585792',"b'start:52, end:78, probability:0.4544, type:O...",b':NA',b'NA',"b'start1:82, start2:99, end1:95, end2:109, use...","b'start1:140, start2:164, end1:163, end2:187, ...",b'NA',b'NA',...,True,0,b'NA',0,0,b'TweetDeck',"b'The biggest story of the year, for all of hu...",b'1080250568583585792',b'NA',b'NA'
4,b'18085565',b'2019-01-01T23:51:36.000Z',b'1080250217524539393',"b'start:122, end:127, probability:0.7755, type...",b':NA',"b'start1:184, start2:239, start3:262, end1:196...","b'start1:248, start2:275, end1:261, end2:283, ...","b'start:284, end:307, url:https://t.co/jqCU6Ds...",b'NA',b'NA',...,True,0,b'NA',0,0,b'Twitter Web Client',b'Still celebrating 2016 w stale Crystal & ran...,b'1080250217524539393',b'NA',b'NA'


# Preprocessing

In [3]:
# Turn the byte strings read from the HDF5 file into real text.

cols_to_strip = [
        'author_id', 'created_at', 'edit_history_tweet_ids', 'entities_annotations', 
        'entities_hashtags', 'entities_mentions', 'entities_urls', 'geo_coord_data',
        'geo_coord_type', 'geo_placeid', 'img_name', 'img_size', 'in_reply_to_user_id',
        'lang', 'media_keys', 'referenced_tweets', 'source', 'text', 'tweet_id', 
        'withheld_copyright', 'withheld_countrycode', 'entities_cashtags'
        ]   


def _decode_bytes(value):
    """Return ``value`` decoded as UTF-8 when it is ``bytes``; return anything else unchanged.

    Passing non-bytes values through (already decoded text, missing values) keeps this
    cell safe to re-run on a frame that was decoded before.
    """
    # assumes the stored strings are UTF-8 encoded -- TODO confirm against the dataset writer
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Decode explicitly instead of converting to text and stripping a "b'...'" wrapper with a
# regex: the wrapper only exists when the conversion yields the bytes repr, the regex misses
# values whose repr is double-quoted (b"Here's ..."), and a repr keeps escape sequences
# instead of the real characters.
df[cols_to_strip] = (
    df[cols_to_strip]
    .apply(lambda column: column.map(_decode_bytes))
    .astype('string')
)

print(df.shape)

# replace string NA to "real" missing value for further analysis
# NOTE(review): values such as ':NA' (seen in entities_cashtags) do not match this pattern
# and stay as text -- confirm whether they should count as missing too.
df = df.replace(r'^NA$', np.nan, regex=True)


## only keep images here
# drop all columns except the img_ columns
selected_columns = ['img_name', 'img_size']
df_selected = df.loc[:, selected_columns]
df_selected.head()

(714769, 27)


Unnamed: 0,img_name,img_size
0,id_1080252082781855744_2019-01-01.jpg,"(533, 1200, 3)"
1,id_1080251584729223173_2019-01-01.jpg,"(675, 1200, 3)"
2,id_1080251277693595649_2019-01-01.jpg,"(349, 620, 3)"
3,id_1080250568583585792_2019-01-01.jpg,"(348, 620, 3)"
4,id_1080250217524539393_2019-01-01.jpg,"(720, 960, 3)"


print(df_selected['img_name'].iloc[0])

Image.open

In [4]:
# Preview the first rows of df as it stands after the preprocessing cell.
df.head()

Unnamed: 0,author_id,created_at,edit_history_tweet_ids,entities_annotations,entities_cashtags,entities_hashtags,entities_mentions,entities_urls,geo_coord_data,geo_coord_type,...,possibly_sensitive,quote_count,referenced_tweets,reply_count,retweet_count,source,text,tweet_id,withheld_copyright,withheld_countrycode
0,24931983,2019-01-01T23:59:01.000Z,1080252082781855744,"start:30, end:31, probability:0.4413, type:Pla...",:NA,,,"start1:66, start2:90, end1:89, end2:113, url1:...",,,...,True,0,,0,0,CoSchedule,Kids Win the Right to Sue the US Government Ov...,1080252082781855744,,
1,2602968048,2019-01-01T23:57:02.000Z,1080251584729223173,,:NA,,,"start:278, end:301, url:https://t.co/is0NHYP90...",,,...,True,0,,0,0,Twitter Web Client,Wat zou de eenvoudigste verklaring zijn dat er...,1080251584729223173,,
2,734730931848634368,2019-01-01T23:55:49.000Z,1080251277693595649,,:NA,,,"start1:55, start2:260, end1:78, end2:283, url1...",,,...,True,0,,0,0,Twitter for iPad,Here's what's on the radar for climate change ...,1080251277693595649,,
3,1242955585,2019-01-01T23:53:00.000Z,1080250568583585792,"start:52, end:78, probability:0.4544, type:Oth...",:NA,,"start1:82, start2:99, end1:95, end2:109, usern...","start1:140, start2:164, end1:163, end2:187, ur...",,,...,True,0,,0,0,TweetDeck,"The biggest story of the year, for all of huma...",1080250568583585792,,
4,18085565,2019-01-01T23:51:36.000Z,1080250217524539393,"start:122, end:127, probability:0.7755, type:P...",:NA,"start1:184, start2:239, start3:262, end1:196, ...","start1:248, start2:275, end1:261, end2:283, us...","start:284, end:307, url:https://t.co/jqCU6Ds5l...",,,...,True,0,,0,0,Twitter Web Client,Still celebrating 2016 w stale Crystal & ranci...,1080250217524539393,,


In [5]:
# Summary statistics; the recorded output covers the four count columns
# (like_count, quote_count, reply_count, retweet_count).
df.describe()

Unnamed: 0,like_count,quote_count,reply_count,retweet_count
count,714769.0,714769.0,714769.0,714769.0
mean,10.137216,0.441205,0.759852,3.915581
std,259.864707,14.017944,28.498886,131.400644
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0
75%,5.0,0.0,0.0,2.0
max,82582.0,5007.0,17280.0,51781.0


## Missing Values

#### Observations:
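- in_reply_to_user_id and referenced_tweets contain only missing values; entities_cashtags shows just 2 missing values because its other entries hold the placeholder `:NA`, which the `^NA$` pattern does not match
- geo_coord_type and withheld_copyright are almost entirely missing (only about 100-400 values present); geo_coord_data (about 551,000 missing) and withheld_countrycode (about 279,000 missing) are partially filled
- roughly 420,000-490,000 missing: entities_annotations, entities_mentions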

In [6]:
# Count the missing values per column.
# The string "NA" was already replaced by real missing values in the preprocessing cell,
# so repeating the regex replace over the whole frame here would change nothing.
df.isna().sum()

author_id                      0
created_at                     0
edit_history_tweet_ids         0
entities_annotations      424162
entities_cashtags              2
entities_hashtags         242003
entities_mentions         488033
entities_urls                  0
geo_coord_data            551306
geo_coord_type            714351
geo_placeid               691120
img_name                       0
img_size                       0
in_reply_to_user_id       714769
lang                           0
like_count                     0
media_keys                     0
possibly_sensitive             0
quote_count                    0
referenced_tweets         714769
reply_count                    0
retweet_count                  0
source                         0
text                           0
tweet_id                       0
withheld_copyright        714644
withheld_countrycode      279353
dtype: int64

## Image Access

### Goals:
- allow image access directly from 12 subfolders (extracted)
- save small dataset (images and h5py separately)

In [7]:
# Helpers to find the month subfolder that holds an image, based on the image's file name
def find_month(img_name: str):
    """Extract the month token from an image file name.

    The name is split on '-'; the second-to-last piece is taken and everything from
    its first '.' onwards is dropped. For 'id_<tweet id>_2019-01-01.jpg' this
    yields '01'.

    Args:
        img_name: Image file name.

    Returns:
        The part of the second-to-last '-'-separated piece that precedes its first '.'.
        When the name contains no '-', the index wraps around to the only piece.
    """
    dash_parts = img_name.split('-')
    month_part = dash_parts[len(dash_parts) - 2]
    month, _, _ = month_part.partition('.')
    return month

def get_image_directory(get_month_string, argument):
    """Look up the subfolder string stored for a month key.

    Args:
        get_month_string: Mapping from month key to subfolder string.
        argument: Month key to look up.

    Returns:
        The value stored under ``argument``, or None when the key is absent.
    """
    sub_directory = get_month_string.get(argument)
    return sub_directory


# Maps a two-digit month key ("01" .. "12") to its subfolder string, which is wrapped in
# backslashes (e.g. "\\01_January\\") so it can be concatenated between a base directory
# and a file name.
get_month_string = {
    f"{number:02d}": f"\\{number:02d}_{name}\\"
    for number, name in enumerate(
        (
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ),
        start=1,
    )
}


In [12]:
 
### How many is up to us, can be tried out based on result of models ###
subset_size = 5000

# Columns kept in the subset, in the order they are written to the output file.
columns_of_subset = ['tweet_id', 'like_count', 'retweet_count', 'text', 'img_name','referenced_tweets', 'lang', 'quote_count', 'created_at']

# Output dataset names that differ from the DataFrame column they are written from.
dataset_names = {'lang': 'language'}

#### Get the top `subset_size` tweets based on 'like_count' ####
# nlargest() selects the rows with the highest like_count, so no separate sort is needed.
# Every column is stored as text, hence the cast of the whole subset to str.
most_liked_subset = df.nlargest(subset_size, 'like_count')[columns_of_subset].astype(str)

## Creates Dataset and saves it as a h5 file
# Declare a variable-length UTF-8 string type explicitly so the object arrays of Python
# strings are written as HDF5 strings instead of relying on h5py's dtype inference.
text_dtype = h5py.string_dtype(encoding='utf-8')

with h5py.File('climatevisions_2019_popular.h5', 'w') as hf:
    # Create a group named 'tweet_data'
    tweet_data_group = hf.create_group('tweet_data')

    # Store one dataset per subset column under the 'tweet_data' group
    for column in columns_of_subset:
        tweet_data_group.create_dataset(
            dataset_names.get(column, column),
            data=most_liked_subset[column].to_numpy(dtype=object),
            dtype=text_dtype,
        )

    

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2

## Writes images to another directory to create the 5000 images dataset
output_root = 'C:\\Users\\Admin\\Documents\\Dataset_Test\\'
# cv2.imwrite() does not create missing folders, so make sure the target exists first.
os.makedirs(output_root, exist_ok=True)

# NOTE(review): imread() + imwrite() decodes and re-encodes every image; if an exact copy of
# the original file is wanted, a plain file copy would preserve the bytes -- confirm intent.
for imageName in most_liked_subset['img_name']:
    # Reset per image so the error handler never reports the path of a previous image.
    complete_image_directory = None
    try:
        sub_directory = get_image_directory(get_month_string, find_month(imageName))
        if sub_directory is None:
            print("No month folder found for " + imageName)
            continue

        complete_image_directory = image_directory + sub_directory + imageName

        # cv2.imread() returns None instead of raising when the file is missing or unreadable.
        img = cv2.imread(complete_image_directory)
        if img is None:
            print("Could not read " + complete_image_directory)
            continue

        # cv2.imwrite() reports a failed write through its return value.
        output_directory = output_root + imageName
        if not cv2.imwrite(output_directory, img):
            print("Could not write " + output_directory)

    except Exception as e:
        # Best effort: report the affected image together with the error and keep going.
        print(complete_image_directory if complete_image_directory is not None else imageName, e)