In [1]:
import h5py
import numpy as np    
import matplotlib.pyplot as plt
import pandas as pd
import os
from PIL import Image


######## Very basic access to the dataset - let's see what we are working with! #######
raw_dataset = h5py.File('climatevisions_2019_popular.h5','r+') 
image_directory = 'C:\\Users\\Admin\\Documents\\Dataset_small\\'
contents = os.listdir(image_directory)
#print(contents)

# Access the 'upper' data - we only have tweet data 
for item in raw_dataset.keys():
   print("Items: " + item)    
    
# Access the actual subgroups with data for us - different info we can look at - mostly things provided in Excel by Katharina
for item in raw_dataset.require_group('tweet_data').keys():
    print(item)


Items: tweet_data
created_at
img_name
language
like_count
quote_count
referenced_tweets
retweet_count
text
tweet_id


In [2]:
# Access the dataset within the group
dataset = raw_dataset['tweet_data']  ## excludes unnecessary information - only tweet_data
 
# Create a dictionary to store column data
data_dict = {}
    
# Iterate through the keys (assuming each key is a column name)
for key in dataset.keys():
     # Access the data for each column
     column_data = dataset[key][:]
        
     # Store the data in the dictionary with the column name as the key
     data_dict[key] = column_data
 
# Convert the dictionary to a Pandas DataFrame
df = pd.DataFrame(data_dict)
df.head()


Unnamed: 0,created_at,img_name,language,like_count,quote_count,referenced_tweets,retweet_count,text,tweet_id
0,b'2019-07-22T12:38:24.000Z',b'id_1153283149360762880_2019-07-22.jpg\n',b'en',b'82582',b'3918',b'<NA>',b'50280',b'the UN released a 740 page report compiled o...,b'1153283149360762880'
1,b'2019-08-20T09:28:39.000Z',b'id_1163744643600637952_2019-08-20.jpg\n',b'en',b'69820',b'2456',b'<NA>',b'51781',"b'The Amazon Rainforest, one of the wettest pl...",b'1163744643600637952'
2,b'2019-04-28T18:51:22.000Z',b'id_1122574040936452097_2019-04-28.jpg\n',b'en',b'69235',b'87',b'<NA>',b'11051',b'just learned about climate change https://t....,b'1122574040936452097'
3,b'2019-10-28T13:10:13.000Z',b'id_1188805167958974465_2019-10-28.jpg\n',b'en',b'65465',b'70',b'<NA>',b'6124',b'Climate change caused this. https://t.co/JG2...,b'1188805167958974465'
4,b'2019-03-19T16:30:00.000Z',b'id_1108042949449969666_2019-03-19.jpg\n',b'en',b'62852',b'976',b'<NA>',b'9145',b'#GreenNewDeal haters\xe2\x80\x99 plan to add...,b'1108042949449969666'


# Preprocessing

In [3]:
# strip of "b'" of all strings

cols_to_strip = ['created_at', 'img_name', 'language', 'referenced_tweets', 'text', 'tweet_id']   

df[cols_to_strip] = df[cols_to_strip].astype('string')
df[cols_to_strip] = df[cols_to_strip].replace(to_replace=r'^b\':?(.*)\'$', value=r'\1', regex=True)


print(df.shape)
df.dtypes

## only keep images here
# drop all columns exepct img_ columns
selected_columns = ['img_name']
df_selected = df.loc[:, selected_columns]
df_selected.head()

(5000, 9)


Unnamed: 0,img_name
0,id_1153283149360762880_2019-07-22.jpg
1,id_1163744643600637952_2019-08-20.jpg
2,id_1122574040936452097_2019-04-28.jpg
3,id_1188805167958974465_2019-10-28.jpg
4,id_1108042949449969666_2019-03-19.jpg


print(df_selected['img_name'].iloc[0])

Image.open

In [4]:
df.head()

Unnamed: 0,created_at,img_name,language,like_count,quote_count,referenced_tweets,retweet_count,text,tweet_id
0,2019-07-22T12:38:24.000Z,id_1153283149360762880_2019-07-22.jpg,en,b'82582',b'3918',,b'50280',the UN released a 740 page report compiled ove...,1153283149360762880
1,2019-08-20T09:28:39.000Z,id_1163744643600637952_2019-08-20.jpg,en,b'69820',b'2456',,b'51781',"The Amazon Rainforest, one of the wettest plac...",1163744643600637952
2,2019-04-28T18:51:22.000Z,id_1122574040936452097_2019-04-28.jpg,en,b'69235',b'87',,b'11051',just learned about climate change https://t.co...,1122574040936452097
3,2019-10-28T13:10:13.000Z,id_1188805167958974465_2019-10-28.jpg,en,b'65465',b'70',,b'6124',Climate change caused this. https://t.co/JG2Ly...,1188805167958974465
4,2019-03-19T16:30:00.000Z,id_1108042949449969666_2019-03-19.jpg,en,b'62852',b'976',,b'9145',#GreenNewDeal haters’ plan to address Climate ...,1108042949449969666


In [5]:
df.describe()

Unnamed: 0,created_at,img_name,language,like_count,quote_count,referenced_tweets,retweet_count,text,tweet_id
count,5000,5000,5000,5000,5000,5000.0,5000,5000,5000
unique,4999,5000,28,1237,232,1.0,735,5000,5000
top,2019-02-07T19:34:34.000Z,id_1153283149360762880_2019-07-22.jpg\n,en,b'161',b'2',,b'50',the UN released a 740 page report compiled ove...,1153283149360762880
freq,2,1,4823,39,363,5000.0,51,1,1


## Missing Values

#### Observations:
- entities_cashtags, in_reply_to_user_id, referenced_tweets only missing values
- 100-400 missing values: geo_coord_data, geo_coord_type, withheld_copyright, withheld_countrycode
- around 300.000 missing: entities_annotations, entities_mentions

In [6]:
# replace string NA to "real" missing value for further analysis
df = df.replace(r'^NA$', np.nan, regex=True)
df.isna().sum()

created_at           0
img_name             0
language             0
like_count           0
quote_count          0
referenced_tweets    0
retweet_count        0
text                 0
tweet_id             0
dtype: int64

## Testing of Models 

### Goals:
- try to find fitting models for our project
- produce good results for analysis, try to visualize it to later display in GUI

In [58]:
from torchvision.io import read_image
from torchvision.models import convnext_large, ConvNeXt_Large_Weights


## Image Classification - supposed to display most important object - example Test - ConvNext (doesn't really work well)
img = read_image(image_directory + 'id_1092304223591587840_2019-02-04.jpg')

weights_convNext = ConvNeXt_Large_Weights.DEFAULT
model_convNext = convnext_large(weights=weights_convNext)
model_convNext.eval()
preprocess_convNext = weights_convNext.transforms()
batch_convNext = preprocess_convNext(img).unsqueeze(0)
prediction_convNext = model_convNext(batch_convNext).squeeze(0).softmax(0)
class_id_convNext = prediction_convNext.argmax().item()
score = prediction_convNext[class_id_convNext].item()
category_name = weights_convNext.meta["categories"][class_id_convNext]
print(f"{category_name}: {100 * score:.1f}%")




Windsor tie: 44.1%


In [56]:
from torchvision.io.image import read_image
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
from torchvision.transforms.functional import to_pil_image

## Test Model for semantic segmentation - has a few different weights and can e.g. create a mask of a human being but... surely there are better models
weights = FCN_ResNet50_Weights.DEFAULT
model = fcn_resnet50(weights=weights)
model.eval()

img = read_image(image_directory + 'id_1092304223591587840_2019-02-04.jpg')

preprocess = weights.transforms()

batch = preprocess(img).unsqueeze(0)

prediction = model(batch)["out"]
normalized_masks = prediction.softmax(dim=1)
class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
mask = normalized_masks[0, class_to_idx["person"]]

print(weights.meta["categories"])
to_pil_image(mask).show()


['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
