In [1]:
import zstandard as zstd
import json

In [2]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''


    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode(errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]


## Get list of channels in ```channelcrawler.csv```

In [3]:
import pandas as pd

In [4]:
df_channelcrawler = pd.read_csv('/dlabdata1/youtube_large/channelcrawler.csv')

In [5]:
df_channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,M√§go de Oz - Topic,40200,395
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOM√ÅS,26200,31


In [6]:
df_channelcrawler.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164648 entries, 0 to 164647
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     164462 non-null  object
 1   join_date    164647 non-null  object
 2   link         164648 non-null  object
 3   name         164633 non-null  object
 4   subscribers  164648 non-null  int64 
 5   videos       164648 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 7.5+ MB


In [7]:
df_channelcrawler['channel_id'] = df_channelcrawler['link'].apply(lambda x: x.replace('http://www.youtube.com/channel/', ''))

In [8]:
df_channelcrawler.head()

Unnamed: 0,category,join_date,link,name,subscribers,videos,channel_id
0,Film and Animation,2017-05-21,http://www.youtube.com/channel/UCBJuEqXfXTdcPS...,MagnusNation,65100,28,UCBJuEqXfXTdcPSbGO9qqn1g
1,Entertainment,2011-12-13,http://www.youtube.com/channel/UCkNW9Q1VR_aeZ6...,Mago Dario Animazion...,60200,48,UCkNW9Q1VR_aeZ6uht83jJVQ
2,Music,2013-09-13,http://www.youtube.com/channel/UC1xcnrpcF59FWW...,M√§go de Oz - Topic,40200,395,UC1xcnrpcF59FWWELtZvJTdg
3,Music,2008-03-17,http://www.youtube.com/channel/UCXhkGgooXHDNwg...,Mago Merlino,14800,838,UCXhkGgooXHDNwgJXmoTSN7g
4,Entertainment,2014-10-19,http://www.youtube.com/channel/UCvZGsuvKlYOGiZ...,MAGO TOM√ÅS,26200,31,UCvZGsuvKlYOGiZTsxwJNS5Q


In [9]:
#store in a set since it will be faster to check if a channel is in channelcrawler
set_channelcrawler = set(df_channelcrawler['channel_id'])

## Remove non-english channels

Here, I use langdetect

Fishing for characters from different alphabets may let us with french-spanish-etc channels -> discuss with Manoel

In [10]:
from langdetect import detect
from langdetect import detect_langs

from collections import Counter

In [11]:
list_en_videos = []

In [12]:
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

In [13]:
def detect_language(text):
    '''Method that detect the language of the argument using langdetect'''
    
    # Create list to store the language detections
    detections = []
    
    for i in range(5):
        detections.append(detect(text))
        
    # Create the counter to get the most detected language
    c = Counter(detections)
    language_detected, _ = c.most_common()[0]
    
    print(detections)
    return language_detected

In [14]:
text = "Bonjour, je m'appelle Olivier et j'ai twenty ans"

In [15]:
detect_language(text)

['nl', 'fr', 'fr', 'fr', 'fr']


'fr'

## To ignore - Testing

In [16]:
idx = 0
for line in reader.readlines():
    idx += 1
    
    if idx % 250 == 0:
        break
        
    # line is a str dict, res is the dict corresponding to the str dict
    res = json.loads(line)
    
    title = res['title']
    description = res['description']
    
    if detect(title) == 'en' and res['channel_id'] in set_channelcrawler:
        list_en_videos.append(res)
    
    try: 
        if description != '':
            best_lang1 = str(detect_langs(description)[0])
            best_lang2 = str(detect_langs(description)[0])
            best_lang3 = str(detect_langs(description)[0])
            print('Detect language from description: ' + detect(description))
            print('Probability1 of languages from description: ' + best_lang1)
            print('Probability2 of languages from description: ' + best_lang2)
            print('Probability3 of languages from description: ' + best_lang3)
            #print('')
            
            language, prob = best_lang1.split(':')[0], float(best_lang1.split(':')[1])
            
            if prob < 0.8:
                print('Top1 lang detect from description with prob: ' + language + ' ' + str(prob))
                print('Title of the video : ' + title)
                print('Channel of the video : ' + res['channel_id'])
                print('')
        elif title != '':
            try:
                print('Detect language from title: ' + detect(title))
                print('Probability of languages from title: ' + str(detect_langs(title)))
                print('')
            except:
                print('Language not known for the video')
                
    except:
        try:
            if title != '':
                print('Detect language from title: ' + detect(title))
                print('Probability of languages from title: ' + str(detect_langs(title)))
                print('')
        except:
            print('Language not known for the video')
    
        
    

Detect language from description: ru
Probability1 of languages from description: ru:0.9999946789434702
Probability2 of languages from description: ru:0.9999942099350225
Probability3 of languages from description: ru:0.9999969883943975
Detect language from description: ru
Probability1 of languages from description: ru:0.9999937825853757
Probability2 of languages from description: ru:0.999994793419498
Probability3 of languages from description: ru:0.9999968764605269
Detect language from title: ru
Probability of languages from title: [ru:0.9999952532027125]

Detect language from title: ru
Probability of languages from title: [ru:0.9999933718970782]

Detect language from title: ru
Probability of languages from title: [ru:0.999994925410497]

Detect language from title: ru
Probability of languages from title: [ru:0.9999958616083578]

Detect language from title: ru
Probability of languages from title: [ru:0.9999935477021455]

Detect language from description: ru
Probability1 of languages from

Detect language from description: en
Probability1 of languages from description: en:0.9999965828855879
Probability2 of languages from description: en:0.9999966534128218
Probability3 of languages from description: en:0.9999950916624795
Detect language from description: en
Probability1 of languages from description: en:0.9999946604318573
Probability2 of languages from description: en:0.9999970706778863
Probability3 of languages from description: en:0.9999977801824794
Detect language from description: en
Probability1 of languages from description: en:0.9999981531233355
Probability2 of languages from description: en:0.9999977550551682
Probability3 of languages from description: en:0.9999955936567109
Detect language from description: en
Probability1 of languages from description: en:0.9999974508261928
Probability2 of languages from description: en:0.9999966526954003
Probability3 of languages from description: en:0.9999985094899656
Detect language from description: en
Probability1 of languag

Detect language from description: en
Probability1 of languages from description: en:0.9999971390564332
Probability2 of languages from description: en:0.9999953151567105
Probability3 of languages from description: en:0.9999957367222505
Detect language from description: en
Probability1 of languages from description: en:0.999997592091703
Probability2 of languages from description: en:0.999998194451352
Probability3 of languages from description: en:0.9999947896728498
Detect language from description: en
Probability1 of languages from description: en:0.9999964531271366
Probability2 of languages from description: en:0.9999975555237461
Probability3 of languages from description: en:0.9999974762987716
Detect language from description: en
Probability1 of languages from description: en:0.9999973614127753
Probability2 of languages from description: en:0.999996976601401
Probability3 of languages from description: en:0.9999985732937445
Detect language from description: en
Probability1 of languages 

Detect language from description: en
Probability1 of languages from description: en:0.999995970122163
Probability2 of languages from description: en:0.9999961303776419
Probability3 of languages from description: en:0.9999963373744964
Detect language from description: en
Probability1 of languages from description: en:0.9999958920547358
Probability2 of languages from description: en:0.9999975866728249
Probability3 of languages from description: en:0.9999958631362592
Detect language from description: en
Probability1 of languages from description: en:0.9999972259612976
Probability2 of languages from description: en:0.9999973277835108
Probability3 of languages from description: en:0.9999977563438243
Detect language from description: en
Probability1 of languages from description: en:0.9999958504021617
Probability2 of languages from description: en:0.9999963745315497
Probability3 of languages from description: en:0.9999978417086346
Detect language from description: en
Probability1 of language

Detect language from description: en
Probability1 of languages from description: en:0.9999968167833347
Probability2 of languages from description: en:0.999998116140693
Probability3 of languages from description: en:0.9999966025474812
Detect language from description: en
Probability1 of languages from description: en:0.9999963954373958
Probability2 of languages from description: en:0.9999944034376995
Probability3 of languages from description: en:0.9999976826699241
Detect language from description: en
Probability1 of languages from description: en:0.9999945428189049
Probability2 of languages from description: en:0.9999975567825286
Probability3 of languages from description: en:0.9999978433253862
Detect language from description: en
Probability1 of languages from description: en:0.999996419912794
Probability2 of languages from description: en:0.99999768908868
Probability3 of languages from description: en:0.9999963035782488
Detect language from description: en
Probability1 of languages f

Detect language from description: en
Probability1 of languages from description: en:0.7142848722139727
Probability2 of languages from description: en:0.7142845746243449
Probability3 of languages from description: en:0.9999968951859293
Top1 lang detect from description with prob: en 0.7142848722139727
Title of the video : Lauv - Easy Love (Cherry Beach Remix)
Channel of the video : UCzzzUN8yvD2LRAnY-lhzyLQ

Detect language from description: en
Probability1 of languages from description: en:0.9999982671723819
Probability2 of languages from description: en:0.9999972387723944
Probability3 of languages from description: en:0.9999965837919266
Detect language from description: en
Probability1 of languages from description: en:0.857139431030246
Probability2 of languages from description: en:0.5714269496843856
Probability3 of languages from description: en:0.8571406407705007
Detect language from description: en
Probability1 of languages from description: en:0.8571396654253632
Probability2 of la

Detect language from description: en
Probability1 of languages from description: en:0.9999970805490315
Probability2 of languages from description: en:0.8571390332559383
Probability3 of languages from description: en:0.8571405779744261
Detect language from description: en
Probability1 of languages from description: en:0.9999954226574497
Probability2 of languages from description: en:0.7142840813365424
Probability3 of languages from description: en:0.5714284956850127
Detect language from description: en
Probability1 of languages from description: en:0.8571403320217096
Probability2 of languages from description: en:0.9999975592777519
Probability3 of languages from description: en:0.5714274384280221
Detect language from description: en
Probability1 of languages from description: en:0.9999964176269915
Probability2 of languages from description: en:0.857139507050321
Probability3 of languages from description: en:0.7142837879447693
Detect language from description: en
Probability1 of language

Detect language from description: en
Probability1 of languages from description: en:0.5714258496479254
Probability2 of languages from description: pl:0.5714271554429284
Probability3 of languages from description: en:0.7142827385224737
Top1 lang detect from description with prob: en 0.5714258496479254
Title of the video : Alan Walker- Tired (neutral. Remix)
Channel of the video : UCzzzUN8yvD2LRAnY-lhzyLQ

Detect language from description: en
Probability1 of languages from description: en:0.571426711991521
Probability2 of languages from description: en:0.9999955503775272
Probability3 of languages from description: en:0.7142834638072879
Top1 lang detect from description with prob: en 0.571426711991521
Title of the video : Frank Walker - Young (Sam Feldt Remix)
Channel of the video : UCzzzUN8yvD2LRAnY-lhzyLQ

Detect language from description: en
Probability1 of languages from description: en:0.7142830001743612
Probability2 of languages from description: en:0.9999957220486152
Probability3 o

In [17]:
list_en_videos[0]

{'categories': 'Howto & Style',
 'channel_id': 'UCzzzZ3-icktxbC3j7hkWqRw',
 'crawl_date': '2019-11-08 05:24:10.745916',
 'description': 'Benvenuto to Ciao Citalia, the blog from the leading Italian holiday specialist. Make Ciao Citalia your go-to for destination guides, food and wine features, recipes, and inspiration for things to see and do on your next holiday to Italy. You‚Äôll also find first-hand accounts from our team on their travels through Italy, from a Tuscan honeymoon to a trip on the famous Venice Simplon-Orient-Express.\n\nTake a look now at https://ciao.citalia.com/',
 'dislike_count': 2,
 'display_id': 'FV_kEBb1XqU',
 'duration': 63,
 'like_count': 17,
 'tags': 'Citalia,blog,italy,vog,videos,video,italian,food,wine,rome,venice,florence,milan,amalfi,coast,beach,holiday,travel,tips',
 'title': 'Ciao Citalia | The blog from the Italian holiday specialists',
 'upload_date': '2017-06-02 00:00:00',
 'view_count': 1334}

In [18]:
len(list_en_videos)

165

## Check rankings.jsonl

In [19]:
rankings = []

In [20]:
with open('/dlabdata1/youtube_large/rankings.jsonl', 'r') as json_file:
    json_list = list(json_file)

In [21]:
for json_str in json_list:
    res = json.loads(json_str)
    rankings.append(res)

In [22]:
len(rankings)

164677

In [23]:
rankings[0]

{'success': True,
 'data': {'social blade rank': 389823,
  'subscriber rank': 231223,
  'video views rank': 192084,
  'country rank': 3,
  'film rank': 11654,
  'country': 'Anguilla'},
 'crawl_time': '2020-02-17 16:40:48.336406',
 'channel': 'UCBJuEqXfXTdcPSbGO9qqn1g'}

## Title, Tag and (Description) pre-processing per video

In [24]:
video_test = list_en_videos[0]

In [25]:
video_test['title']

'Ciao Citalia | The blog from the Italian holiday specialists'

In [26]:
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Step 1 : lowercase, remove stop words and tokenize

In [27]:
stop_words = set(stopwords.words('english'))

In [28]:
tokenizer = RegexpTokenizer(r'\w+')

In [29]:
tokens_per_video = []

In [30]:
title_tokens = [w for w in tokenizer.tokenize(video_test['title'].lower()) if not w in stop_words]

In [31]:
tag_tokens = [w for w in tokenizer.tokenize(video_test['tags'].lower()) if not w in stop_words]

In [32]:
print(title_tokens)

['ciao', 'citalia', 'blog', 'italian', 'holiday', 'specialists']


In [33]:
print(tag_tokens)

['citalia', 'blog', 'italy', 'vog', 'videos', 'video', 'italian', 'food', 'wine', 'rome', 'venice', 'florence', 'milan', 'amalfi', 'coast', 'beach', 'holiday', 'travel', 'tips']


In [34]:
# We want to keep duplicates !!
tokens_per_video = title_tokens + tag_tokens

In [35]:
print(tokens_per_video)

['ciao', 'citalia', 'blog', 'italian', 'holiday', 'specialists', 'citalia', 'blog', 'italy', 'vog', 'videos', 'video', 'italian', 'food', 'wine', 'rome', 'venice', 'florence', 'milan', 'amalfi', 'coast', 'beach', 'holiday', 'travel', 'tips']


#### Step 2 : Stemming

Ask Manoel : Stemming vs Lemmatization for the task ? We have a huge dataset so we shouldn't juste lemmatization since it would take too much time right ? For tags, lemmatization make no sense since we do not have any sentences

In [36]:
from nltk.stem.snowball import SnowballStemmer

In [37]:
s_stemmer = SnowballStemmer(language='english')

In [38]:
tokens_per_video_stemmed = [s_stemmer.stem(w) for w in tokens_per_video]

In [39]:
print(tokens_per_video_stemmed)

['ciao', 'citalia', 'blog', 'italian', 'holiday', 'specialist', 'citalia', 'blog', 'itali', 'vog', 'video', 'video', 'italian', 'food', 'wine', 'rome', 'venic', 'florenc', 'milan', 'amalfi', 'coast', 'beach', 'holiday', 'travel', 'tip']


#### Step 3 : Putting it together

In [40]:
import nltk
import collections

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
s_stemmer = SnowballStemmer(language='english')

In [42]:
def get_freq_tokens_per_video(video):
    title_tokens = [w for w in tokenizer.tokenize(video['title'].lower()) if not w in stop_words]
    tag_tokens = [w for w in tokenizer.tokenize(video['tags'].lower()) if not w in stop_words]
    
    # We want to keep duplicates !!
    tokens_per_video = title_tokens + tag_tokens

    # Stemming
    stemmed_tokens_per_video = ([s_stemmer.stem(w) for w in tokens_per_video])
    
    # Return a Counter object of the tokens
    return collections.Counter(stemmed_tokens_per_video)

In [43]:
list_en_videos

[{'categories': 'Howto & Style',
  'channel_id': 'UCzzzZ3-icktxbC3j7hkWqRw',
  'crawl_date': '2019-11-08 05:24:10.745916',
  'description': 'Benvenuto to Ciao Citalia, the blog from the leading Italian holiday specialist. Make Ciao Citalia your go-to for destination guides, food and wine features, recipes, and inspiration for things to see and do on your next holiday to Italy. You‚Äôll also find first-hand accounts from our team on their travels through Italy, from a Tuscan honeymoon to a trip on the famous Venice Simplon-Orient-Express.\n\nTake a look now at https://ciao.citalia.com/',
  'dislike_count': 2,
  'display_id': 'FV_kEBb1XqU',
  'duration': 63,
  'like_count': 17,
  'tags': 'Citalia,blog,italy,vog,videos,video,italian,food,wine,rome,venice,florence,milan,amalfi,coast,beach,holiday,travel,tips',
  'title': 'Ciao Citalia | The blog from the Italian holiday specialists',
  'upload_date': '2017-06-02 00:00:00',
  'view_count': 1334},
 {'categories': 'Howto & Style',
  'channel_

In [44]:
print(get_freq_tokens_per_video(list_en_videos[0]))

Counter({'citalia': 2, 'blog': 2, 'italian': 2, 'holiday': 2, 'video': 2, 'ciao': 1, 'specialist': 1, 'itali': 1, 'vog': 1, 'food': 1, 'wine': 1, 'rome': 1, 'venic': 1, 'florenc': 1, 'milan': 1, 'amalfi': 1, 'coast': 1, 'beach': 1, 'travel': 1, 'tip': 1})


## Create Sparse Matrix 

In [45]:
from scipy.sparse import dok_matrix
from sys import getsizeof

In [46]:
list_stemmed_tokens = set()

In [47]:
for video in list_en_videos:
    tokens_per_video = get_freq_tokens_per_video(video).keys()
    list_stemmed_tokens.update(tokens_per_video)

In [48]:
list_stemmed_tokens = list(list_stemmed_tokens)

In [49]:
len(list_stemmed_tokens)

891

In [50]:
size_of_tokens_dict = len(list_stemmed_tokens)

In [51]:
number_videos = len(list_en_videos)

In [52]:
S = dok_matrix((number_videos, size_of_tokens_dict))

In [53]:
def fill_underlying_dict(freq_tokens_per_video, list_stemmed_tokens, dict_freq_tokens_for_sparse_matrix, idx_video):
    '''Method to fill the underlying dictionnary in order to 
    update the sparse matrix incrementally by videos'''
    
    for key in freq_tokens_per_video.keys():
        
        # Column index in the sparse matrix (one column for each token)
        idy_token = list_stemmed_tokens.index(key)
    
        # Filling the underlying dict
        dict_freq_tokens_for_sparse_matrix[(idx_video, idy_token)] = freq_tokens_per_video[key]
    

In [54]:
# Row index in the sparse matrix (one row for each video)
idx_video = 0

for video in list_en_videos:
    # For each video, create a underlying dictionnary for filling the sparse matrix efficiently
    dict_freq_tokens_for_sparse_matrix = {}
    
    # Get the tokens for each video and theirs number of occurences
    freq_tokens_per_video = get_freq_tokens_per_video(video)
    
    # Fill the underlying dict
    fill_underlying_dict(freq_tokens_per_video, list_stemmed_tokens, dict_freq_tokens_for_sparse_matrix, idx_video)
        
    # Update the Sparse Matrix
    dict.update(S, dict_freq_tokens_for_sparse_matrix)
    
    # Increment Row index for next video
    idx_video += 1

In [55]:
S

<165x891 sparse matrix of type '<class 'numpy.float64'>'
	with 2772 stored elements in Dictionary Of Keys format>

In [56]:
len(list_en_videos)

165

In [57]:
S.items()

dict_items([((0, 658), 1), ((0, 442), 2), ((0, 480), 2), ((0, 471), 2), ((0, 445), 2), ((0, 537), 1), ((0, 701), 1), ((0, 216), 1), ((0, 49), 2), ((0, 318), 1), ((0, 571), 1), ((0, 459), 1), ((0, 527), 1), ((0, 493), 1), ((0, 693), 1), ((0, 192), 1), ((0, 178), 1), ((0, 775), 1), ((0, 448), 1), ((0, 110), 1), ((1, 710), 1), ((1, 383), 1), ((2, 602), 2), ((2, 448), 2), ((2, 110), 2), ((2, 138), 2), ((2, 287), 2), ((2, 442), 1), ((2, 701), 1), ((2, 471), 1), ((2, 445), 1), ((2, 206), 1), ((2, 91), 1), ((2, 867), 1), ((2, 315), 1), ((2, 775), 1), ((2, 39), 1), ((2, 785), 1), ((2, 178), 1), ((2, 15), 1), ((2, 366), 1), ((2, 726), 1), ((2, 135), 1), ((2, 612), 1), ((2, 369), 1), ((2, 457), 1), ((2, 52), 1), ((2, 540), 1), ((2, 409), 1), ((2, 167), 1), ((2, 582), 1), ((2, 754), 1), ((2, 8), 1), ((2, 284), 1), ((2, 594), 1), ((2, 821), 1), ((2, 837), 1), ((2, 571), 1), ((2, 482), 1), ((2, 139), 1), ((2, 75), 1), ((2, 495), 1), ((3, 442), 1), ((3, 810), 1), ((3, 735), 2), ((3, 633), 1), ((3, 1

## Group freq_tokens_per_videos by channel

In [58]:
tokens_per_channel = {}
channels_in_dict = set()

In [59]:
for vid in list_en_videos:
    if vid['channel_id'] in channels_in_dict:
        tokens_per_channel[vid['channel_id']] = tokens_per_channel[vid['channel_id']] + get_freq_tokens_per_video(vid)
    else:
        tokens_per_channel[vid['channel_id']] = get_freq_tokens_per_video(vid)
        channels_in_dict.add(vid['channel_id'])

In [60]:
tokens_per_channel

{'UCzzzZ3-icktxbC3j7hkWqRw': Counter({'ciao': 1,
          'citalia': 29,
          'blog': 2,
          'italian': 17,
          'holiday': 12,
          'specialist': 1,
          'itali': 17,
          'vog': 1,
          'video': 2,
          'food': 21,
          'wine': 4,
          'rome': 1,
          'venic': 1,
          'florenc': 1,
          'milan': 1,
          'amalfi': 1,
          'coast': 2,
          'beach': 2,
          'travel': 7,
          'tip': 4,
          'birthpac': 1,
          'cannelloni': 1,
          'sicili': 2,
          'gennaro': 26,
          'contaldo': 26,
          'vacat': 3,
          'toarmina': 1,
          'palermo': 1,
          'syracus': 1,
          'trapani': 1,
          'sicilian': 1,
          'church': 1,
          'cefal√π': 1,
          'cathedr': 1,
          'mountain': 1,
          'hike': 1,
          'ski': 1,
          'harbor': 1,
          'fish': 6,
          'cave': 1,
          'lake': 1,
          'volcano': 1,
    