In [4]:
import collections
import json
import nltk
import os
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import zstandard as zstd


from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from pyspark import SparkContext, SparkConf
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from scipy.sparse import dok_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
s_stemmer = SnowballStemmer(language='english')

# Select  the data

### Selection criteria
- Not used to train the topic modelling
- Video with more than 10'000 views

In [8]:
class Zreader:

    def __init__(self, file, chunk_size=16384):
        '''Init method'''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''


    def readlines(self):
        '''Generator method that creates an iterator for each line of JSON'''
        while True:
            chunk = self.reader.read(self.chunk_size).decode(errors="ignore")
            if not chunk:
                break
            lines = (self.buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line

            self.buffer = lines[-1]

In [9]:
# Load set of videos to consider
with open('/dlabdata1/youtube_large/olam/data/view10000_sub10000/idx_vid_to_consider.pickle', 'rb') as f:
    idx_vid_to_consider = pickle.load(f)
f.close()

In [10]:
# Load channels
df_channelcrawler = pd.read_csv('/dlabdata1/youtube_large/channelcrawler.csv')

df_channelcrawler['channel_id'] = df_channelcrawler['link'].apply(lambda x: x.replace('http://www.youtube.com/channel/', ''))

# filter the channels
df_channelcrawler_100000sub = df_channelcrawler[df_channelcrawler['subscribers'] >= 100000]
set_channelcrawler_100000sub = set(df_channelcrawler_100000sub['channel_id'])

In [12]:
reader = Zreader("/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

idx = 0
array_relevant_infos = []

for line in reader.readlines():
    ###start_iter = time.time()
    idx += 1
    
    if idx % 1000000 == 0:
        print('Progress: ' + str(int(idx/1000000)) + '/85')
        
    if idx in idx_vid_to_consider:
        
        # line is a str dict, video is the dict corresponding to the str dict
        video = json.loads(line)
        
        array_vid_relevant_infos = [video['channel_id']]
        array_vid_relevant_infos.append(video['view_count'])
        array_vid_relevant_infos.append(video['upload_date'][:4])
        array_vid_relevant_infos.append(video['categories'])
        
        array_relevant_infos.append(array_vid_relevant_infos)
        
    

Progress: 1/85
Progress: 2/85
Progress: 3/85
Progress: 4/85
Progress: 5/85
Progress: 6/85
Progress: 7/85
Progress: 8/85
Progress: 9/85
Progress: 10/85
Progress: 11/85
Progress: 12/85
Progress: 13/85
Progress: 14/85
Progress: 15/85
Progress: 16/85
Progress: 17/85
Progress: 18/85
Progress: 19/85
Progress: 20/85
Progress: 21/85
Progress: 22/85
Progress: 23/85
Progress: 24/85
Progress: 25/85
Progress: 26/85
Progress: 27/85
Progress: 28/85
Progress: 29/85
Progress: 30/85
Progress: 31/85
Progress: 32/85
Progress: 33/85
Progress: 34/85
Progress: 35/85
Progress: 36/85
Progress: 37/85
Progress: 38/85
Progress: 39/85
Progress: 40/85
Progress: 41/85
Progress: 42/85
Progress: 43/85
Progress: 44/85
Progress: 45/85
Progress: 46/85
Progress: 47/85
Progress: 48/85
Progress: 49/85
Progress: 50/85
Progress: 51/85
Progress: 52/85
Progress: 53/85
Progress: 54/85
Progress: 55/85
Progress: 56/85
Progress: 57/85
Progress: 58/85
Progress: 59/85
Progress: 60/85
Progress: 61/85
Progress: 62/85
Progress: 63/85
P

In [13]:
# Get the dataframe of all the videos that we will consider

column_names = ['channel_id', 'view_counts', 'uploaded_year', 'category']

df = pd.DataFrame(array_relevant_infos, columns=column_names)

In [14]:
df.shape

(21714294, 4)

### Remove all the videos that are used for topic modelling

Find the video indices in the dataframe such that:
- more than 100'000 subscribers
- top20 from category/channel/year

In [15]:
df_sub100000 = df[df['channel_id'].isin(set_channelcrawler_100000sub)]
df_top20 = df_sub100000.sort_values(['view_counts'], ascending=False).groupby(['category', 'uploaded_year', 'channel_id']).head(20)
df_top20.head()

Unnamed: 0,channel_id,view_counts,uploaded_year,category
21326653,UC0C-w0YjGpqDXGB8IHb662A,4468090305,2017,Music
10648298,UCVp3nfGRxmMadNDuVbJSk8A,4295905423,2015,Music
7977749,UCcdwLMPsaU2ezNSJU1nFoBQ,3838039119,2016,Education
4655616,UCmfFGTSsfJVu6CGvL8r75qg,3709532958,2014,Music
13575776,UCN1hnUccO4FD5WfM7ithXaw,3055180938,2015,Music


In [16]:
index_to_remove = set(df_top20.index)

In [17]:
idx_vid_to_consider_classifier = []

for index in df.index:
    if index not in index_to_remove:
        idx_vid_to_consider_classifier.append(index)

In [19]:
with open('/dlabdata1/youtube_large/olam/data/classifier/idx_vid_to_consider_classifier.pickle', 'wb') as f:
    pickle.dump(idx_vid_to_consider_classifier, f)
f.close()

# Process the data

- Transform every video into BoW, according to the topic model vocabulary
- Get the transformed data -> distribution over the topic for each video
- Separate into train set and test set

In [10]:
def isEnglishAlpha(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [11]:
def get_freq_tokens_per_video(video):
    ''''''
    
    title_tokens = [w for w in tokenizer.tokenize(video['title'].lower()) if not w in stop_words]
    tag_tokens = [w for w in tokenizer.tokenize(video['tags'].lower()) if not w in stop_words]
    
    # We want to keep duplicates !!
    tokens_per_video = title_tokens + tag_tokens

    # Filter token with length < 3, with non english alphabet since fastext is not 100% accurate and remove numerical token 
    tokens_keep = []
    for token in tokens_per_video:
        if len(token) >= 3 and (not token.isnumeric()) and isEnglishAlpha(token):
            tokens_keep.append(token)
    
    
    # Stemming
    stemmed_tokens_per_video = ([s_stemmer.stem(w) for w in tokens_keep])
    
    
    # Return a Counter object of the tokens
    return collections.Counter(tokens_keep)

In [12]:
def fill_underlying_dict(freq_tokens_per_video, dict_stemmed_tokens, i_vid):
    '''Method to fill the underlying dictionnary in order to 
    update the sparse matrix incrementally by videos'''
    
    dict_freq_tokens_for_sparse_matrix = {}
    
    for key in freq_tokens_per_video.keys():
        
        # Column index in the sparse matrix (one column for each token)
        try:
            j_token = dict_stemmed_tokens[key]
            
            # Filling the underlying dict
            dict_freq_tokens_for_sparse_matrix[(i_vid % 1000000, j_token)] = freq_tokens_per_video[key]
            
        except KeyError:
            None
    
    return dict_freq_tokens_for_sparse_matrix

In [13]:
def remove_zero_rows(M):
    '''Function that removes all rows from sparse matrix M that contains only zero.'''
    num_nonzeros = np.diff(M.indptr)
    return M[num_nonzeros != 0]

In [4]:
# Load dictionnary of words
with open('/dlabdata1/youtube_large/olam/data/view10000_sub100000/id2word_tok100vid_sub100000.pickle', 'rb') as f:
    id2word = pickle.load(f)
f.close()

# Load index of data for classifier
with open('/dlabdata1/youtube_large/olam/data/classifier/idx_vid_to_consider_classifier.pickle', 'rb') as f:
    idx_vid_to_consider_classifier = pickle.load(f)
f.close()

In [5]:
word2id = {v: k for k, v in id2word.items()}

In [26]:
idx_vid_to_consider_sorted = list(idx_vid_to_consider)
idx_vid_to_consider_sorted.sort()

In [27]:
index_data = set([idx_vid_to_consider_sorted[i] for i in idx_vid_to_consider_classifier])

In [28]:
with open('/dlabdata1/youtube_large/olam/data/classifier/index_data.pickle', 'wb') as f:
    pickle.dump(index_data, f)
f.close()

In [3]:
# Load index of data for classifier
with open('/dlabdata1/youtube_large/olam/data/classifier/index_data.pickle', 'rb') as f:
    index_data = pickle.load(f)
f.close()

In [6]:
vocab = list(id2word.values())

In [14]:
data = dok_matrix((1000000, len(vocab)), dtype=np.uint8)
groundtruth = []


reader = Zreader(
    "/dlabdata1/youtube_large/yt_metadata_all.jsonl.zst", chunk_size=2**28)

idx = 0
i_vid = 0

for line in reader.readlines():
    ###start_iter = time.time()
    idx += 1

    if idx % 10000000 == 0:
        print('Processed ' + str(idx) + ' videos...')

    if i_vid % 1000000 == 0 and i_vid != 0:

        file_name = 'data' + str(int(i_vid / 1000000)) + '.npz'
        
        if not os.path.isfile('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + file_name):
            
            data = data.tocsr()
            scipy.sparse.save_npz(
                '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + file_name, data)
            data = dok_matrix((1000000, len(vocab)), dtype=np.uint8)

    if idx in index_data:

        # line is a str dict, video is the dict corresponding to the str dict
        video = json.loads(line)

        # Get the tokens for each video and theirs number of occurences
        freq_tokens_per_video = get_freq_tokens_per_video(video)

        # For each video, create a underlying dictionnary for filling the sparse matrix efficiently
        dict_freq_tokens_for_sparse_matrix = fill_underlying_dict(
            freq_tokens_per_video, word2id, i_vid)

        # Need to check that the video contains token from the reduced vocabulary
        if dict_freq_tokens_for_sparse_matrix != {}:

            # Update the Sparse Matrix
            dict.update(data, dict_freq_tokens_for_sparse_matrix)
            i_vid += 1

            # Get groundtruth values
            groundtruth.append(video['categories'])

# Save last sparse matrix
data = data.tocsr()
data = remove_zero_rows(data)
scipy.sparse.save_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_last.npz', data)

Processed 10000000 videos...
Processed 20000000 videos...
Processed 30000000 videos...
Processed 40000000 videos...
Processed 50000000 videos...
Processed 60000000 videos...
Processed 70000000 videos...
Processed 80000000 videos...


### Get training features for classifier

- get full matrix of BoW
- process for pyspark
- !!! on the cluster, run the model, SAVE the model AND transformed data 
- transform the data to have k features, which are the distribution over the topics 

In [15]:
# get full matrix of BoW
data = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data1.npz')

for i in range(2, 18):
    data_next = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data' + str(i) + '.npz')
    data = scipy.sparse.vstack([data, data_next])

# Add last matrix
data_last = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_last.npz')
data = scipy.sparse.vstack([data, data_last])

In [16]:
len(groundtruth) == remove_zero_rows(data).shape[0]

True

In [17]:
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_final.npz', data)

In [19]:
with open('/dlabdata1/youtube_large/olam/data/classifier/groundtruth.pickle', 'wb') as f:
    pickle.dump(groundtruth, f)
f.close()

In [3]:
def get_dict_for_row(row, S):
    '''Construct SparseVector bag-of-word for each row (videos)'''
    tmp_dict = {}
    for key, value in row:
        tmp_dict[key[1]] = value

    return SparseVector(S.shape[1], tmp_dict)

In [4]:
data = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/data_final.npz')

In [5]:
# process for pyspark

data_spark = []

print('Process video for topic modelling...')
for i in range(data.shape[0]):

    if i % 5000000 == 0:
        print(str(i) + ' videos processed...')

    data_spark.append([i, get_dict_for_row(data.getrow(i).todok().items(), data)])

Process video for topic modelling...
0 videos processed...
5000000 videos processed...
10000000 videos processed...
15000000 videos processed...


In [None]:
with open('/dlabdata1/youtube_large/olam/data/classifier/list_data_spark.pickle', 'wb') as f:
    pickle.dump(data_spark, f)
f.close()

In [6]:
conf = SparkConf().setMaster("local[8]").setAll([('spark.executor.memory', '10g'),('spark.driver.memory','32g'),('spark.driver.maxResultSize', '0')])

# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

sc = spark.sparkContext



In [7]:
# Construct dataframe for LDA
df = spark.createDataFrame(data_spark, ["id", "features"])

In [None]:
# Save the spark dataframe
#save the dataframe
df.write\
        .option('compression', 'gzip')\
        .json('/dlabdata1/youtube_large/olam/data/classifier/sparkdf_data.json')

In [8]:
# Get model and/or transformed data!

model = LocalLDAModel.load('/dlabdata1/youtube_large/olam/data/classifier/best_model')

In [9]:
transformed_data = model.transform(df)

In [10]:
transformed_data.show(20)

+---+--------------------+--------------------+
| id|            features|   topicDistribution|
+---+--------------------+--------------------+
|  0|(42757,[1230,3534...|[3.41848242604404...|
|  1|(42757,[851,4683,...|[3.10372866523442...|
|  2|(42757,[851,986,4...|[2.92423131538094...|
|  3|(42757,[851,986,1...|[3.95298682552134...|
|  4|(42757,[851,986,3...|[2.84204946225357...|
|  5|(42757,[851,4683,...|[2.55484644367840...|
|  6|(42757,[851,986,4...|[3.20200251989183...|
|  7|(42757,[851,4683,...|[3.53808319352224...|
|  8|(42757,[851,4683,...|[3.53808319352224...|
|  9|(42757,[851,2333,...|[0.03949832339932...|
| 10|(42757,[851,4683,...|[3.30670318920320...|
| 11|(42757,[851,4683,...|[2.92423131538095...|
| 12|(42757,[851,4683,...|[3.01130750254167...|
| 13|(42757,[851,1044,...|[3.01130750254167...|
| 14|(42757,[851,3230,...|[3.01130750254167...|
| 15|(42757,[851,1730,...|[0.03535400904659...|
| 16|(42757,[851,3663,...|[3.10372866523443...|
| 17|(42757,[851,4683,...|[3.66635619908

In [48]:
number_videos_in_dataset = transformed_data.count()

In [35]:
batch_size = 500000

In [21]:
n_iter = int(number_videos_in_dataset / batch_size)

In [22]:
n_iter 

35

In [36]:
n_topic = 125

- Because of the memory issue, we will create a Sparse matrix that contains the distribution over the topics for each video.

- For that, we slide the data into 35 batches and for each one, we will create a dok sparse matrix and in the end, we will save all the results in the compressed csc format and in the end we will be able to stack all the "sub" sparse matrices

In [52]:
for k in range(n_iter):
    
    print('Iteration ' + str(k + 1) + '/' + str(n_iter))
    
    transformed_data_sub = transformed_data.where(
        col("id").between(0 + k * 500000, (k + 1) * 500000 - 1))

    # Create sparse matrix
    S = dok_matrix((batch_size, n_topic))

    for i, topic_dist_one_vid in enumerate(transformed_data_sub.select('topicDistribution').collect()):

        dict_topic_dist_one_vid = {}

        for j, prob in enumerate(topic_dist_one_vid['topicDistribution']):

            dict_topic_dist_one_vid[(i, j)] = prob

        # Fill data in to sparse matrix
        dict.update(S, dict_topic_dist_one_vid)

    filename = 'transformed_data' + str(k) + '.npz'
    scipy.sparse.save_npz(
        '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + filename, S.tocsr())

Iteration 35/35


In [49]:
# last iteration
transformed_data_sub = transformed_data.where(
    col("id").between(0 + n_iter * 500000, number_videos_in_dataset + 1))

S = dok_matrix((number_videos_in_dataset - batch_size * n_iter, n_topic))

for i, topic_dist_one_vid in enumerate(transformed_data_sub.select('topicDistribution').collect()):

    dict_topic_dist_one_vid = {}

    for j, prob in enumerate(topic_dist_one_vid['topicDistribution']):

        dict_topic_dist_one_vid[(i, j)] = prob

    # Fill data in to sparse matrix
    dict.update(S, dict_topic_dist_one_vid)

filename = 'transformed_data_last.npz'
scipy.sparse.save_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + filename, S.tocsr())

In [51]:
filename = 'transformed_data_last.npz'
scipy.sparse.save_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/' + filename, S.tocsr())

# Train the classifier

- Train set into train' and validation set, in order to do cross validation


In [2]:
data = scipy.sparse.load_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/transformed_data0.npz')

In [3]:
for i in range(1, 35):
    data_next = scipy.sparse.load_npz(
        '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/transformed_data' + str(i) + '.npz')
    data = scipy.sparse.vstack([data, data_next])

# Add last matrix
data_next = scipy.sparse.load_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/transformed_data_last.npz')
data = scipy.sparse.vstack([data, data_next])

In [2]:
# Load data features
data = scipy.sparse.load_npz(
    '/dlabdata1/youtube_large/olam/data/classifier/csr_matrices/transformed_data_final.npz')

# Load groundtruth
with open('/dlabdata1/youtube_large/olam/data/classifier/groundtruth.pickle', 'rb') as f:
    groundtruth = pickle.load(f)
f.close()

## Separate the data for building the model and testing it

In [3]:
# Load list of shuffled index
with open('/dlabdata1/youtube_large/olam/data/classifier/list_suffled_idx.pickle', 'rb') as f:
    index = pickle.load(f)
f.close()

In [4]:
index_model_data_threshold = int(0.8 * len(groundtruth))

In [5]:
training_model_threshold = int(0.6 * len(groundtruth))

In [6]:
list_train_idx = np.sort(index[:training_model_threshold])
list_val_idx = np.sort(index[training_model_threshold:index_model_data_threshold])
list_test_idx = np.sort(index[index_model_data_threshold:])

In [7]:
len(list_train_idx) + len(list_val_idx) + len(list_test_idx) == len(groundtruth)

True

In [20]:
### too much memory consumption
# X = data[:index_model_data_threshold]
# X_test = data[index_model_data_threshold:]

# y = groundtruth[:index_model_data_threshold]
# y_test = groundtruth[index_model_data_threshold:]

## Logistic Classifier

In [8]:
alphas = np.logspace(0, 1, num=10, base=100) / 100
alphas_test = [0.1]

In [27]:
accuracies = []

for i, alpha in enumerate(alphas):

    start = time.time()
    clf = SGDClassifier(loss='log', alpha=alpha, max_iter=50, shuffle=True, n_jobs=10, random_state=1)
    clf.fit(data[list_train_idx],
            y=np.array(groundtruth)[list_train_idx])
    
    y_pred = clf.predict(data[list_val_idx])
    y_gt = np.array(groundtruth)[list_val_idx]
    
    score = accuracy_score(y_gt, y_pred)
    accuracies.append(score)
    print('time for iter ' + str() + ': ' + str((time.time() - start) / 60))
    print('score: ' + str(score))
    print('')

time for iter : 5.230132786432902
score: 0.35073358812759964

time for iter : 6.296019558111826
score: 0.29902835078364703

time for iter : 7.983751229445139
score: 0.2731599832386995

time for iter : 8.08760746717453
score: 0.2595327084405522

time for iter : 8.459673058986663
score: 0.24998495420171493

time for iter : 8.297085611025492
score: 0.24647942381623314

time for iter : 7.691592129071553
score: 0.2524392473163639

time for iter : 6.180642422040304
score: 0.2580585014384908

time for iter : 5.008745809396108
score: 0.30795852421811065

time for iter : 4.184794783592224
score: 0.3873328439933517



## SVM Classifier

In [26]:
accuracies = []

for i, alpha in enumerate(alphas):

    start = time.time()
    clf = SGDClassifier(loss='hinge', alpha=alpha, max_iter=20, shuffle=True, n_jobs=10, random_state=1)
    clf.fit(data[list_train_idx],
            y=np.array(groundtruth)[list_train_idx])
    
    y_pred = clf.predict(data[list_val_idx])
    y_gt = np.array(groundtruth)[list_val_idx]
    
    score = accuracy_score(y_gt, y_pred)
    accuracies.append(score)
    print('time for iter ' + str() + ': ' + str((time.time() - start) / 60))
    print('score: ' + str(score))
    print('')

time for iter : 2.728762944539388
score: 0.5270121294444866

time for iter : 2.608687388896942
score: 0.5270335029149477

time for iter : 2.6261629541714986
score: 0.5261884071421139

time for iter : 2.636353937784831
score: 0.5240769332444647

time for iter : 2.7797876954078675
score: 0.5053132760186849





time for iter : 4.567264437675476
score: 0.24036014297726818

time for iter : 9.345357275009155
score: 0.24359203669487403

time for iter : 9.68878736893336
score: 0.25403578931382725

time for iter : 8.74266619682312
score: 0.2910329854519786

time for iter : 7.858613804976145
score: 0.3837508190820094



In [9]:
best_alpha = alphas[np.argmax(accuracies)]

In [10]:
best_alpha

0.027825594022071246

In [11]:
clf = SGDClassifier(loss='hinge', alpha=best_alpha, max_iter=20, shuffle=True, n_jobs=10, random_state=1)
clf.fit(data[list_train_idx],
        y=np.array(groundtruth)[list_train_idx])

SGDClassifier(alpha=0.027825594022071246, max_iter=20, n_jobs=10,
              random_state=1)

In [12]:
Y_pred_train = clf.predict(data[list_train_idx])
y_gt_train = np.array(groundtruth)[list_train_idx]

score_train = accuracy_score(y_gt_train, Y_pred_train)

print('Accuracy on training set of the best model: ' + str(score_train))

Accuracy on training set of the best model: 0.5263785678817267


In [13]:
y_pred = clf.predict(data[list_test_idx])
y_gt = np.array(groundtruth)[list_test_idx]

score_test = accuracy_score(y_gt, y_pred)

print('Accuracy on test set of the best model: ' + str(score_test))

Accuracy on test set of the best model: 0.5264752616140908


In [14]:
len(y_pred)

3555810

In [18]:
with open('/dlabdata1/youtube_large/olam/data/classifier/results/y_pred.pickle', 'wb') as f:
    pickle.dump(y_pred, f)
f.close()

In [19]:
with open('/dlabdata1/youtube_large/olam/data/classifier/results/y_gt.pickle', 'wb') as f:
    pickle.dump(y_gt, f)
f.close()

In [2]:
with open('/dlabdata1/youtube_large/olam/data/classifier/results/y_pred.pickle', 'rb') as f:
    y_pred = pickle.load(f)
f.close()

with open('/dlabdata1/youtube_large/olam/data/classifier/results/y_gt.pickle', 'rb') as f:
    y_gt = pickle.load(f)
f.close()

In [3]:
len(y_pred) == len(y_gt)

True

In [9]:
# Print the precision and recall, among other metricss
print(classification_report(y_gt, y_pred, digits=3))

                       precision    recall  f1-score   support

                           0.000     0.000     0.000        52
     Autos & Vehicles      0.567     0.634     0.598    115117
               Comedy      0.257     0.032     0.058     67846
            Education      0.375     0.098     0.155    152130
        Entertainment      0.518     0.385     0.442    726320
     Film & Animation      0.199     0.043     0.071    136741
               Gaming      0.552     0.950     0.698    828637
        Howto & Style      0.486     0.536     0.510    215770
                Music      0.546     0.757     0.635    400809
      News & Politics      0.542     0.683     0.604    199635
Nonprofits & Activism      0.021     0.000     0.001     19620
       People & Blogs      0.253     0.056     0.092    289254
       Pets & Animals      0.126     0.046     0.068     25219
 Science & Technology      0.540     0.311     0.395    122792
                Shows      0.000     0.000     0.000  