# Demo de extracción, análisis y visualización de datos de vídeos de yt

## Requisitos:

* `API_KEY` de youtube
* `KEY` de ngrok para ver visualización

## Configuración

In [1]:
# YouTube API key needed to make the requests
# https://developers.google.com/youtube/v3/getting-started
API_KEY = 'XXXXXXXXX'

# Username of the YouTube channel to analyse
USUARIO = 'xxxxxx'  

## Instalación librerías

In [None]:
# No necesario en colab
! pip install google-api-python-client

In [2]:
from apiclient.discovery import build
from apiclient.errors import HttpError
import json
import os

## Descarga de datos

In [4]:
# YouTube API client, authenticated with the configured API key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

yt = build(
    serviceName=YOUTUBE_API_SERVICE_NAME,
    version=YOUTUBE_API_VERSION,
    developerKey=API_KEY,
)



In [5]:
#@title Funciones auxiliares de extracción de datos
def get_channel_id_from_username(yt, username):
    """Return the channelId of the first channel found when searching *username*.

    Runs a search restricted to channels and takes the first result.

    Parameters:
        yt: API client exposing ``search().list(...).execute()``.
        username (str): text used as the search query.
    Returns:
        The ``channelId`` stored in the snippet of the first search result.
    Raises:
        IndexError: if the search returns no items.
    """
    res = yt.search().list(
        q=username,
        part='snippet',
        type="channel"
    ).execute()

    items = res['items']
    if not items:
        # Same exception type as indexing the empty list, but the message names the cause
        raise IndexError(f'No channel found for username {username!r}')
    return items[0]['snippet']['channelId']

def get_playlists(youtube, channelId):
    """Return the playlists of a channel, following pagination.

    The first response is returned with the ``items`` of every further page
    appended to its own ``items``. One request is made per page of up to
    50 playlists.

    Parameters:
        youtube: API client exposing ``playlists().list(...).execute()``.
        channelId: identifier of the channel whose playlists are listed.
    Returns:
        The first response dict, with ``items`` holding the playlists of all pages.
    """
    query = {
        'part': "snippet,status,contentDetails",
        'channelId': channelId,
        'maxResults': "50",
    }

    res = youtube.playlists().list(**query).execute()
    token = res.get('nextPageToken')

    # Keep requesting pages for as long as the last page announced another one
    while token:
        page = youtube.playlists().list(pageToken=token, **query).execute()
        res['items'] = res['items'] + page['items']
        token = page.get('nextPageToken')

    return res

def fetch_youtube_videos_playlist(youtube, playlistId):
    """
    Fetch all the items of a playlist, following pagination.

    The items of every further page are concatenated onto the ``items`` of
    the first response, which is the dict that gets returned.

    Parameters:
        youtube - API client exposing ``playlistItems().list(...).execute()``
        playlistId - (string) identifier of the playlist to fetch
    Returns:
        The first response dict with the items of all pages; when more than
        one page was fetched its ``nextPageToken`` key is removed.
    """
    query = {
        'part': "snippet,status,contentDetails",
        'playlistId': playlistId,
        'maxResults': "50",
    }

    res = youtube.playlistItems().list(**query).execute()
    token = res.get('nextPageToken')

    # The token key is kept in `res` only while there are pages left to fetch
    while 'nextPageToken' in res:
        page = youtube.playlistItems().list(pageToken=token, **query).execute()
        res['items'] = res['items'] + page['items']

        if 'nextPageToken' in page:
            token = page['nextPageToken']
        else:
            res.pop('nextPageToken', None)

    return res

def todos_los_videos_playlist(yt, playlists):
    """
    Collect the items of every playlist of a channel into one flat list.

    Each entry of ``playlists['items']`` is fetched through
    ``fetch_youtube_videos_playlist`` using its ``id``; the results are
    concatenated in playlist order.
    """
    return [
        video
        for playlist in playlists['items']
        for video in fetch_youtube_videos_playlist(yt, playlist.get('id'))['items']
    ]

In [6]:
# Resolve the channelId of the configured user
channelId = get_channel_id_from_username(yt, USUARIO)

# Fetch the playlists of the channel
playlists = get_playlists(yt, channelId)

# Fetch the videos of those playlists
videos = todos_los_videos_playlist(yt, playlists)

# Save the raw data as JSON; `with` closes (and flushes) each file as soon as it is written
with open(f'playlists_{USUARIO}_yt.json', 'w') as fh:
    json.dump(playlists, fh)
with open(f'videos_{USUARIO}_yt.json', 'w') as fh:
    json.dump(videos, fh)

## Análisis de texto y descripción de los vídeos (extracción de frases y temas clave)

In [7]:
# Por hacer

## Descarga, transcripción y análisis del contenido de los vídeos (extracción de frases y temas clave y posición en el vídeo)

In [None]:
# Por hacer

## Primera visualización de datos: [Exhibit](https://www.simile-widgets.org/exhibit)

### Preparación de json para exhibit

In [23]:
#@title Funciones auxiliares para transformar json al modelo de exhibit y generar el html

def _json_for_html(value):
    """Return *value* as JSON text that is safe to embed inside HTML markup."""
    # A str is taken to be JSON that is already serialized; dumping it again
    # would embed a quoted string instead of the object it describes.
    text = value if isinstance(value, str) else json.dumps(value)
    # In valid JSON these characters can only occur inside string values, so
    # replacing them with \uXXXX escapes keeps the JSON equivalent while
    # preventing the text (e.g. video descriptions) from being parsed as markup.
    return text.replace('&', '\\u0026').replace('<', '\\u003c').replace('>', '\\u003e')


def crea_index(ruta, schema, data, template, title=None, description='Data visualization'):
    """
    Generate the Exhibit index.html with the JSON embedded in the HTML.

    Parameters:
        ruta: path of the HTML file to write.
        schema: Exhibit schema, either a JSON-serializable object or a JSON string.
        data: Exhibit data, either a JSON-serializable object or a JSON string.
        template: ``str.format`` template with the placeholders ``{data}``,
            ``{schema}``, ``{title}`` and ``{description}``.
        title: page title; when omitted it is built from the module-level ``USUARIO``.
        description: text for the ``{description}`` placeholder.
    """
    if title is None:
        title = f'Análisis de los videos de {USUARIO}'

    html = template.format(
        data=_json_for_html(data),
        schema=_json_for_html(schema),
        title=title,
        description=description,
    )

    # The template declares charset UTF-8, so write the file as UTF-8 on every platform
    with open(ruta, 'w', encoding='utf-8') as fh:
        fh.write(html)
    

def _set_thumbnail(item, snippet):
    """Store the 'medium' thumbnail url of *snippet* in item['imagen'], else the 'default' one."""
    for size in ('medium', 'default'):
        try:
            item['imagen'] = snippet['thumbnails'][size]['url']
            return
        except (KeyError, TypeError):
            continue
    print(item, 'no imagen')


def _set_published_date(item, *timestamps):
    """Store the first 10 characters of the first non-empty timestamp in item['publishedAt']."""
    for timestamp in timestamps:
        if timestamp:
            item['publishedAt'] = timestamp[:10]
            return


def _add_playlist_item(itemsd, v):
    """Add a playlist item to *itemsd*; raises KeyError/TypeError if *v* lacks that shape."""
    status = v.get('status')
    if (status and status['privacyStatus'] != 'public') or \
            v['snippet']['title'] == 'Deleted video':
        return

    d = v['snippet']
    video_id = d['resourceId']['videoId']
    if video_id in itemsd:
        # Video already seen in another playlist: only record the extra playlist
        itemsd[video_id].setdefault('playlistId', []).append(d.get('playlistId'))
        return

    k = {'url': video_id, 'label': d.get('title')}
    _set_thumbnail(k, d)
    k['playlistId'] = [d.get('playlistId')]
    k['position'] = d.get('position')
    k['type'] = 'video'
    k['id'] = v.get('id')
    details = v.get('contentDetails') or {}
    # Prefer the video's own publication date; fall back to the snippet date
    _set_published_date(k, details.get('videoPublishedAt'), d.get('publishedAt'))
    k['description'] = d.get('description')
    itemsd[video_id] = k


def _add_search_result(itemsd, v):
    """Add an entry whose 'id' is a dict of kind 'youtube#video'; ignore any other entry."""
    ident = v.get('id')
    if not isinstance(ident, dict) or ident.get('kind') != 'youtube#video' \
            or 'videoId' not in ident:
        return

    d = v.get('snippet') or {}
    video_id = ident['videoId']
    k = {'url': video_id, 'label': d.get('title'), 'type': 'video', 'id': video_id}
    _set_published_date(k, d.get('publishedAt'))
    k['description'] = d.get('description')
    _set_thumbnail(k, d)
    itemsd[video_id] = k


def crea_videos_exhibit(videos, salida=None):
    """
    Convert video entries into Exhibit items, one per distinct video id.

    Each entry is first read as a playlist item (video id under
    ``snippet.resourceId.videoId``). Entries that are not public or are titled
    'Deleted video' are skipped, and a video found in several playlists yields
    a single item whose ``playlistId`` lists all of them. Entries that cannot
    be read that way are reported and then read as entries whose ``id`` is a
    dict of kind 'youtube#video'; anything else is skipped instead of aborting
    the run. ``publishedAt`` is omitted when the entry carries no date.

    Parameters:
        videos: iterable of entry dicts.
        salida: optional path; when given the items are written there as
            ``{'items': [...]}`` JSON and nothing is returned.
    Returns:
        The list of item dicts, or None when *salida* is given.
    """
    itemsd = {}

    for v in videos:
        try:
            _add_playlist_item(itemsd, v)
        except (KeyError, TypeError, AttributeError) as e:
            print('Exception --> ', e, v)
            _add_search_result(itemsd, v)

    items = list(itemsd.values())
    if salida:
        with open(salida, 'w') as fh:
            json.dump({'items': items}, fh)
        return None
    return items

def crea_playlist_exhibit(playlists,salida=None, remove = None, separador=None):
    """
    Convert the public playlists of a playlists response into Exhibit items.

    Parameters:
        playlists: dict whose ``items`` list holds the playlist entries.
        salida: optional path; when given the items are written there as
            ``{'items': [...]}`` JSON and nothing is returned.
        remove: optional substring deleted from every playlist title.
        separador: optional separator; only the part of the title before its
            first occurrence is kept.
    Returns:
        The list of item dicts (``id``, ``label``, ``type``), or None when
        *salida* is given.
    """
    items = []
    for p in playlists.get('items') or []:
        if p['status']['privacyStatus'] != 'public':
            continue
        label = p['snippet']['title']
        if remove:
            # Strip the caller-supplied text, not the literal word 'remove'
            label = label.replace(remove, '').strip()
        if separador:
            label = label.split(separador)[0].strip()
        items.append({'id': p.get('id'), 'label': label, 'type': 'playlist'})

    if salida:
        with open(salida, 'w') as fh:
            json.dump({'items': items}, fh)
        return None
    return items

# HTML page for the Exhibit visualization, filled in with str.format().
# Placeholders: {title}, {description}, {schema} and {data}. Literal braces of
# the embedded JavaScript/CSS are doubled ({{ }}) so that format() keeps them.
template_index = '''<!DOCTYPE html>
<html lang="es">

<head>

    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1.0" />
    <title>{title}</title>
    <meta name="Description" content="{description}">

    <link href="#schema" type="application/json" rel="exhibit-data" />
    <link href="#data" type="application/json" rel="exhibit-data" />

    <!-- libs simile-exhibit -->

    <script src="//api.simile-widgets.org/exhibit/current/lib/jquery.min.js" type="text/javascript">
    </script>
    <!-- <link rel="exhibit-extension" href="/js/exhibit/extensions/time/time-extension.js" />
    -->
    <script src="//api.simile-widgets.org/exhibit/current/exhibit-api.js" type="text/javascript"></script>

    <!-- Bootstrap -->
    <link rel="stylesheet" href="//cdn.jsdelivr.net/bootstrap/3.3.0/css/bootstrap.min.css">
    <link rel="stylesheet" href="//cdn.jsdelivr.net/bootstrap/3.3.0/css/bootstrap-theme.min.css">
    <script src="//cdn.jsdelivr.net/bootstrap/3.3.0/js/bootstrap.min.js"></script>

    <script type="text/javascript">
        $(document).bind("dataload.exhibit", function() {{
            $("input").addClass("form-control");
        }});

        $(document).bind("scriptsLoaded.exhibit", function () {{
            Exhibit.FunctionUtilities.registerSimpleMappingFunction("yearOf",
                function (d) {{
                    return d.split('-')[0];
                }},
                "number");
        }});
    </script>

    <script type="text/javascript">

        function addYear(json) {{
            var items = json.items;
            for (var i = 0, item; item = items[i]; i++) {{
                items[i].year = Exhibit.DateTime.parseIso8601DateTime(item.publishedAt).getFullYear();
            }}
            return json;
        }}
    </script>


    <style>
        #main-content {{
            background: white;
        }}

        #title-panel {{
            padding: 0.25in 0.5in;
        }}

        #top-panels {{
            padding: 0.5em 0.5in;
            border-top: 1px solid #BCB79E;
            border-bottom: 1px solid #BCB79E;
            background: #FBF4D3;
        }}

        .exhibit-tileView-body {{
            list-style: none;
        }}

        .exhibit-collectionView-group-count {{
            display: none;
        }}

        table.nobelist {{
            border: 1px solid #ddd;
            padding: 0.5em;
        }}

        div.name {{
            font-weight: bold;
            font-size: 120%;
        }}

        .relationship {{
            color: #888;
        }}

        ddiv.video-thumbnail {{
            float: left;
            width: 12vw;
            height: 13em;
            border: 1px solid #BCB79E;
            background: #F0FFF0;
            padding: 1em;
            margin: 0.5em;
            text-align: center;
        }}

        div.video-timeline-lens {{
            padding: 1em;
            text-align: center;
        }}

        .card-content p {{
            font-size: 12px;
        }}

        .exhibit-thumbnailView-body{{
            display: flex;
            flex-wrap: wrap;
            align-items: top;
            justify-content: center;
        }}
        .video-thumbnail {{
            width: 15vw;
            height: auto;
            border: 1px solid #BCB79E;
            background: #F0FFF0;
            padding: 1em;
            margin: 0.5em;
            text-align: center;
        }}

        @media (max-width: 768px) {{
            h1, .h1 {{
                font-size: 24px
            }}
            .video-thumbnail {{
                width: 90vw;
                height: auto;
                border: 1px solid #BCB79E;
                background: #F0FFF0;
                padding: 1em;
                margin: 0.5em;
                text-align: center;
            }}
            .card-content {{
                font-size: 8px;
            }}
            div.exhibit-facet-body{{
                height: 5em;
            }}
            div.exhibit-facet-value {{
                font-size: 12px;
            }}
          }}

    </style>



<link rel="apple-touch-icon" sizes="180x180" href="static/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="static/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="static/favicon-16x16.png">
<link rel="manifest" href="site.webmanifest">
<link rel="mask-icon" href="static/safari-pinned-tab.svg" color="#5bbad5">
<meta name="msapplication-TileColor" content="#da532c">
<meta name="theme-color" content="#ffffff">



</head>

<body>
<header>
    <h1 class="text-center">{title}</h1>
    <hr>
</header>

        <div data-ex-role="exhibit-collection" data-ex-item-types="video"></div>
        <div class="container">
        <button class="visible-xs"  data-toggle="collapse" data-target="#controles"> <span class="glyphicon glyphicon-sort"></span></button>

        <div class="row collapse in" id="controles">
            <div class="col-md-4" data-ex-role="exhibit-facet" data-ex-facet-class="TextSearch" data-ex-facet-label="Buscar" 
            data-ex-expressions=".label, .playlistId.label"></div>
            <div class="col-md-4" data-ex-role="facet" data-ex-expression="yearOf(.publishedAt)" 
            data-ex-collapsible="true" data-ex-facet-label="Año de publicación"
            data-ex-sort-direction="reverse"></div>
            <div class="col-md-4" data-ex-role="facet" data-ex-expression=".playlistId" data-ex-facet-label="Playlists"
            data-ex-collapsible="true" data-ex-missing-label="Fuera de playlist" ></div>
        </div>


        <div data-ex-role="viewPanel" style="padding: 1em 0.5in;">

            <div data-ex-role="view" data-ex-view-class="Thumbnail" data-ex-showall="false" 
            data-ex-orders=".publishedAt, .label" data-ex-grouped="false"
                data-ex-paginate="true" data-ex-page-size="20" 
                data-ex-directions="descending"
                data-ex-show-controls="false" data-ex-possible-orders=".label ">

                <div data-ex-role="exhibit-lens" style="display: none;" class="video-thumbnail">

                    <div>
                        <a target="_blank" data-ex-href-content="concat('https://www.youtube.com/watch?v=', .url)">
                            <img data-ex-src-content=".imagen" alt="Imagen" class="img-responsive">
                        </a>

                    </div>
                    <div class="card-content">
                        <a target="_blank" data-ex-href-content="concat('https://www.youtube.com/watch?v=', .url)">
                        <p data-ex-content=".label"></p>
                        </a>
                    </div>


                </div>
            </div>
            <!--
            <div data-ex-role="view" 
                data-ex-view-class="Timeline" 
                data-ex-start=".publishedAt" >
                    <div data-ex-role="lens" class="video-timeline-lens" style="display: none;">
                            <img data-ex-src-content=".imagen"  alt="portrait"/>
                            <div><span data-ex-content=".label"></span></div>

                    </div>

            </div>
            -->


        </div>


    </div>



    <!--  Scripts-->
    <div id="schema" style="display:none">
        {schema}
    </div>
    <div id="data" style="display:none">
            {data}
    </div>

</body>

</html>'''

# Exhibit schema kept as a JSON string: it declares the "video" and "playlist"
# item types and the value types of the "imagen", "playlistId" and
# "publishedAt" properties.
schema = '''{
    "types": {
        "video": {
            "pluralLabel": "Vídeos"
        },
        "playlist": {
            "pluralLabel": "Playlists"
        }
    },
    "properties": {
        "imagen": {
            "valueType": "url"
        },
        "playlistId": {
            "valueType":              "item"
        },
        
        "publishedAt": {
            "valueType":              "date",
            "label":                  "Fecha de publicación"
        }
        
    }
}'''



In [20]:
# Convert the downloaded videos and playlists into Exhibit items
videos_exhibit = crea_videos_exhibit(videos)
playlists_exhibit = crea_playlist_exhibit(playlists)

In [25]:
# Write index.html with the video and playlist items embedded as Exhibit data
crea_index('index.html', schema = schema, data = {"items": videos_exhibit + playlists_exhibit}, 
           template= template_index)

### Visualización con ngrok

In [None]:
# configuración de ngrok para servir archivo
! pip install pyngrok

In [27]:
!ngrok authtoken <TOKEN_NGROK>

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [28]:
# configurar output colab 
# Configure the Colab output for port 8000 (the port later passed to http.server)
from google.colab import output
output.serve_kernel_port_as_window(8000)

<IPython.core.display.Javascript object>

In [34]:
from pyngrok import ngrok
port = 8000

# Open an ngrok tunnel to the local port and print its public URL
public_url = ngrok.connect(port).public_url
print("ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

ngrok tunnel "http://7086-35-247-63-81.ngrok.io" -> "http://127.0.0.1:8000"


In [None]:
# Ejecuta la siguiente línea y visita con el navegador la url pública de ngrok

!python -m http.server 8000