In [25]:
import pandas as pd
import numpy as np
import re

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [26]:
talks = pd.read_pickle('../datasets/videosData.pkl')
talks['views'] = talks.views.astype(int)
talks['likes'] = talks.likes.astype(int)
talks['dislikes'] = talks.dislikes.astype(int)
print(len(talks))
talks.head(2)
import datetime
talks.loc[:,'viewsWeek'] = round(talks.views / talks.publishDate.apply(lambda x: round((datetime.datetime.now() - x).days/7)), 6)

78391


In [27]:
def getEvent(x):
    matchs = re.findall(r'(TEDx[^!\d ]+|TEDx [^!\d ]+)', x, re.IGNORECASE)
    if(len(matchs) > 0):
        return matchs[0].replace(' ','')
    else:
        return np.nan

talks['event'] = talks.titleRaw.apply(getEvent)
talks = talks[~pd.isna(talks.event)]

In [28]:
eventsDf = pd.read_pickle('../datasets/eventsParsed.pkl')
eventsDf = eventsDf.drop_duplicates(subset='name')
print(len(eventsDf), 'events')
eventsDf.index = eventsDf.name
eventsDf.head(5)

10419 events


Unnamed: 0_level_0,name,eventType,date,city,country,hasWebcast,hasSpace,eventCode,shortName
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TEDxRiodelaPlata,TEDxRiodelaPlata,Standard,"May 7, 2019",Ciudad Autónoma de Buenos Aires,Argentina,True,False,32790,RiodelaPlata
TEDxEsquel,TEDxEsquel,Standard,"May 4, 2019",Esquel,Argentina,True,False,33511,Esquel
TEDxRafaela,TEDxRafaela,Standard,"May 4, 2019",Rafaela,Argentina,True,False,31229,Rafaela
TEDxBarilocheLive,TEDxBarilocheLive,TEDxLive,"April 27, 2019",San Carlos de Bariloche,Argentina,True,False,34453,Bariloche
TEDxRosarioLive,TEDxRosarioLive,TEDxLive,"April 18, 2019",Rosario,Argentina,True,False,34383,Rosario


In [29]:
talks.event = talks.event.replace('TEDxRíodelaPlata', 'TEDxRiodelaPlata')
talks.event = talks.event.replace('TEDxBahíaBlanca', 'TEDxBahiaBlanca')

In [30]:
print(len(talks[~talks.event.isin(eventsDf.index.values)].event.unique()), 'charlas sin evento')

332 charlas sin evento


In [31]:
talks = talks[talks.event.isin(eventsDf.index.values)]
moreData = eventsDf.loc[talks.event].drop(['date','hasWebcast','hasSpace'], axis=1)
moreData.index = talks.index
talks = talks.join(moreData, how='inner')
talks.loc[:, 'year'] = talks.publishDate.dt.year
talks.loc[:, 'month'] = talks.publishDate.dt.month

In [34]:
#add clasification
import pickle
with open('../models/clsTalksCat.pkl', 'rb') as pkl_mod:
    cls = pickle.load(pkl_mod)
with open('../models/vectorsTalks.pkl', 'rb') as pkl_vec:
    vectorizer = pickle.load(pkl_vec)

In [36]:
X = vectorizer.transform(talks.nouns)
proba = cls.transform(X)
cats = proba.argsort(axis=1)[:, -1]

In [38]:
probaScore = [round(proba[i][cats[i]], 4) for i in range(len(cats))]

In [40]:
talks['cat'] = cats
talks['catProbaScore'] = probaScore
nombres_grupos = [
    'art and comunication',
    'performance',
    'life, motivation',
    'gender',
    'education',
    'brain',
    'technology',
    'cities',
    'food'
]

talks['catName'] = talks.cat.replace({i:nombres_grupos[i] for i in range(len(nombres_grupos))})

In [46]:
print(len(talks))
talks = talks.loc[~talks.index.duplicated(keep='first')]
print(len(talks))

78798
76633


In [57]:
talks.drop(['titleRaw', 'publishedTimeRaw', 'shortName', 'catProbaScore', 'catName', 'currentDate', 'name'], axis=1)

Unnamed: 0,title,speaker,event,videoId,likes,dislikes,lengthSeconds,keywords,views,videoDescription,...,text,nouns,viewsWeek,eventType,city,country,eventCode,year,month,cat
---M5RE8nJo,El camino a mi trabajo ideal,Eduardo Molina,TEDxAnahuacUniversity,---M5RE8nJo,65,5,798,"[TEDxTalks, Spanish, Life, Decision making, Li...",3770,"Para ser feliz, debes de seguir tus sueños. Pa...",...,how how please Hello good afternoon a few ...,hello afternoon months opportunity kids prepa ...,171.363636,University,Mexico City,Mexico,31446,2019,8,2
--9TX7D0Fu0,Como ser un depresivo exitoso y no morir en el...,Samantha Isabel Aranda Ramírez,TEDxMérida,--9TX7D0Fu0,35,0,819,"[TEDxTalks, Spanish, Health, Depression, Life,...",720,"Comunicóloga de profesión, actriz en diferente...",...,"i [Music] i [Music] Hi, I'm Sam 'and I'm g...",i music ] i music ] hi story person life memor...,40.000000,University,Mérida,Mexico,31751,2019,9,2
--KpJ1fatzY,Defusing Stress with Mindful Mojo,Angela & Dennis Buttimer,TEDxGeorgiaStateU,--KpJ1fatzY,60,0,1007,"[TEDxTalks, English, United States, Social Sci...",1545,Stress is an epidemic in America. It hijacks ...,...,we've all experienced grief and lossdeath is a...,grief lossdeath part life imagine family in201...,11.616541,University,Atlanta,United States,24817,2017,6,5
--Nw62FG7-w,The Future of Medical School,"John Tomkowiak, MD",TEDxSpokane,--Nw62FG7-w,20,0,837,"[TEDxTalks, English, Health, Education, Higher...",1829,What will the health-care professionals of the...,...,[Applause]so I was told you all want to apply ...,[ applause ] school physicianswell future heal...,16.935185,Standard,Spokane,United States,27615,2017,12,6
--Pkf5htGFM,It's Funny How it Happened,Joey Commisso,TEDxWestVancouverED,--Pkf5htGFM,11,1,264,"[TEDxTalks, English, Canada, Life, Comedy, Hum...",677,After a life alternating event and being diagn...,...,I've been trying to figure out how theguy who ...,theguy classes grade tedx conferencein west va...,4.128049,Education,West Vancouver,Canada,27849,2016,11,7
--Pp_9hlHCM,What's your funniest joke?,George Vivian Paul,TEDxMACE,--Pp_9hlHCM,39,2,1235,"[TEDxTalks, English, Entertainment, Comedy, Hu...",621,His talk and narratives make you revisit your ...,...,so you guys have been hearing TED talkserious ...,ted talks morning name george babyand paul col...,38.812500,University,Kochi,India,31602,2019,9,0
--Zfz4g4-MM,Computer Science Education,Pranav Rajan,TEDxYouth@Lincoln,--Zfz4g4-MM,21,33,583,"[TEDxTalks, English, Education, Arts education...",1519,Computer science has become an influential sub...,...,[Music]the scientific and creative mind areine...,music mind look minds darwin einstein da vinci...,24.111111,Youth,Lincoln,United States,28685,2018,10,4
--aweILQMNI,Listening to the Voice of the Future,Iain White,TEDxRuakura,--aweILQMNI,14,1,942,"[TEDxTalks, English, New Zealand, Technology, ...",983,"Every major problem affecting society, from cl...",...,mm when I was young growing up in the1970s and...,mm the1970s aids years year thisimage cars hov...,5.715116,Standard,Hamilton,New Zealand,29596,2016,9,6
--etmwRTxc8,Se busca trabajo,Romina Masciangioli,TEDxInstitutoZonaOeste,--etmwRTxc8,30,3,969,"[TEDxTalks, Spanish, Argentina, Social Science...",2810,Llega un momento en la vida donde por distinto...,...,[Music] to come as I present candela I have ...,music ] resources years fact years people some...,17.562500,Youth,Rosario,Argentina,22384,2016,12,6
--mY5ruEhqI,The Power of Zero Tolerance,Isabelle Mercier,TEDxStanleyPark,--mY5ruEhqI,41286,920,1206,"[TEDxTalks, English, Canada, Life, Behavior, C...",2424155,95% of North Americans either go to bed or wak...,...,Translator: Hamzeh Koumakli Auditor: Ayman Mah...,translator hamzeh koumakli auditor ayman mahmo...,13033.091398,Standard,Vancouver,Canada,23555,2016,6,2


In [48]:
talks.to_pickle('../datasets/talksDf.pkl')

In [None]:
def getMostDisliked(df):
#     display(df.groupby('event').sum().sort_values('dislikes', ascending=False).index.values[0])
    return df.groupby('event').sum().sort_values('dislikes', ascending=False)

talks.groupby([talks.publishDate.dt.year]).apply(getMostDisliked).reset_index().drop_duplicates(subset='publishDate', keep='first')

In [None]:
talks.sort_values(by=['year', 'dislikes']).drop_duplicates(subset='year', keep='last').\
loc[:, ['titleRaw', 'likes','dislikes','year']].iloc[4:]

In [None]:
from tqdm import tqdm

def only_nouns(texts):
    output = []
    NOUN_CODES = ['NN', 'NNP', 'NNS', 'NNPS']
    for doc in tqdm(texts):
        noun_text = ' '.join([cat[0] for cat in nltk.pos_tag(word_tokenize(doc)) if cat[1] in NOUN_CODES]).lower()
        output.append(noun_text)
    return output