In [1]:
import pandas as pd
import numpy as np
import json
import feature_engineering as fe

In [2]:
# Load the TED talks dataset from the hosted CSV, then preview the first rows.
ted_data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/ngb0330/Projects/master/Datasets/ted_talks.csv'
)
ted_data.head(n=5)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [3]:
# One boolean per column: True when that column holds at least one missing value.
ted_data.isna().any(axis=0)

comments              False
description           False
duration              False
event                 False
film_date             False
languages             False
main_speaker          False
name                  False
num_speaker           False
published_date        False
ratings               False
related_talks         False
speaker_occupation     True
tags                  False
title                 False
url                   False
views                 False
dtype: bool

In [4]:
def add_ratings_dummies(original_df, column, key1, key2):
    """Run a column through the dict-list helpers of ``feature_engineering``.

    Three helpers are chained in order: ``fe.str_to_dict_list`` on the input
    frame, ``fe.get_key_set`` on that result, and
    ``fe.add_dict_list_key_counts``, which receives the converted frame, the
    key set and both key names. The helpers are defined outside this
    notebook; judging only by their names, the column's string values are
    parsed into lists of dicts and per-key count columns are added --
    confirm in ``feature_engineering``.

    Parameters
    ----------
    original_df : frame handed to ``fe.str_to_dict_list``.
    column : name of the column to process.
    key1 : dict key forwarded to ``fe.get_key_set`` and to the final helper.
    key2 : second dict key forwarded to ``fe.add_dict_list_key_counts``.

    Returns
    -------
    Whatever ``fe.add_dict_list_key_counts`` returns (presumably a DataFrame).
    """
    parsed_df = fe.str_to_dict_list(original_df, column)
    distinct_keys = fe.get_key_set(parsed_df, column, key1)
    with_key_counts = fe.add_dict_list_key_counts(
        parsed_df, column, distinct_keys, key1, key2
    )
    return with_key_counts

In [5]:
# Build the ratings feature frame from the 'ratings' column, keyed on each
# entry's 'name' and 'count' fields.
ratings_df = add_ratings_dummies(
    original_df=ted_data,
    column='ratings',
    key1='name',
    key2='count',
)

In [6]:
def add_tags_dummies(original_df, column, sep, threshhold):
    """Add dummy columns for the values of a list-like string column that are
    frequent enough.

    The column is first converted with ``fe.string_to_list``; its values are
    then collected and counted through ``fe.get_values_list``,
    ``fe.get_counts_dict`` and ``fe.get_counts_data_frame``. From the
    resulting counts frame, the ``'Key'`` entries whose ``'Value_Percentage'``
    is at least ``threshhold * 100`` are kept, ordered by descending
    percentage, and handed to ``fe.add_list_dummies``.

    Parameters
    ----------
    original_df : source frame; passed to ``fe.string_to_list`` and to
        ``fe.get_counts_data_frame``.
    column : name of the column to process.
    sep : separator forwarded to ``fe.string_to_list``.
    threshhold : minimum frequency as a fraction; it is multiplied by 100
        before being compared with ``'Value_Percentage'`` and is also
        forwarded unchanged to ``fe.add_list_dummies``.

    Returns
    -------
    Whatever ``fe.add_list_dummies`` returns (presumably a DataFrame).
    """
    new_df = fe.string_to_list(original_df, column, sep)
    values_list = fe.get_values_list(new_df, column)
    counts_dict = fe.get_counts_dict(values_list)
    counts_df = fe.get_counts_data_frame(original_df, counts_dict)

    # Sort into a new frame instead of in place, so the frame handed back by
    # the helper is never mutated behind its back.
    ranked_counts = counts_df.sort_values('Value_Percentage', ascending=False)
    is_frequent = ranked_counts['Value_Percentage'] >= (threshhold * 100)
    tags_list = list(ranked_counts.loc[is_frequent, 'Key'].values)

    return fe.add_list_dummies(new_df, column, tags_list, threshhold)

In [7]:
# Build the tag dummy frame from the comma-separated 'tags' column.
# add_tags_dummies scales the 0.08 threshold by 100 before comparing it with
# each tag's percentage.
tags_df = add_tags_dummies(
    original_df=ted_data,
    column='tags',
    sep=',',
    threshhold=0.08,
)

In [8]:
# Derive a feature frame from the 'title' column via fe.string_column_length
# (presumably the length of each title string -- confirm in feature_engineering).
title_df = fe.string_column_length(ted_data, 'title')

In [9]:
def make_days_between_df(data_frame, column_1, column_2):
    """Convert two timestamp columns and derive a days-between feature frame.

    Each column is passed through ``fe.unix_to_datetime`` (``column_1`` first,
    then ``column_2``), the two results are combined with an outer ``join``,
    and the combined frame is handed to ``fe.add_days_between`` together with
    both column names. The helpers live outside this notebook; by their names
    they convert Unix timestamps to datetimes and add the day difference
    between the two columns -- confirm in ``feature_engineering``.

    Parameters
    ----------
    data_frame : frame holding both timestamp columns.
    column_1 : name of the first timestamp column.
    column_2 : name of the second timestamp column.

    Returns
    -------
    Whatever ``fe.add_days_between`` returns (presumably a DataFrame).
    """
    first_dates, second_dates = [
        fe.unix_to_datetime(data_frame, name) for name in (column_1, column_2)
    ]
    paired_dates = first_dates.join(second_dates, how='outer')
    return fe.add_days_between(paired_dates, column_1, column_2)

In [10]:
# Feature frame built from the publication and filming timestamps of each talk.
days_between_df = make_days_between_df(
    data_frame=ted_data,
    column_1='published_date',
    column_2='film_date',
)

In [11]:
from date_lib import add_datepart
def generate_new_date_columns(data_frame, column_1, column_2):
    """Convert two timestamp columns and run ``add_datepart`` on each of them.

    Both columns go through ``fe.unix_to_datetime`` (``column_1`` first), the
    results are combined with an outer ``join``, and ``add_datepart`` is then
    called once per column of the combined frame. Its return value is
    ignored, so it is expected to modify the frame it receives (presumably
    replacing each date column with derived date-part columns -- confirm in
    ``date_lib``).

    Parameters
    ----------
    data_frame : frame holding both timestamp columns.
    column_1 : name of the first timestamp column.
    column_2 : name of the second timestamp column.

    Returns
    -------
    The combined frame after every ``add_datepart`` call.
    """
    first_dates = fe.unix_to_datetime(data_frame, column_1)
    second_dates = fe.unix_to_datetime(data_frame, column_2)
    expanded_dates = first_dates.join(second_dates, how='outer')

    # Fix the set of columns to visit before add_datepart starts changing the frame.
    date_columns = tuple(expanded_dates.columns)
    for date_column in date_columns:
        add_datepart(expanded_dates, date_column)

    return expanded_dates

In [12]:
# Date-part feature frame built from the publication and filming timestamps.
dates_df = generate_new_date_columns(
    data_frame=ted_data,
    column_1='published_date',
    column_2='film_date',
)

In [13]:
# Keep only the numeric columns. select_dtypes is the public pandas API for
# this; the previous call used the private DataFrame._get_numeric_data().
# For this dataset's integer columns the selection is the same.
numeric_ted_data = ted_data.select_dtypes(include='number')

In [14]:
# Show which columns survived the numeric selection.
numeric_ted_data.columns

Index(['comments', 'duration', 'film_date', 'languages', 'num_speaker',
       'published_date', 'views'],
      dtype='object')

In [15]:
# Reorder the numeric columns so that 'views' comes first; the remaining
# columns keep their previous relative order.
numeric_ted_data = numeric_ted_data.loc[:, [
    'views',
    'comments',
    'duration',
    'film_date',
    'languages',
    'num_speaker',
    'published_date',
]]

In [16]:
# Preview the reordered numeric frame.
numeric_ted_data.head(n=5)

Unnamed: 0,views,comments,duration,film_date,languages,num_speaker,published_date
0,47227110,4553,1164,1140825600,60,1,1151367060
1,3200520,265,977,1140825600,43,1,1151367060
2,1636292,124,1286,1140739200,26,1,1151367060
3,1697550,200,1116,1140912000,35,1,1151367060
4,12005869,593,1190,1140566400,48,1,1151440680


In [17]:
# Feature frames to merge onto the numeric columns. dates_df is kept last on
# purpose: a later cell drops it with df_list[:-1].
df_list = [
    ratings_df,
    tags_df,
    title_df,
    days_between_df,
    dates_df,
]

In [18]:
# Merge every feature frame in df_list (including dates_df) onto the numeric
# columns; "outer" is forwarded to fe.merge_dataframes (presumably the join
# type -- confirm in feature_engineering).
merged_with_dates = fe.merge_dataframes(numeric_ted_data, df_list, "outer")

In [19]:
# Same merge without the date-part features: df_list[:-1] omits the final
# entry of df_list, which is dates_df.
merged_no_dates = fe.merge_dataframes(numeric_ted_data, df_list[:-1], "outer")

In [20]:
# Persist the merged frame (with date-part features) in Feather format to the
# relative path 'ted_data_with_dates' (no file extension).
merged_with_dates.to_feather('ted_data_with_dates')

In [21]:
# Persist the merged frame (without date-part features) in Feather format to
# the relative path 'ted_data_no_dates' (no file extension).
merged_no_dates.to_feather('ted_data_no_dates')