In [2]:
# Import Dependencies
import pandas as pd


In [3]:
# Create a path to the csv and read it into a Pandas DataFrame
csv_path = "Resources/ted_talks.csv"
ted_df = pd.read_csv(csv_path)

ted_df.head()

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869


In [4]:
# Figure out the minimum and maximum views for a TED Talk
print(ted_df["views"].max())
print(ted_df["views"].min())

47227110
50443


In [5]:
# Create bins in which to place values based upon TED Talk views
bins = [0, 199999, 399999, 599999, 799999, 999999,
        1999999, 2999999, 3999999, 4999999, 50000000]

# Create labels for these bins
group_labels = ["0 to 199k", "200k to 399k", "400k to 599k", "600k to 799k", "800k to 999k", "1mil to 2mil",
                "2mil to 3mil", "3mil to 4mil", "4mil to 5mil", "5mil to 50mil"]

In [7]:
# alternative way to create bins
start = ted_df['views'].min()
end = ted_df['views'].max()
step = (ted_df['views'].max() - ted_df['views'].min()) / 10

bins = [x for x in range(start, end, int(round(step,0)))]
bins.append(end)
bins

[50443,
 4768110,
 9485777,
 14203444,
 18921111,
 23638778,
 28356445,
 33074112,
 37791779,
 42509446,
 47227110]

In [9]:
# Slice the data and place it into bins
pd.cut(ted_df["views"], bins, labels=group_labels).head()

0    5mil to 50mil
1        0 to 199k
2        0 to 199k
3        0 to 199k
4     400k to 599k
Name: views, dtype: category
Categories (10, object): [0 to 199k < 200k to 399k < 400k to 599k < 600k to 799k ... 2mil to 3mil < 3mil to 4mil < 4mil to 5mil < 5mil to 50mil]

In [10]:
# Place the data series into a new column inside of the DataFrame
ted_df["View Group"] = pd.cut(ted_df["views"], bins, labels=group_labels)
ted_df.head()

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views,View Group
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110,5mil to 50mil
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520,0 to 199k
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292,0 to 199k
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550,0 to 199k
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869,400k to 599k


In [11]:
# Create a GroupBy object based upon "View Group"
ted_group = ted_df.groupby("View Group")

# Find how many rows fall into each bin
print(ted_group["comments"].count())

# Get the average of each column within the GroupBy object
ted_group[["comments", "duration", "languages"]].mean()

View Group
0 to 199k        2435
200k to 399k       78
400k to 599k       15
600k to 799k       11
800k to 999k        6
1mil to 2mil        0
2mil to 3mil        1
3mil to 4mil        1
4mil to 5mil        0
5mil to 50mil       2
Name: comments, dtype: int64


Unnamed: 0_level_0,comments,duration,languages
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 to 199k,172.064887,823.062012,26.758111
200k to 399k,434.897436,844.410256,38.0
400k to 599k,779.6,910.0,42.8
600k to 799k,724.909091,950.818182,41.727273
800k to 999k,866.0,861.666667,42.166667
1mil to 2mil,,,
2mil to 3mil,1927.0,1219.0,52.0
3mil to 4mil,1930.0,1084.0,45.0
4mil to 5mil,,,
5mil to 50mil,3421.5,1213.0,55.5


In [14]:
ted_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f936533dfd0>