In [13]:
# https://www.youtube.com/watch?v=W9XjRYFkkyw&list=PL-osiE80TeTsWmV9i9c58mdDCSskIFdDS&index=3
import pandas as pd
import os

# Load the news articles and the schema that describes their columns.
# No index_col is passed, so pandas assigns a default integer index and the
# files' first, unnamed column (it looks like a previously saved index) is
# read in as an ordinary "Unnamed: 0" data column.
data_path = 'data/news.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

schema_path = 'data/news_schema.csv'
schema_df = pd.read_csv(filepath_or_buffer=schema_path)
# Small hand-built frame for the indexing demos: one list per column, aligned
# by position so that each row describes one person.
test_dict = dict(
    first=["Dave", "Jane", "John"],
    last=["Zimmer", "Fonda", "Smith"],
    email=["DaveZimmer@mail.com", "JaneFonda@mail.com", "JohnSmith@mail.com"],
)
test_df = pd.DataFrame.from_dict(test_dict)

In [14]:
# Show the freshly built frame; with no index specified, rows are labelled 0, 1, 2.
test_df

Unnamed: 0,email,first,last
0,DaveZimmer@mail.com,Dave,Zimmer
1,JaneFonda@mail.com,Jane,Fonda
2,JohnSmith@mail.com,John,Smith


In [15]:
# Bracket access with a column label returns that single column as a Series.
test_df['email']

0    DaveZimmer@mail.com
1     JaneFonda@mail.com
2     JohnSmith@mail.com
Name: email, dtype: object

In [16]:
# Use the email column as the row labels. Without inplace=True this returns a
# new DataFrame and leaves test_df itself untouched.
test_df.set_index('email')

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
DaveZimmer@mail.com,Dave,Zimmer
JaneFonda@mail.com,Jane,Fonda
JohnSmith@mail.com,John,Smith


In [17]:
# test_df still has its default integer index: set_index('email') returned a
# new frame instead of modifying this one in place.
test_df

Unnamed: 0,email,first,last
0,DaveZimmer@mail.com,Dave,Zimmer
1,JaneFonda@mail.com,Jane,Fonda
2,JohnSmith@mail.com,John,Smith


In [18]:
# inplace=True modifies test_df itself and returns None, which is why this
# cell displays no output.
test_df.set_index('email', inplace=True)

In [19]:
# Confirm the in-place change: email is now the index rather than a regular column.
test_df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
DaveZimmer@mail.com,Dave,Zimmer
JaneFonda@mail.com,Jane,Fonda
JohnSmith@mail.com,John,Smith


In [22]:
# The index holds the row labels; here those are the email addresses, and the
# index itself carries the name 'email'.
test_df.index

Index(['DaveZimmer@mail.com', 'JaneFonda@mail.com', 'JohnSmith@mail.com'], dtype='object', name='email')

In [24]:
# Label-based lookup: .loc takes (row label, column label), so with email as
# the index a row can be addressed directly by its email address.
test_df.loc['DaveZimmer@mail.com', 'last']

'Zimmer'

In [26]:
# Position-based lookup still works with a label index: .iloc[0] is the first
# row regardless of what its label is.
test_df.iloc[0]

first      Dave
last     Zimmer
Name: DaveZimmer@mail.com, dtype: object

In [27]:
# Undo the custom index in place: email becomes a regular column again and the
# default 0, 1, 2 integer index is restored.
test_df.reset_index(inplace=True)
test_df

Unnamed: 0,email,first,last
0,DaveZimmer@mail.com,Dave,Zimmer
1,JaneFonda@mail.com,Jane,Fonda
2,JohnSmith@mail.com,John,Smith


### Why Indexes are Useful:

In [28]:
# Preview the first five articles. Read without index_col, the frame carries an
# extra "Unnamed: 0" column alongside the default integer index.
df.head()

Unnamed: 0.1,Unnamed: 0,authors,title,publish_date,description,text,url
0,0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,2020-03-27 08:00:00,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,https://www.cbc.ca/news/health/covid-19-drug-s...
1,1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,2020-03-27 01:45:00,The Yukon government has identified two places...,The Yukon government has identified two places...,https://www.cbc.ca/news/canada/north/yukon-cor...
2,2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,2020-03-26 05:13:00,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,https://www.cbc.ca/news/world/senate-coronavir...
3,3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,2020-03-27 00:36:00,Scientists around the world are racing to find...,Scientists around the world are racing to find...,https://www.cbc.ca/news/health/coronavirus-tre...
4,4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,2020-03-26 20:57:00,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,https://www.cbc.ca/news/the-latest-on-the-coro...


In [29]:
# The schema is small (one row per news column), so display it in full.
schema_df

Unnamed: 0.1,Unnamed: 0,Column,DescriptionText
0,0,authors,The author of online article.
1,1,title,The title of online article.
2,2,publish_date,The date the article was published.
3,3,description,Brief description of article.
4,4,text,The text of the article.
5,5,url,URL to accesss article.


In [30]:
# Re-read the articles, this time using the CSV's "Unnamed: 0" column as the
# index so it no longer shows up as a separate data column.
df = pd.read_csv(data_path, index_col='Unnamed: 0')
df.head()

Unnamed: 0,authors,title,publish_date,description,text,url
0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,2020-03-27 08:00:00,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,https://www.cbc.ca/news/health/covid-19-drug-s...
1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,2020-03-27 01:45:00,The Yukon government has identified two places...,The Yukon government has identified two places...,https://www.cbc.ca/news/canada/north/yukon-cor...
2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,2020-03-26 05:13:00,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,https://www.cbc.ca/news/world/senate-coronavir...
3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,2020-03-27 00:36:00,Scientists around the world are racing to find...,Scientists around the world are racing to find...,https://www.cbc.ca/news/health/coronavirus-tre...
4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,2020-03-26 20:57:00,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,https://www.cbc.ca/news/the-latest-on-the-coro...


In [31]:
# Index the schema by column name so descriptions can be looked up by label.
# NOTE(review): the CSV's "Unnamed: 0" column is still loaded as ordinary data
# here -- confirm whether it is wanted or should be dropped.
schema_df = pd.read_csv(schema_path, index_col='Column')
schema_df

Unnamed: 0_level_0,Unnamed: 0,DescriptionText
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
authors,0,The author of online article.
title,1,The title of online article.
publish_date,2,The date the article was published.
description,3,Brief description of article.
text,4,The text of the article.
url,5,URL to accesss article.


In [32]:
# Fetch the whole schema row for the 'authors' column by its label.
schema_df.loc['authors']

Unnamed: 0                                     0
DescriptionText    The author of online article.
Name: authors, dtype: object

#### So we can see that setting a named index can be useful for looking up specific information in a schema.

In [33]:
# Adding a column label narrows the lookup to a single value: the description text.
schema_df.loc['authors', 'DescriptionText']

'The author of online article.'

In [34]:
# Sort the rows alphabetically by index label. This returns a sorted copy;
# schema_df itself keeps its original order.
schema_df.sort_index()

Unnamed: 0_level_0,Unnamed: 0,DescriptionText
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
authors,0,The author of online article.
description,3,Brief description of article.
publish_date,2,The date the article was published.
text,4,The text of the article.
title,1,The title of online article.
url,5,URL to accesss article.


In [35]:
# Still in file order: sort_index() without inplace=True did not modify schema_df.
schema_df

Unnamed: 0_level_0,Unnamed: 0,DescriptionText
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
authors,0,The author of online article.
title,1,The title of online article.
publish_date,2,The date the article was published.
description,3,Brief description of article.
text,4,The text of the article.
url,5,URL to accesss article.


In [36]:
# Sort the index in place, so schema_df itself is now ordered alphabetically
# by column name.
schema_df.sort_index(inplace=True)
schema_df

Unnamed: 0_level_0,Unnamed: 0,DescriptionText
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
authors,0,The author of online article.
description,3,Brief description of article.
publish_date,2,The date the article was published.
text,4,The text of the article.
title,1,The title of online article.
url,5,URL to accesss article.
