In [None]:
import pandas as pd # A general purpose Python library for data analysis
import numpy as np # A library for scientific computing in Python (e.g., provides high-performance multi-dimensional array objects and operations)

import matplotlib.pyplot as plt # a plotting library for Python and NumPy (readily customizable)
import seaborn as sns # Another plotting library for Python (fewer syntax, excellent default themes, behind the scenes, it uses matplotlib)
import time

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Knowledge Streams 2024

In this notebook, we will learn about the key data structures provided by the Pandas library: **Data Frames, Series, and Indices**.

In addition, we will learn about the following operations:
* How to access data contained in these structures?
* How to read files (e.g., csv, xlsx, sql) to create these structures?
* How to carry out different data manipulation tasks using these structures?

`Dataset`: US elections with information about candidates, their party, votes won, year of election and the result.

## Reading in Data Frames from Files
We'll be using **read_csv** today. Note that this file reading function does all the *data parsing* for you, which is very useful.

Before loading a file into a dataframe, let's first take a look at the **elections.csv** file

In [None]:
#Load csv file and print shape
data=pd.read_csv("/content/drive/MyDrive/DataScience_with_KS/elections.csv")
#copy of data
copied_data=data
# how many observation and features are given
shape=data.shape
print(f"There are {shape[0]} observations and {shape[1]} features")

There are 182 observations and 6 features


In [None]:
# We can use the **head command** to show only a few rows of a dataframe from start.
# Code here
print(data.head(10))
print(data.head())
#Use **tail command** to show last few observation.
# code here
print(data.tail(10))
print(data.tail())

   Year               Candidate                  Party  Popular vote Result  \
0  1824          Andrew Jackson  Democratic-Republican        151271   loss   
1  1824       John Quincy Adams  Democratic-Republican        113142    win   
2  1828          Andrew Jackson             Democratic        642806    win   
3  1828       John Quincy Adams    National Republican        500897   loss   
4  1832          Andrew Jackson             Democratic        702735    win   
5  1832              Henry Clay    National Republican        484205   loss   
6  1832            William Wirt           Anti-Masonic        100715   loss   
7  1836       Hugh Lawson White                   Whig        146109   loss   
8  1836        Martin Van Buren             Democratic        763291    win   
9  1836  William Henry Harrison                   Whig        550816   loss   

           %  
0  57.210122  
1  42.789878  
2  56.203927  
3  43.796073  
4  54.574789  
5  37.603628  
6   7.821583  
7  10.0059

In [None]:
#The `read_csv` command lets us specify a **column to use an index**. For example, we could have used __Year__ as the index.
#Code here
res=pd.read_csv("/content/drive/MyDrive/DataScience_with_KS/elections.csv",index_col="Year")
res.head()

Unnamed: 0_level_0,Candidate,Party,Popular vote,Result,%
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
1828,Andrew Jackson,Democratic,642806,win,56.203927
1828,John Quincy Adams,National Republican,500897,loss,43.796073
1832,Andrew Jackson,Democratic,702735,win,54.574789


In [None]:
#Alternately, we could have used the **set_index** commmand on the dataframe to set a particular column as index.
# code here
res=res.set_index("Year")
res.head()

KeyError: "None of ['Year'] are in the columns"

# Caution:
The **set_index command** (along with all other data frame methods) **does not modify the dataframe**, i.e., the original "elections" is untouched. Note: There is a flag called "inplace" which does modify the calling dataframe (e.g., `elections.set_index("Party",inplace=True)`).

## Duplicate Columns?
By contast, column names MUST be unique. For example, if we try to read in a file for which column names are not unique, Pandas will automatically any duplicates. Load duplicate_columns.csv

In [None]:
duplicate_colums=pd.read_csv("/content/drive/MyDrive/DataScience_with_KS/duplicate_columns.csv")
duplicate_colums.head()

Unnamed: 0,name,name.1,flavor
0,john,smith,vanilla
1,zhang,shan,chocolate
2,fulan,alfulani,strawberry
3,hong,gildong,banana


## The [ ] Operator & Indexing

The DataFrame class has an indexing operator **[ ]** (also known as the 'brack' operator) that lets you do a variety of different things. If your provide a String to the **[ ]** operator, you get back a ***Series*** corresponding to the requested label.

1.Use **[ ]** to display different columns

2.Use List retrive multiple columns

In [None]:
# Display and Retrieve multiple columns from the election data frame, the resultant would be the list for every column.
col=["Candidate","Party"]
res=data[col]
res.head()

Unnamed: 0,Candidate,Party
0,Andrew Jackson,Democratic-Republican
1,John Quincy Adams,Democratic-Republican
2,Andrew Jackson,Democratic
3,John Quincy Adams,National Republican
4,Andrew Jackson,Democratic


In [None]:
#The **[ ]** operator also accepts a list of strings. In this case, you get back a **DataFrame** corresponding to the requested strings.
res=data[["Candidate","Party"]]
res.head()

Unnamed: 0,Candidate,Party
0,Andrew Jackson,Democratic-Republican
1,John Quincy Adams,Democratic-Republican
2,Andrew Jackson,Democratic
3,John Quincy Adams,National Republican
4,Andrew Jackson,Democratic


A list of one label also returns a DataFrame. This can be handy if you want your results as a DataFrame, not a series.

Note that we can also use the **to_frame** method to turn a Series into a DataFrame.

Extract one col name "Candidates" from DataFrame it will be a series. Convert series into a DataFrame.

In [None]:
res=data["Candidate"]
print(res)
d1=res.to_frame()
d1

0         Andrew Jackson
1      John Quincy Adams
2         Andrew Jackson
3      John Quincy Adams
4         Andrew Jackson
             ...        
177           Jill Stein
178         Joseph Biden
179         Donald Trump
180         Jo Jorgensen
181       Howard Hawkins
Name: Candidate, Length: 182, dtype: object


Unnamed: 0,Candidate
0,Andrew Jackson
1,John Quincy Adams
2,Andrew Jackson
3,John Quincy Adams
4,Andrew Jackson
...,...
177,Jill Stein
178,Joseph Biden
179,Donald Trump
180,Jo Jorgensen


In [None]:
print(d1)
# if we use print then it will return string again not a data frame

             Candidate
0       Andrew Jackson
1    John Quincy Adams
2       Andrew Jackson
3    John Quincy Adams
4       Andrew Jackson
..                 ...
177         Jill Stein
178       Joseph Biden
179       Donald Trump
180       Jo Jorgensen
181     Howard Hawkins

[182 rows x 1 columns]


The following cells allow you to **test your understanding**. Let's go over the summary of what we have learnt (see slides).

# Creating DataFrames
Create DataFrame using List and Columns name given in slides.

In [None]:
# Code here
pd.DataFrame([[1,"One"],[2,"Two"],[3,"Three"]],columns=["Integers","String values"])

Unnamed: 0,Integers,String values
0,1,One
1,2,Two
2,3,Three


Creating DataFrames using **Dictionary** given in slides.

In [None]:
# Code here
pd.DataFrame({"Fruit":["Strabery","Orange"],"price":["23.5","31.6"],"Soled":["5","8"]})

Unnamed: 0,Fruit,price,Soled
0,Strabery,23.5,5
1,Orange,31.6,8


Creating DataFrames using **Series** given in slides.

In [None]:
# Code here
d1=pd.Series(["Car","Bike"],index=["VehicleType","VehicleType"])
d2=pd.Series(["Won","Lose"],index=["VehicleType","VehicleType"])
pd.DataFrame({"Type":d1,"Status":d2})

Unnamed: 0,Type,Status
VehicleType,Car,Won
VehicleType,Bike,Lose


In [None]:
s=pd.Series([1,2,3],index=["a","b","c"])
pd.DataFrame({"Name":s})

Unnamed: 0,Name
a,1
b,2
c,3


In [None]:
s[s>2]

Unnamed: 0,0
c,3


In [None]:
pd.DataFrame([[1,"one"],[2,"Two"]],columns=["Number","Alphabet"])

Unnamed: 0,Number,Alphabet
0,1,one
1,2,Two


In [None]:
a=pd.DataFrame({"Fruit":["Straubery","Mango"],"Price":[3,5]})
a.set_index("Fruit")

Unnamed: 0_level_0,Price
Fruit,Unnamed: 1_level_1
Straubery,3
Mango,5


In [None]:
a.reset_index()

Unnamed: 0,index,Fruit,Price
0,0,Straubery,3
1,1,Mango,5


In [None]:
a.columns

Index(['Fruit', 'Price'], dtype='object')

In [None]:
a.head(2)

Unnamed: 0,Fruit,Price
0,Straubery,3
1,Mango,5


In [None]:
data.index

RangeIndex(start=0, stop=182, step=1)

In [None]:
data.loc[[1,2,3],"Popular vote":"%"]
data.iloc[[2,3,4,5,56],:-1]

Unnamed: 0,Year,Candidate,Party,Popular vote,Result
2,1828,Andrew Jackson,Democratic,642806,win
3,1828,John Quincy Adams,National Republican,500897,loss
4,1832,Andrew Jackson,Democratic,702735,win
5,1832,Henry Clay,National Republican,484205,loss
56,1900,William McKinley,Republican,7228864,win


In [83]:
data[1:4]

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
2,1828,Andrew Jackson,Democratic,642806,win,56.203927
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073


In [87]:
data[["Candidate","Year"]]

Unnamed: 0,Candidate,Year
0,Andrew Jackson,1824
1,John Quincy Adams,1824
2,Andrew Jackson,1828
3,John Quincy Adams,1828
4,Andrew Jackson,1832
...,...,...
177,Jill Stein,2016
178,Joseph Biden,2020
179,Donald Trump,2020
180,Jo Jorgensen,2020


In [92]:
a=pd.DataFrame({1:["Straubery","Mango"],"1":[3,5]})
a[1:]

Unnamed: 0,1,1.1
1,Mango,5
