# Pandas

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Populations in millions, one float per country. No index is supplied, so
# pandas assigns the default integer index (0..6).
g7_pop = pd.Series(
    data=[
        35.467,
        63.951,
        80.940,
        60.665,
        127.061,
        64.511,
        381.523,
    ]
)

In [4]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    381.523
dtype: float64

In [5]:
g7_pop.dtype

dtype('float64')

In [6]:
# `.values` exposes the data backing the Series; the recorded output shows it
# is a NumPy ndarray.
type(g7_pop.values)

numpy.ndarray

In [7]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    381.523
dtype: float64

In [8]:
# The Series still carries its default RangeIndex at this point, so 0 is an
# index label.
g7_pop[0]

35.467

In [9]:
# Same lookup for label 3 (the fourth entry under the default index).
g7_pop[3]

60.665

In [10]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

In [11]:
# Replace the default integer index with country labels; labels are matched to
# the existing values by position.
# NOTE(review): 'Korea' and 'Turkey' are not G7 members, although the data is
# called "G7 population" elsewhere in this notebook - confirm the labels.
g7_pop.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'Korea', 'Turkey']

In [12]:
g7_pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
dtype: float64

In [13]:
# Build an equivalent Series from a mapping: the dict keys become the index
# labels and the dict values become the data. `name` labels the whole Series
# and shows up in its repr.
pd.Series(
    {
        'Canada': 35.467,
        'France': 63.951,
        'Germany': 80.940,
        'Italy': 60.665,
        'Japan': 127.061,
        'Korea': 64.511,
        'Turkey': 381.523,
    },
    name='G7 population in millions',
)

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
Name: G7 population in millions, dtype: float64

In [14]:
# Same Series again, this time from parallel sequences: `data` holds the
# values and `index` the label for each position.
pd.Series(
    data=[35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 381.523],
    index=['Canada', 'France', 'Germany', 'Italy', 'Japan', 'Korea', 'Turkey'],
    name='G7 population in millions',
)

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
Name: G7 population in millions, dtype: float64

In [15]:
# Passing an existing Series together with `index=` yields a new Series that
# contains only the listed labels (see the output below).
pd.Series(g7_pop, index = ['France', 'Germany', 'Turkey'])

France      63.951
Germany     80.940
Turkey     381.523
dtype: float64

# Indexing

In [16]:
g7_pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
dtype: float64

In [17]:
# Look up a single value by its index label.
g7_pop['Canada']

35.467

In [18]:
# Label lookup works the same way for any label present in the index.
g7_pop['Turkey']

381.523

In [20]:
# .iloc selects by integer position regardless of the labels: position 0 is
# the first element.
g7_pop.iloc[0]

35.467

In [21]:
# Negative positions count from the end: -1 is the last element.
g7_pop.iloc[-1]

381.523

In [22]:
# A list of labels returns a sub-Series, in the order the labels were given.
g7_pop[['Italy', 'France']]

Italy     60.665
France    63.951
dtype: float64

In [24]:
# A list of positions with .iloc returns the elements at those positions as a
# sub-Series.
g7_pop.iloc[[0, 1]]

Canada    35.467
France    63.951
dtype: float64

# Operations and methods

In [25]:
g7_pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
dtype: float64

In [26]:
# Arithmetic with a scalar is applied to every element and returns a new
# Series; g7_pop itself is left unchanged.
g7_pop * 1_000_000

Canada      35467000.0
France      63951000.0
Germany     80940000.0
Italy       60665000.0
Japan      127061000.0
Korea       64511000.0
Turkey     381523000.0
dtype: float64

In [27]:
# Arithmetic mean of all values in the Series.
g7_pop.mean()

116.30257142857144

In [28]:
# NumPy functions can be applied to a Series directly; the result is a Series
# that keeps the same index labels.
np.log(g7_pop)

Canada     3.568603
France     4.158117
Germany    4.393708
Italy      4.105367
Japan      4.844667
Korea      4.166836
Turkey     5.944171
dtype: float64

# Boolean arrays

In [31]:
g7_pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
dtype: float64

In [32]:
# Comparing against a scalar yields a boolean Series with the same index.
g7_pop > 70

Canada     False
France     False
Germany     True
Italy      False
Japan       True
Korea      False
Turkey      True
dtype: bool

In [36]:
# Indexing with a boolean Series keeps only the entries where the mask is True.
g7_pop[g7_pop > 70]

Germany     80.940
Japan      127.061
Turkey     381.523
dtype: float64

In [37]:
g7_pop.mean()

116.30257142857144

In [38]:
# The threshold can be computed from the data itself: keep the entries that
# lie above the mean.
g7_pop[g7_pop > g7_pop.mean()]

Japan     127.061
Turkey    381.523
dtype: float64

# Modifying series

In [39]:
# Assign a new value to the entry labelled 'Canada'; this modifies g7_pop in
# place.
g7_pop['Canada'] = 40.3

In [40]:
g7_pop

Canada      40.300
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     381.523
dtype: float64

In [41]:
# Positional assignment: overwrite the last element in place.
g7_pop.iloc[-1] = 500

In [42]:
g7_pop

Canada      40.300
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Korea       64.511
Turkey     500.000
dtype: float64

In [43]:
# Entries whose value is below 70.
g7_pop[g7_pop < 70]

Canada    40.300
France    63.951
Italy     60.665
Korea     64.511
dtype: float64

In [44]:
# Boolean-mask assignment: every entry below 70 is set to 99.99 in place.
g7_pop[g7_pop < 70] = 99.99

In [45]:
g7_pop

Canada      99.990
France      99.990
Germany     80.940
Italy       99.990
Japan      127.061
Korea       99.990
Turkey     500.000
dtype: float64

In [47]:
# Import pyplot under its conventional alias; the %matplotlib inline magic asks
# IPython to render figures inside the notebook output.
# NOTE(review): the other imports live in the first cell - consider moving this
# import there so all dependencies are visible up front.
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
# IPython help syntax: a trailing `?` displays the documentation for
# pd.read_csv.
pd.read_csv?

In [55]:
# Load the sales data set from a path relative to the notebook. With the
# default options the first row of the file supplies the column names.
df = pd.read_csv('data/sales_data.csv')

In [50]:
df.head()

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418


In [51]:
df = pd.read_csv('data/sales_data.csv', header = None)

  df = pd.read_csv('data/sales_data.csv', header = None)


In [52]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
1,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
3,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
4,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113037 entries, 0 to 113036
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       113037 non-null  object
 1   1       113037 non-null  object
 2   2       113037 non-null  object
 3   3       113037 non-null  object
 4   4       113037 non-null  object
 5   5       113037 non-null  object
 6   6       113037 non-null  object
 7   7       113037 non-null  object
 8   8       113037 non-null  object
 9   9       113037 non-null  object
 10  10      113037 non-null  object
 11  11      113037 non-null  object
 12  12      113037 non-null  object
 13  13      113037 non-null  object
 14  14      113037 non-null  object
 15  15      113037 non-null  object
 16  16      113037 non-null  object
 17  17      113037 non-null  object
dtypes: object(18)
memory usage: 15.5+ MB


In [56]:
df.tail()

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
113031,2016-04-12,12,April,2016,41,Adults (35-64),M,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,112,72,184
113032,2014-04-02,2,April,2014,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113033,2016-04-02,2,April,2016,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113034,2014-03-04,4,March,2014,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,684,576,1260
113035,2016-03-04,4,March,2016,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",23,24,64,655,552,1207


In [57]:
df.dtypes

Date                object
Day                  int64
Month               object
Year                 int64
Customer_Age         int64
Age_Group           object
Customer_Gender     object
Country             object
State               object
Product_Category    object
Sub_Category        object
Product             object
Order_Quantity       int64
Unit_Cost            int64
Unit_Price           int64
Profit               int64
Cost                 int64
Revenue              int64
dtype: object