## Pandas Training

### Getting the Data

In [20]:
import pandas as pd

#### Create data within the notebook

In [21]:
# pandas has two main data structures: Series and DataFrame.
# A Series is 1-dimensional: a single labelled sequence of values.
# No index is passed here, so pandas assigns the default integer labels 0, 1, 2.
series = pd.Series(['BMW', 'toyota', 'honda'])
series

0       BMW
1    toyota
2     honda
dtype: object

In [22]:
# A second Series of the same length, again with the default integer index.
colours = pd.Series(['Red', 'Blue', 'White'])
colours

0      Red
1     Blue
2    White
dtype: object

In [23]:
# A DataFrame is 2-dimensional (rows and columns).
# When built from a dict, each key becomes a column name and each Series
# supplies that column's values; rows are matched up by index label.
car_data = pd.DataFrame({'Car make': series, 'Colours': colours})
car_data

Unnamed: 0,Car make,Colours
0,BMW,Red
1,toyota,Blue
2,honda,White


#### Import Data

In [24]:
# Read a CSV file into a DataFrame. The path is relative, so 'car-sales.csv'
# must be in the notebook's current working directory.
car_sales = pd.read_csv('car-sales.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


#### Export Data

In [25]:
# By default to_csv() also writes the DataFrame's index as an unnamed first
# column. Reading the file back therefore produces an extra 'Unnamed: 0'
# column next to the freshly generated default index.
car_sales.to_csv('exported-car-sales.csv')
car_sales_2 = pd.read_csv('exported-car-sales.csv')
car_sales_2

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,White,150043,4,"$4,000.00"
1,1,Honda,Red,87899,4,"$5,000.00"
2,2,Toyota,Blue,32549,3,"$7,000.00"
3,3,BMW,Black,11179,5,"$22,000.00"
4,4,Nissan,White,213095,4,"$3,500.00"
5,5,Toyota,Green,99213,4,"$4,500.00"
6,6,Honda,Blue,45698,4,"$7,500.00"
7,7,Honda,Blue,54738,4,"$7,000.00"
8,8,Toyota,White,60000,4,"$6,250.00"
9,9,Nissan,White,31600,4,"$9,700.00"


In [26]:
# Pass index=False so to_csv() does not write the index; the file read back
# then has the same columns as the original DataFrame.
# The '.csv' extension makes the file recognisable as CSV to other tools.
car_sales.to_csv('exported-car-sales-2.csv', index=False)
car_sales_3 = pd.read_csv('exported-car-sales-2.csv')
car_sales_3

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


#### Describe Data

In [27]:
# .dtypes is an attribute, not a method, so it is accessed without ().
# It gives the data type of each column. Price is 'object' because its
# values are stored as text such as "$4,000.00", not as numbers.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [28]:
# Column labels, returned as a pandas Index object.
car_sales.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [29]:
# Row labels; here this is the default RangeIndex assigned when the file was read.
car_sales.index

RangeIndex(start=0, stop=10, step=1)

In [30]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
car_sales.shape

(10, 5)

In [31]:
# describe() returns summary statistics (count, mean, std, min, quartiles, max).
# By default only numeric columns are included, so Make, Colour and Price
# (all object dtype) do not appear in the result.
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [32]:
# info() prints a concise overview: the index range, each column's non-null
# count and dtype, and the approximate memory usage.
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 532.0+ bytes


In [33]:
# numeric_only=True restricts the calculation to numeric columns.
# Price is skipped because it is stored as text (object dtype), not as a number.
car_sales.mean(numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [34]:
# Column-wise totals, again limited to the numeric columns.
car_sales.sum(numeric_only=True)

Odometer (KM)    786014
Doors                40
dtype: int64

In [35]:
# len() of a DataFrame is its number of rows.
len(car_sales)

10