In [6]:
import pandas as pd

# pandas offers two core data structures; the first is the Series,
# a one-dimensional labelled column of values.
series = pd.Series(
    data=["BMW", "Toyota", "Honda"],
)

# Leave the Series as the final expression so the notebook renders it.
series

0       BMW
1    Toyota
2     Honda
dtype: object

In [7]:
# A second Series, this time holding colour names.
colors = pd.Series(data=["Red", "Blue", "White"])
colors

0      Red
1     Blue
2    White
dtype: object

In [8]:
# DataFrame: the 2-dimensional structure. Passing a dict maps each key to a
# column name and each value (here an existing Series) to that column's data.
car_data = pd.DataFrame(
    data={
        "Car Make": series,
        "Colors": colors,
    }
)

car_data

Unnamed: 0,Car Make,Colors
0,BMW,Red
1,Toyota,Blue
2,Honda,White


In [9]:
# Import data from a CSV file into a DataFrame.
# The path is relative to the notebook's working directory.
car_sales = pd.read_csv("../data/car-sales.csv")

car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [10]:
# Export the DataFrame to CSV (after making changes, if necessary).
# index=False leaves the index column out of the exported file.
car_sales.to_csv("exported_car_sales.csv", index=False)

# Read the exported file back in to check what was written.
exported_car_sales = pd.read_csv("exported_car_sales.csv")

exported_car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


## Build a dataframe from existing matrices

In [25]:
# Five rows of two values each; the two values in every row sum to 1.
data = [
    [0.89, 0.11],
    [0.49, 0.51],
    [0.43, 0.57],
    [0.84, 0.16],
    [0.18, 0.82],
]

In [26]:
# One integer label per row of `data`; used further down as the "Target" column.
more_data = [0, 1, 1, 0, 1]

In [28]:
# Wrap the nested list in a DataFrame, naming its two columns.
col = ["0 Prob", "1 Prob"]
a = pd.DataFrame(data, columns=col)
a

Unnamed: 0,0 Prob,1 Prob
0,0.89,0.11
1,0.49,0.51
2,0.43,0.57
3,0.84,0.16
4,0.18,0.82


In [29]:
# Add a new "Target" column from the plain list (this modifies `a` in place).
a["Target"] = more_data
a

Unnamed: 0,0 Prob,1 Prob,Target
0,0.89,0.11,0
1,0.49,0.51,1
2,0.43,0.57,1
3,0.84,0.16,0
4,0.18,0.82,1


In [31]:
# A one-column DataFrame built from the same list of labels.
b = pd.DataFrame({"Target": more_data})
b

Unnamed: 0,Target
0,0
1,1
2,1
3,0
4,1


### Combining two or more dataframes

In [39]:
# Join the two frames side by side (column-wise), keeping only the index
# labels that are present in both.
# NOTE: `a` already carries a "Target" column, so the result ends up with
# two columns labelled "Target".
c = pd.concat(objs=[a, b], axis="columns", join="inner")
c

Unnamed: 0,0 Prob,1 Prob,Target,Target.1
0,0.89,0.11,0,0
1,0.49,0.51,1,1
2,0.43,0.57,1,1
3,0.84,0.16,0,0
4,0.18,0.82,1,1


## Describe Data

In [11]:
# dtypes is an attribute (no parentheses): the data type of each column
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [12]:
# Store the column labels (an Index object) in a variable for later use.
# No bare `car_sales.columns` line is needed first: only the last expression
# of a cell is displayed, so such a line would have no visible effect.
car_columns = car_sales.columns

In [13]:
# Display the stored column labels.
car_columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [14]:
# The row labels (index) of the DataFrame.
car_sales.index

RangeIndex(start=0, stop=10, step=1)

In [15]:
# describe() is a method (called with parentheses): summary statistics
# for the numeric columns
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [16]:
# Print a report of column names, non-null counts, dtypes and memory usage.
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes


In [17]:
# Average each numeric column. numeric_only=True is passed explicitly because
# the text columns (Make, Colour, Price) cannot be averaged: relying on the
# implicit default emits a FutureWarning on older pandas and raises a
# TypeError on pandas 2.x.
car_sales.mean(numeric_only=True)

  car_sales.mean()


Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [18]:
# Select a single column and take its mean.
car_sales["Odometer (KM)"].mean()

78601.4

In [19]:
# Build a Series straight from a list of prices and average it.
car_prices = pd.Series(data=[3000, 1500, 4500])
car_prices.mean()

3000.0

In [20]:
# Sum every column; for the text columns the strings are concatenated
# rather than added numerically (see Make, Colour and Price in the output).
car_sales.sum()

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

In [21]:
# Total of the Doors column only.
car_sales["Doors"].sum()

40

In [22]:
# Number of rows in the DataFrame.
len(car_sales)

10

In [23]:
# First five rows (head() with no argument).
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [24]:
# Last five rows (tail() with no argument).
car_sales.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"
