In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Acquisition

### Live Demos

This is some text
$$y = ax + b$$

In [2]:
# Build a small table from a dict: each key becomes a column name and
# each list supplies that column's values, one entry per row.
pd.DataFrame({"name": ["Ivan", "Georgi"], "surname": ["Angelov", "Ivanov"], "age": [32, 15]})

Unnamed: 0,name,surname,age
0,Ivan,Angelov,32
1,Georgi,Ivanov,15


### You can read a table with `pd.read_table` or `pd.read_csv`

In [4]:
# read_table expects tab-separated input by default, so the comma
# separator of this CSV file has to be passed explicitly.
pd.read_table("data/accidents.csv", sep=",")

Unnamed: 0,Miles from Home,% of Accidents
0,less than 1,23
1,1 to 5,29
2,6 to 10,17
3,11 to 15,8
4,16 to 20,6
5,over 20,17


In [5]:
# read_csv already uses a comma as its separator, so no `sep` argument
# is needed to get the same table as the read_table call above.
pd.read_csv("data/accidents.csv")

Unnamed: 0,Miles from Home,% of Accidents
0,less than 1,23
1,1 to 5,29
2,6 to 10,17
3,11 to 15,8
4,16 to 20,6
5,over 20,17


In [6]:
# Check what kind of object read_csv returns (a pandas DataFrame).
type(pd.read_csv("data/accidents.csv"))

pandas.core.frame.DataFrame

### You can assign the table to a variable and use it later:

In [7]:
# Keep the loaded table in a variable so the following cells can reuse
# it without reading the file again.
accidents = pd.read_csv("data/accidents.csv")

In [8]:
# A bare expression on the last line of a cell is shown as the cell's output.
accidents

Unnamed: 0,Miles from Home,% of Accidents
0,less than 1,23
1,1 to 5,29
2,6 to 10,17
3,11 to 15,8
4,16 to 20,6
5,over 20,17


### The row index: the half-open range [0, 6) with step 1

In [9]:
# Row labels of the table; here they are the default RangeIndex.
accidents.index

RangeIndex(start=0, stop=6, step=1)

### Columns

In [11]:
# Column labels of the table.
accidents.columns

Index(['Miles from Home', '% of Accidents'], dtype='object')

### Access a column by name, like a dictionary key:

In [12]:
# Single brackets with one column name return that column as a Series.
accidents["Miles from Home"]

0    less than 1
1         1 to 5
2        6 to 10
3       11 to 15
4       16 to 20
5        over 20
Name: Miles from Home, dtype: object

In [14]:
# Double brackets pass a list of column names and return a DataFrame,
# even when the list contains a single column.
accidents[["Miles from Home"]]

Unnamed: 0,Miles from Home
0,less than 1
1,1 to 5
2,6 to 10
3,11 to 15
4,16 to 20
5,over 20


In [15]:
# Selecting with a list of names yields a DataFrame.
type(accidents[["Miles from Home"]])

pandas.core.frame.DataFrame

In [16]:
# Selecting with a single name yields a Series.
type(accidents["Miles from Home"])

pandas.core.series.Series

### We can rename the columns

In [19]:
# Assigning a list to .columns renames all columns of `accidents` in place
# (one new name per existing column, in order).
# NOTE: after this runs, earlier cells that use the old name
# "Miles from Home" will fail if re-run without reloading `accidents`.
accidents.columns = ["miles_from_home", "pct_of_accidents"]

In [20]:
# Display the table to confirm the new column names.
accidents

Unnamed: 0,miles_from_home,pct_of_accidents
0,less than 1,23
1,1 to 5,29
2,6 to 10,17
3,11 to 15,8
4,16 to 20,6
5,over 20,17


### Now we can access a column like a dictionary key (with `[]`) or as an attribute (with a dot):

In [21]:
# Bracket access using the new column name.
accidents["miles_from_home"]

0    less than 1
1         1 to 5
2        6 to 10
3       11 to 15
4       16 to 20
5        over 20
Name: miles_from_home, dtype: object

In [22]:
# Attribute (dot) access is possible because the new column name is a
# valid Python identifier; it returns the same Series as bracket access.
accidents.miles_from_home

0    less than 1
1         1 to 5
2        6 to 10
3       11 to 15
4       16 to 20
5        over 20
Name: miles_from_home, dtype: object

### How many rows and columns does the table contain?

In [23]:
# Tuple of (number of rows, number of columns).
accidents.shape

(6, 2)

In [24]:
# Summary statistics; only the numeric column (pct_of_accidents) appears,
# the text column miles_from_home is left out.
accidents.describe()

Unnamed: 0,pct_of_accidents
count,6.0
mean,16.666667
std,8.733079
min,6.0
25%,10.25
50%,17.0
75%,21.5
max,29.0


In [25]:
# .T transposes the summary: each described column becomes a row and
# each statistic becomes a column.
accidents.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pct_of_accidents,6.0,16.666667,8.733079,6.0,10.25,17.0,21.5,29.0


In [26]:
# Data type of each column; the text column is reported as `object`.
accidents.dtypes

miles_from_home     object
pct_of_accidents     int64
dtype: object

### Changing the index

In [28]:
# Replace the row labels of `accidents` in place, one label per row.
# The labels do not have to be ordered or start at zero.
accidents.index = [7, 17, 20, 4, 2, -1]

In [29]:
# Display the table to confirm the new row labels.
accidents

Unnamed: 0,miles_from_home,pct_of_accidents
7,less than 1,23
17,1 to 5,29
20,6 to 10,17
4,11 to 15,8
2,16 to 20,6
-1,over 20,17


### We can reset the index (this returns a new table and keeps the old labels in an `index` column)

In [30]:
# Returns a new table with a fresh 0-based index; the old labels are kept
# in a column named "index". The result is only displayed, not stored.
accidents.reset_index()

Unnamed: 0,index,miles_from_home,pct_of_accidents
0,7,less than 1,23
1,17,1 to 5,29
2,20,6 to 10,17
3,4,11 to 15,8
4,2,16 to 20,6
5,-1,over 20,17


### We can store the result as a new table (`drop=True` discards the old index)

In [36]:
# drop=True discards the old row labels instead of keeping them as a column.
# The result is stored under a new name.
accidents_new_index = accidents.reset_index(drop=True)

In [37]:
# The new table has a fresh 0-based index and no extra "index" column.
accidents_new_index

Unnamed: 0,miles_from_home,pct_of_accidents
0,less than 1,23
1,1 to 5,29
2,6 to 10,17
3,11 to 15,8
4,16 to 20,6
5,over 20,17


In [35]:
# The original table still carries its custom row labels.
accidents

Unnamed: 0,miles_from_home,pct_of_accidents
7,less than 1,23
17,1 to 5,29
20,6 to 10,17
4,11 to 15,8
2,16 to 20,6
-1,over 20,17


### Reading a dataset from the internet

In [39]:
# read_csv also accepts a URL in place of a local path.
# NOTE: the result is only displayed here, not stored in a variable.
pd.read_csv("https://github.com/plotly/datasets/raw/master/data.csv")

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [40]:
# Load the remote CSV and keep it in a variable for the cells that follow.
credit_risk_data = pd.read_csv("https://github.com/plotly/datasets/raw/master/data.csv")

In [41]:
# Large tables are displayed truncated: the first and last rows are shown
# with a row of "..." in between.
credit_risk_data

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [42]:
# Tuple of (number of rows, number of columns).
credit_risk_data.shape

(150000, 11)

In [43]:
# Data type of each column; every column in this dataset is numeric
# (int64 or float64).
credit_risk_data.dtypes

SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [44]:
# Summary statistics per column. The "count" row shows fewer values for
# MonthlyIncome and NumberOfDependents than for the other columns
# (150000), which suggests those two columns have missing entries.
credit_risk_data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0
