In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## PANDAS

In [0]:
import pandas as pd
import numpy as np

# Load the pokemon dataset straight from the public gist into a data frame
url = 'https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv'
data = pd.read_csv(url)

# columns -> names of the features; shape -> (number of rows, number of columns) tuple
print(data.columns, data.shape, sep="\n")
# data.info() would additionally report the column dtypes, non-null counts and memory usage

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')
(800, 13)


In [0]:
data.head(n=5)   # first 5 rows of the frame (5 is also the default)


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [0]:
data.tail(n=5)   # last 5 rows of the frame (5 is also the default)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True
799,721,Volcanion,Fire,Water,600,80,110,120,130,90,70,6,True


In [0]:
# 1 - Filtering a pandas data frame with a boolean mask
x = data['Defense'].gt(200)     # True only for the 3 pokemons whose defense value is above 200
data.loc[x]

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
224,208,SteelixMega Steelix,Steel,Ground,610,75,125,230,55,95,30,2,False
230,213,Shuckle,Bug,Rock,505,20,10,230,10,230,5,2,False
333,306,AggronMega Aggron,Steel,,630,70,140,230,60,80,50,3,False


In [0]:
# 2 - Filtering a pandas data frame with np.logical_and
# Only 2 pokemons have a defense value above 200 and an attack value above 100
data.loc[np.logical_and(data['Defense'].gt(200), data['Attack'].gt(100))]

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
224,208,SteelixMega Steelix,Steel,Ground,610,75,125,230,55,95,30,2,False
333,306,AggronMega Aggron,Steel,,630,70,140,230,60,80,50,3,False


* **value_counts()** : Frequency counts 
* **outliers** : values that are considerably higher or lower than the rest of the data
* **count**: number of entries
* **mean**: average of entries
* **std**: standard deviation
* **min**: minimum entry
* **max**: maximum entry

In [0]:
# For example, let's look at the frequency of each primary pokemon type
print(data['Type 1'].value_counts(dropna =False))  # dropna=False: NaN values, if any, are counted as well

Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Dragon       32
Ground       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: Type 1, dtype: int64


In [0]:
# Melt: reshape the first five rows from wide to long format
#   id_vars    = column(s) we do not wish to melt (kept as identifiers)
#   value_vars = column(s) we want to melt into variable/value pairs
melted = data.head().melt(id_vars='Name', value_vars=['Attack', 'Defense'])
melted

Unnamed: 0,Name,variable,value
0,Bulbasaur,Attack,49
1,Ivysaur,Attack,62
2,Venusaur,Attack,82
3,VenusaurMega Venusaur,Attack,100
4,Charmander,Attack,52
5,Bulbasaur,Defense,49
6,Ivysaur,Defense,63
7,Venusaur,Defense,83
8,VenusaurMega Venusaur,Defense,123
9,Charmander,Defense,43


In [0]:
# Pivot: the reverse of melting
melted.pivot(
    index='Name',         # one row per pokemon name
    columns='variable',   # each distinct entry of "variable" becomes a column
    values='value',       # cells are filled from the "value" column
)

variable,Attack,Defense
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bulbasaur,49,49
Charmander,52,43
Ivysaur,62,63
Venusaur,82,83
VenusaurMega Venusaur,100,123


In [0]:
# Concatenate two data frames vertically

# Build the two pieces first: the first and the last five rows
data1 = data.head()
data2 = data.tail()

# axis='index' (same as axis=0) stacks the rows; ignore_index renumbers them from 0
conc_data_row = pd.concat(objs=[data1, data2], axis='index', ignore_index=True)
conc_data_row

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
6,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
7,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
8,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True
9,721,Volcanion,Fire,Water,600,80,110,120,130,90,70,6,True


In [0]:
# Concatenate two columns side by side
data1 = data['Attack'].head()
data2 = data['Defense'].head()

# axis='columns' (same as axis=1) places the two Series next to each other as columns
conc_data_col = pd.concat(objs=[data1, data2], axis='columns')
conc_data_col

Unnamed: 0,Attack,Defense
0,49,49
1,62,63
2,82,83
3,100,123
4,52,43


#### Data Types

There are 5 basic data types: 
* object(string),
* boolean, 
* integer, 
* float 
* categorical. 

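We can convert between data types, for example from str to categorical or from int to float.
Why is the category type important: a categorical column stores each distinct value only once, so it uses less memory for columns with few distinct string values.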

In [0]:
# dtypes lists the data type of every column
data.dtypes

#              int64
Name          object
Type 1        object
Type 2        object
Total          int64
HP             int64
Attack         int64
Defense        int64
Sp. Atk        int64
Sp. Def        int64
Speed          int64
Generation     int64
Legendary       bool
dtype: object

In [0]:
# Convert an object (str) column to categorical and an int column to float in one pass
data = data.astype({'Type 1': 'category', 'Speed': 'float'})

data.dtypes

#                int64
Name            object
Type 1        category
Type 2          object
Total            int64
HP               int64
Attack           int64
Defense          int64
Sp. Atk          int64
Sp. Def          int64
Speed          float64
Generation       int64
Legendary         bool
dtype: object

#### MISSING DATA and TESTING WITH ASSERT

If we encounter missing data, we can:

* leave as is
* drop them with dropna()
* fill missing value with fillna()
* fill missing values with test statistics like mean 
* Assert statement: a check that you can turn on or off once you are done testing the program

In [0]:
# Let's drop the rows whose "Type 2" value is missing.
# DataFrame.dropna returns a NEW frame, so `data` itself keeps its NaN rows and
# can still be used afterwards (e.g. to demonstrate filling missing values).
# Note: `data1 = data` followed by data1["Type 2"].dropna(inplace=True) does not
# work - data1 would only be another name for `data`, and the in-place drop acts
# on the selected column Series, not on the frame.
data1 = data.dropna(subset=["Type 2"])

# assert raises AssertionError if any missing "Type 2" value survived the drop
assert data1["Type 2"].notnull().all()

# With assert statements we can check a lot of things. For example:
# assert data.columns[1] == 'Name'
# assert data.Speed.dtype == np.float64   # Speed was cast to float above; the np.int alias was removed from NumPy

#### BUILDING DATA FRAMES FROM SCRATCH

* We can build data frames from csv as we did earlier.
* Also we can build dataframe from dictionaries
* Broadcasting: Create new column and assign a value to entire column

In [0]:
# Build a data frame from a dictionary

# Column contents and their labels
country    = ["Spain", "France"]
population = ["11", "12"]
list_label = ["country", "population"]
list_col   = [country, population]

# Pair every label with its column, then turn the pairs into a dict
zipped = list(zip(list_label, list_col))
data_dict = {label: column for label, column in zipped}

# Dict keys become the column names, dict values the column contents
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,country,population
0,Spain,11
1,France,12


In [0]:
# Add a new column: one value per existing row
df = df.assign(capital=['madrid', 'paris'])
df

Unnamed: 0,country,population,capital
0,Spain,11,madrid
1,France,12,paris


In [0]:
# Broadcasting: a single scalar is assigned to every row of the new column
df = df.assign(income=0)
df

Unnamed: 0,country,population,capital,income
0,Spain,11,madrid,0
1,France,12,paris,0


#### INDEXING DATA FRAMES
* Indexing using square brackets
* Using column attribute and row label
* Using loc accessor
* Selecting only some columns

In [0]:
# Indexing using square brackets: pick the HP column first, then the entry with row label 1
data['HP'][1]

60

In [0]:
# Using the column attribute and a row label: same lookup as data['HP'][1]
data.HP[1]

60

In [0]:
# Using the loc accessor: rows labelled up to and including 5 of the HP column, as a data frame
data.loc[:5, 'HP'].to_frame()

Unnamed: 0,HP
0,45
1,60
2,80
3,80
4,39
5,58


In [0]:
# Select only some columns: a list of labels yields a data frame
data1 = data.loc[:, ['HP', 'Attack']]
data1.head()

Unnamed: 0,HP,Attack
0,45,49
1,60,62
2,80,82
3,80,100
4,39,52


#### SLICING DATA FRAME

* Difference between selecting columns
  * Series and data frames
* Slicing and indexing series
* Reverse slicing
* From something to end

In [0]:
# Slicing and indexing with labels: rows 1 to 5, columns HP to Defense
data.loc[1:5,"HP":"Defense"]   # with loc both end points (row 5 and "Defense") are inclusive

Unnamed: 0,HP,Attack,Defense
1,60,62,63
2,80,82,83
3,80,100,123
4,39,52,43
5,58,64,58


In [0]:
# Reverse slicing: walk the row labels from 10 down towards 1 in steps of 2
data.loc[10:1:-2,"HP":"Defense"]

Unnamed: 0,HP,Attack,Defense
10,59,63,80
8,78,104,78
6,78,84,78
4,39,52,43
2,80,82,83


In [0]:
# From something to the end: every column from "Speed" through the last one
data.loc[1:5,"Speed":]

Unnamed: 0,Speed,Generation,Legendary
1,60,1,False
2,80,1,False
3,80,1,False
4,65,1,False
5,80,1,False


In [0]:
## Transforming data: square every HP value element-wise
data1 = data['HP'].map(lambda hp: hp ** 2)
data1.head()

0    2025
1    3600
2    6400
3    6400
4    1521
Name: HP, dtype: int64