* Pandas is a newer package built on top of NumPy.
* It provides an efficient implementation of a DataFrame.
* DataFrames are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types.
* Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.
* Its core objects are Series, DataFrame, Index, and TimeSeries.

##### Pandas Series
* One-dimensional array of indexed data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list; pandas assigns a default integer index.
ser1 = pd.Series(data=[11, 2, 5, 9])

In [3]:
# The underlying data of the Series, exposed as a NumPy array.
ser1.values

array([11,  2,  5,  9], dtype=int64)

In [4]:
# The index labels; no index was supplied, so this is the default RangeIndex.
ser1.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Display the Series: index labels on the left, values on the right.
ser1

0    11
1     2
2     5
3     9
dtype: int64

In [6]:
# Look up the value stored under index label 1.
ser1.loc[1]

2

In [7]:
# Same values, but with explicit (non-zero-based) integer index labels.
ser1 = pd.Series(
    data=[11, 2, 5, 9],
    index=[7, 8, 9, 10],
)

In [8]:
# Display the Series to confirm the custom integer labels 7..10.
ser1

7     11
8      2
9      5
10     9
dtype: int64

In [9]:
# 7 is an index label here, not a position: this returns the first value.
ser1.loc[7]

11

In [10]:
# Index labels do not have to be integers; here they are strings.
ser1 = pd.Series(
    data=[11, 2, 5, 9],
    index=['a', 'b', 'c', 'd'],
)

In [11]:
# Display the Series to confirm the string labels 'a'..'d'.
ser1

a    11
b     2
c     5
d     9
dtype: int64

In [12]:
# Look up a value by its string label.
ser1.loc['a']

11

In [13]:
# Population per state, keyed by state name.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

In [14]:
# Echo the dictionary to inspect its contents.
population_dict

{'California': 38332521,
 'Florida': 19552860,
 'Illinois': 12882135,
 'New York': 19651127,
 'Texas': 26448193}

In [15]:
# Convert the dictionary to a Series; the keys become the index labels.
# The index is sorted explicitly: older pandas sorted dict keys on construction,
# while pandas >= 0.23 (on Python 3.6+) keeps insertion order. Sorting makes the
# display and the 'California':'New York' label slice below give the same rows
# on any version.
population = pd.Series(population_dict).sort_index()

In [16]:
# Display the Series built from the dictionary: state names are the index labels.
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [17]:
# Slice by label; unlike positional slicing, the end label is included.
population.loc['California':'New York']

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
dtype: int64

In [18]:
# Passing index= keeps only the listed keys from the dictionary.
population = pd.Series(
    data=population_dict,
    index=['Illinois', 'New York'],
)

In [19]:
# Display the result: only the two requested states remain.
population

Illinois    12882135
New York    19651127
dtype: int64

##### DataFrame
* Collection of Series
* Analogous to a two-dimensional array

In [20]:
# Rebuild the population data for the DataFrame example.
# NOTE(review): 'New York' is absent here although area_dict has it, so the
# DataFrame built from these Series has a missing population for New York.
# Confirm this is intentional (it does demonstrate index alignment).
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)

In [21]:
# Area per state, keyed by state name (all five states are present).
area_dict = {
    'California': 423967,
    'Illinois': 149995,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
}
area = pd.Series(data=area_dict)

In [22]:
# Build a DataFrame from the two Series: each Series becomes a column and
# rows are aligned on the state-name index.
states = pd.DataFrame(
    data={
        'population': population,
        'area': area,
    }
)

In [23]:
# Total area over all states, computed by NumPy on the column's raw array.
# numpy is imported alongside pandas in the notebook's first cell rather than
# mid-notebook, so a fresh "Run All" has every dependency loaded up front.
np.sum(states['area'].values)

1581233

In [24]:
# Row labels of the DataFrame: every state that appears in either Series.
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [25]:
# Display the table; New York has an area but no population entry, so that cell is missing (NaN).
states

Unnamed: 0,area,population
California,423967,38332521.0
Florida,170312,19552860.0
Illinois,149995,12882135.0
New York,141297,
Texas,695662,26448193.0


In [26]:
# Double brackets select a one-column DataFrame, so sum() returns a Series keyed by column name.
states[['area']].sum()

area    1581233
dtype: int64

In [27]:
# Single brackets with one column name return a Series.
type(states['area'])

pandas.core.series.Series

In [28]:
# A list of column names (double brackets) returns a DataFrame, even for a single column.
type(states[['area']])

pandas.core.frame.DataFrame

In [29]:
# Column-wise totals. The result is float64 because the population column
# holds a missing value and is therefore stored as floats.
states[['area','population']].sum()

area           1581233.0
population    97215709.0
dtype: float64

In [2]:
import pandas as pd

In [3]:
# Load the California cities table (relative path to the data directory).
# NOTE(review): the loaded frame has an 'Unnamed: 0' column that mirrors the row
# number. Presumably a saved index; consider index_col=0 if nothing relies on it.
df = pd.read_csv(
    '../data/california_cities.csv'
)

In [4]:
# Preview the first five rows to check the columns and value formats.
df.head()

Unnamed: 0.1,Unnamed: 0,city,latd,longd,elevation_m,elevation_ft,population_total,area_total_sq_mi,area_land_sq_mi,area_water_sq_mi,area_total_km2,area_land_km2,area_water_km2,area_water_percent
0,0,Adelanto,34.576111,-117.432778,875.0,2871.0,31765,56.027,56.009,0.018,145.107,145.062,0.046,0.03
1,1,AgouraHills,34.153333,-118.761667,281.0,922.0,20330,7.822,7.793,0.029,20.26,20.184,0.076,0.37
2,2,Alameda,37.756111,-122.274444,,33.0,75467,22.96,10.611,12.349,59.465,27.482,31.983,53.79
3,3,Albany,37.886944,-122.297778,,43.0,18969,5.465,1.788,3.677,14.155,4.632,9.524,67.28
4,4,Alhambra,34.081944,-118.135,150.0,492.0,83089,7.632,7.631,0.001,19.766,19.763,0.003,0.01


In [5]:
# Show a bounded preview instead of the whole frame: dumping every row floods
# the output and bloats the saved notebook.
df.head(10)

Unnamed: 0.1,Unnamed: 0,city,latd,longd,elevation_m,elevation_ft,population_total,area_total_sq_mi,area_land_sq_mi,area_water_sq_mi,area_total_km2,area_land_km2,area_water_km2,area_water_percent
0,0,Adelanto,34.576111,-117.432778,875.0,2871.0,31765,56.027,56.009,0.018,145.107,145.062,0.046,0.03
1,1,AgouraHills,34.153333,-118.761667,281.0,922.0,20330,7.822,7.793,0.029,20.260,20.184,0.076,0.37
2,2,Alameda,37.756111,-122.274444,,33.0,75467,22.960,10.611,12.349,59.465,27.482,31.983,53.79
3,3,Albany,37.886944,-122.297778,,43.0,18969,5.465,1.788,3.677,14.155,4.632,9.524,67.28
4,4,Alhambra,34.081944,-118.135000,150.0,492.0,83089,7.632,7.631,0.001,19.766,19.763,0.003,0.01
5,5,AlisoViejo,33.575000,-117.725556,127.0,417.0,47823,7.472,7.472,0.000,19.352,19.352,0.000,0.00
6,6,Alturas,41.487222,-120.542500,1332.0,4370.0,2827,2.449,2.435,0.014,6.342,6.306,0.036,0.57
7,7,AmadorCity,38.419444,-120.824167,280.0,919.0,185,0.314,0.314,0.000,0.813,0.813,0.000,0.00
8,8,AmericanCanyon,38.168056,-122.252500,14.0,46.0,19454,4.845,4.837,0.008,12.548,12.527,0.021,0.17
9,9,Anaheim,33.836111,-117.889722,48.0,157.0,336000,50.811,49.835,0.976,131.600,129.073,2.527,1.92


In [14]:
# Keep only the columns of interest, then show the cities whose total
# population exceeds 31,000.
df2 = df.loc[:, ['city', 'elevation_m', 'population_total']]

df2.loc[df2['population_total'] > 31000]

Unnamed: 0,city,elevation_m,population_total
0,Adelanto,875.00,31765
2,Alameda,,75467
4,Alhambra,150.00,83089
5,AlisoViejo,127.00,47823
9,Anaheim,48.00,336000
12,Antioch,13.00,107100
13,AppleValley,898.00,69135
14,Arcadia,147.00,56364
25,Azusa,186.00,46361
26,Bakersfield,,347483
