# PANDAS

In [2]:
import numpy as np
import pandas as pd

In [3]:
# np.nan es como un null
s = pd.Series([1,3,5,np.nan, 6,8])

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
s[1]

3.0

In [6]:
s = pd.Series([1,3,5,np.nan, 6,8], index= ["a", "b", "c", "d", "e", "f"])

In [7]:
s

a    1.0
b    3.0
c    5.0
d    NaN
e    6.0
f    8.0
dtype: float64

In [8]:
s = pd.Series([1,3,5,np.nan, 6,8], index= np.arange(2,14,2))

In [9]:
s

2     1.0
4     3.0
6     5.0
8     NaN
10    6.0
12    8.0
dtype: float64

In [10]:
df = pd.DataFrame(np.random.random((10,5)))

In [11]:
colnames = ["Col1", "Col2", "Col3", "Col4", "Col5"]

In [12]:
df = pd.DataFrame(np.random.random((10,5)), columns = colnames)

In [13]:
df.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5
0,0.497879,0.369834,0.575738,0.559787,0.239015
1,0.634556,0.419047,0.156435,0.171696,0.133442
2,0.369912,0.394515,0.868813,0.989803,0.446776
3,0.102295,0.109475,0.846052,0.854292,0.38481
4,0.443997,0.640383,0.976122,0.422376,0.093824


In [14]:
# para seleccionar solo una columna
df["Col1"]

0    0.497879
1    0.634556
2    0.369912
3    0.102295
4    0.443997
5    0.679038
6    0.452344
7    0.620700
8    0.619924
9    0.818815
Name: Col1, dtype: float64

In [15]:
# para seleccionar más de una columna

In [16]:
df[["Col1", "Col2", "Col3"]]

Unnamed: 0,Col1,Col2,Col3
0,0.497879,0.369834,0.575738
1,0.634556,0.419047,0.156435
2,0.369912,0.394515,0.868813
3,0.102295,0.109475,0.846052
4,0.443997,0.640383,0.976122
5,0.679038,0.60233,0.675324
6,0.452344,0.295904,0.983546
7,0.6207,0.280208,0.631331
8,0.619924,0.519696,0.779067
9,0.818815,0.851234,0.957978


In [17]:
# quiero cambiar el nombre de las filas a fechas, y queremos que cada paso en el range sea 1 día (periods). Por defecto es un día
df.index=pd.date_range("2019/01/30", periods=df.shape[0])

In [18]:
df

Unnamed: 0,Col1,Col2,Col3,Col4,Col5
2019-01-30,0.497879,0.369834,0.575738,0.559787,0.239015
2019-01-31,0.634556,0.419047,0.156435,0.171696,0.133442
2019-02-01,0.369912,0.394515,0.868813,0.989803,0.446776
2019-02-02,0.102295,0.109475,0.846052,0.854292,0.38481
2019-02-03,0.443997,0.640383,0.976122,0.422376,0.093824
2019-02-04,0.679038,0.60233,0.675324,0.255124,0.829835
2019-02-05,0.452344,0.295904,0.983546,0.912587,0.81124
2019-02-06,0.6207,0.280208,0.631331,0.314175,0.002852
2019-02-07,0.619924,0.519696,0.779067,0.475898,0.904789
2019-02-08,0.818815,0.851234,0.957978,0.480859,0.280226


In [19]:
#si necesitas ayuda sobre algún comando (por ejemplo, date_range)
help(pd.date_range)

Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D'
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight before generating dat

### Convertir listas o diccionarios

In [20]:
lst = [4985792387, 9834798274, 948759238745, 39847593857, 983475]

price_df = pd.DataFrame(lst, columns = ["Sale_Price"])

In [21]:
price_df

Unnamed: 0,Sale_Price
0,4985792387
1,9834798274
2,948759238745
3,39847593857
4,983475


In [22]:
price_df['2']

KeyError: '2'

In [23]:
#crear un dataframe de una lista de listas
lst_lst = [[8450, 'CollgCr', 2003, 7, 208500],
          [9600, 'Veenker', 1976, 6, 181500],
          [11250, 'CollgCr', 2001, 7, 223500],
          [9550, 'Crawfor', 1915, 7, 140000],
          [14260, 'NoRidge', 2000, 8, 250000],
          [14115, 'Mitchel', 1993, 5, 143000],
          [10084, 'Somerst', 2004, 8, 307000],
          [10382, 'NWAmes', 1973, 7, 200000],
          [6120, 'OldTown', 1931, 7, 129900],
          [7420, 'BrkSide', 1939, 5, 118000]]

In [60]:
#definimos los nombres de las columnas
colnames = ["LotSize", "Neighorhood", "YearBuilt", "Quality", "SalePrice"]

In [65]:
lst_df = pd.DataFrame(lst_lst, columns=colnames)

In [66]:
lst_df

Unnamed: 0,LotSize,Neighorhood,YearBuilt,Quality,SalePrice
0,8450,CollgCr,2003,7,208500
1,9600,Veenker,1976,6,181500
2,11250,CollgCr,2001,7,223500
3,9550,Crawfor,1915,7,140000
4,14260,NoRidge,2000,8,250000
5,14115,Mitchel,1993,5,143000
6,10084,Somerst,2004,8,307000
7,10382,NWAmes,1973,7,200000
8,6120,OldTown,1931,7,129900
9,7420,BrkSide,1939,5,118000


In [42]:
#y ahora un diccionario:

house_dict = {'Baker House': [7420, 'BrkSide', 1939, 5, 118000],
             'Beazley House': [14115, 'Mitchel', 1993, 5, 143000],
             'Dominguez House': [14260, 'NoRidge', 2000, 8, 250000],
             'Hamilton House': [6120, 'OldTown', 1931, 7, 129900],
             'James House': [11250, 'CollgCr', 2001, 7, 223500],
             'Martinez House': [9600, 'Veenker', 1976, 6, 181500],
             'Roberts House': [9550, 'Crawfor', 1915, 7, 140000],
             'Smith House': [8450, 'CollgCr', 2003, 7, 208500],
             'Snyder House': [10084, 'Somerst', 2004, 8, 307000],
             'Zuckerman House': [10382, 'NWAmes', 1973, 7, 200000]}

In [43]:
#la opción más sencilla es la siguiente, que coge las keys como índices
pd.DataFrame(house_dict)

Unnamed: 0,Baker House,Beazley House,Dominguez House,Hamilton House,James House,Martinez House,Roberts House,Smith House,Snyder House,Zuckerman House
0,7420,14115,14260,6120,11250,9600,9550,8450,10084,10382
1,BrkSide,Mitchel,NoRidge,OldTown,CollgCr,Veenker,Crawfor,CollgCr,Somerst,NWAmes
2,1939,1993,2000,1931,2001,1976,1915,2003,2004,1973
3,5,5,8,7,7,6,7,7,8,7
4,118000,143000,250000,129900,223500,181500,140000,208500,307000,200000


In [46]:
#para colocarlo bien
house_df = pd.DataFrame(house_dict).transpose()

In [47]:
#para modificar el nombre de las columnas
house_df.columns= colnames

In [48]:
house_df.head()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5
Baker House,7420,BrkSide,1939,5,118000
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500


In [49]:
#otra forma de hacerlo, de manera que ya sabe que estamos importando información de un diccionario

house_df_2=pd.DataFrame.from_dict(house_dict, orient = "index")

In [51]:
house_df_2.head()

Unnamed: 0,0,1,2,3,4
Baker House,7420,BrkSide,1939,5,118000
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500


In [52]:
#la función tail te da lo mismo que head pero empezando por el final
house_df.tail()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5
Martinez House,9600,Veenker,1976,6,181500
Roberts House,9550,Crawfor,1915,7,140000
Smith House,8450,CollgCr,2003,7,208500
Snyder House,10084,Somerst,2004,8,307000
Zuckerman House,10382,NWAmes,1973,7,200000


In [53]:
#también existe la función shape, que significa que hay 10 filas y 5 columnas
house_df.shape

(10, 5)

In [55]:
#la función info te da los tipos de datos e información del dataframe
house_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, Baker House to Zuckerman House
Data columns (total 5 columns):
Col1    10 non-null object
Col2    10 non-null object
Col3    10 non-null object
Col4    10 non-null object
Col5    10 non-null object
dtypes: object(5)
memory usage: 480.0+ bytes


In [67]:
#para contar las ocurrencias de cada valor. 

lst_df["Quality"].value_counts(dropna=False)

7    5
8    2
5    2
6    1
Name: Quality, dtype: int64

In [None]:
house_df.apply(pd.Series.value_counts)

#para cada serie que tenemos en este dataframe quiero mirar las ocurrencias. Hace como un crosstab de todo

In [69]:
#si queremos seleccionar parte de una serie o dataframe, podemos jugar con los índices o con iloc (se usa para buscar por posición)


s.iloc[0]
#nos va a dar el primer valor de la serie s

1.0

In [70]:
#esto me da toda la info de Hamilton House en el house_df
house_df.loc["Hamilton House"]

Col1       6120
Col2    OldTown
Col3       1931
Col4          7
Col5     129900
Name: Hamilton House, dtype: object

In [73]:
df.iloc[0]

Col1    0.313659
Col2    0.704162
Col3    0.504396
Col4    0.627406
Col5    0.458778
Name: 2019-01-30 00:00:00, dtype: float64

In [74]:
#más ejemplos
df.loc["2019/01/30"]

Col1    0.313659
Col2    0.704162
Col3    0.504396
Col4    0.627406
Col5    0.458778
Name: 2019-01-30 00:00:00, dtype: float64

In [75]:
#si solo quiero la primera fila
df.iloc[0,:]

Col1    0.313659
Col2    0.704162
Col3    0.504396
Col4    0.627406
Col5    0.458778
Name: 2019-01-30 00:00:00, dtype: float64

In [76]:
#coordenadas. primera fila y primera columna
df.iloc[0,0]

0.3136587045337602

In [77]:
df.iloc[:2,:2]

Unnamed: 0,Col1,Col2
2019-01-30,0.313659,0.704162
2019-01-31,0.560701,0.936633


In [79]:
#suma de la columna SalePrice 

house_df["SalePrice"].sum()

KeyError: 'Hamilton House'