### **17 - Agregar una nueva columna**

In [12]:
import pandas as pd
import numpy as np

#### **`Ejemplos varios`**

In [None]:
df = pd.read_csv('flights.csv')
df

##### Ejemplo 1

Agregamos una nueva columna al Dataframe con un valor numerico

In [48]:
df['nueva_columna'] = 1

df

Unnamed: 0,year,month,passengers,nueva_columna
0,1949,January,112,1
1,1949,February,118,1
2,1949,March,132,1
3,1949,April,129,1
4,1949,May,121,1
...,...,...,...,...
139,1960,August,606,1
140,1960,September,508,1
141,1960,October,461,1
142,1960,November,390,1


##### Ejemplo 2

In [49]:
for idx, row in df.iterrows():
    df.at[idx, 'nueva_columna_2'] = 2

df

Unnamed: 0,year,month,passengers,nueva_columna,nueva_columna_2
0,1949,January,112,1,2.0
1,1949,February,118,1,2.0
2,1949,March,132,1,2.0
3,1949,April,129,1,2.0
4,1949,May,121,1,2.0
...,...,...,...,...,...
139,1960,August,606,1,2.0
140,1960,September,508,1,2.0
141,1960,October,461,1,2.0
142,1960,November,390,1,2.0


##### Ejemplo 3

Agrego una columna con la media de los pasajeros en cada fila

In [50]:
media_pasajeros = df['passengers'].mean()

df['media_passengers'] = media_pasajeros

df

Unnamed: 0,year,month,passengers,nueva_columna,nueva_columna_2,media_passengers
0,1949,January,112,1,2.0,280.298611
1,1949,February,118,1,2.0,280.298611
2,1949,March,132,1,2.0,280.298611
3,1949,April,129,1,2.0,280.298611
4,1949,May,121,1,2.0,280.298611
...,...,...,...,...,...,...
139,1960,August,606,1,2.0,280.298611
140,1960,September,508,1,2.0,280.298611
141,1960,October,461,1,2.0,280.298611
142,1960,November,390,1,2.0,280.298611


#### **`Columna condicional`**

##### **`>` map( )**

In [23]:
df = pd.read_csv('worldcup2014.csv')
df.head()

Unnamed: 0,team,continent,group,group_pos,cantidad_juegos,juegos_ganados,juegos_empatados,juegos_perdidos,goles_favor,goles_contra,goles_diferencia,puntos,clasificado
0,Brazil,southamerica,A,1,3,2,1,0,7,2,5,7,1
1,Mexico,northamerica,A,2,3,2,1,0,4,1,3,7,1
2,Croatia,europe,A,3,3,1,0,2,6,6,0,3,0
3,Cameroon,africa,A,4,3,0,0,3,1,9,−8,0,0
4,Netherlands,europe,B,1,3,3,0,0,10,3,7,9,1


In [21]:
# El resto de paises tomara un valor NaN
map_dictionary = {'Brazil':"Brasilia", 'Mexico': "DF", 'Croatia': "Zagreb", 'Netherlands': "Amsterdam", 'Cameroon': "Yande"}

df["capital"] = df["team"].map(map_dictionary)
df.head()

Unnamed: 0,team,continent,group,group_pos,cantidad_juegos,juegos_ganados,juegos_empatados,juegos_perdidos,goles_favor,goles_contra,goles_diferencia,puntos,clasificado,capital
0,Brazil,southamerica,A,1,3,2,1,0,7,2,5,7,1,Brasilia
1,Mexico,northamerica,A,2,3,2,1,0,4,1,3,7,1,DF
2,Croatia,europe,A,3,3,1,0,2,6,6,0,3,0,Zagreb
3,Cameroon,africa,A,4,3,0,0,3,1,9,−8,0,0,Yande
4,Netherlands,europe,B,1,3,3,0,0,10,3,7,9,1,Amsterdam


PODRIA UNIR ESTA LISTA DE PAISES CON SUS RESPECTIVAS CAPITALES EXTRAIDAS DESDE ALGUNA API 
Y TRANSFORMARLO EN UN DICCIONARIO PARA TRABAJARLO CON EL METODO 'map( )'
(LO DEJO PENDIENTE)

In [6]:
# dic = []
# for d in df.team.unique():
#     dic.append(d)

pais = [pais for pais in df.team.unique()]
len(pais)

32

##### **`>` if - else**

In [22]:
list_of_dates = [
    "2019-11-20",
    "2020-01-02",
    "2020-02-05",
    "2020-03-10",
    "2020-04-16",
    "2020-05-01",
]
employees = ["Hisila", "Shristi", "Zeppy", "Alina", "Jerry", "Kevin"]
salary = [200, 400, 300, 500, 600, 300]
df = pd.DataFrame(
    {"Name": employees, "Joined date": pd.to_datetime(list_of_dates), "Salary": salary}
)

df

Unnamed: 0,Name,Joined date,Salary
0,Hisila,2019-11-20,200
1,Shristi,2020-01-02,400
2,Zeppy,2020-02-05,300
3,Alina,2020-03-10,500
4,Jerry,2020-04-16,600
5,Kevin,2020-05-01,300


In [23]:
df["Status"] = ["Senior" if s >= 400 else "Junior" for s in df["Salary"]]
df

Unnamed: 0,Name,Joined date,Salary,Status
0,Hisila,2019-11-20,200,Junior
1,Shristi,2020-01-02,400,Senior
2,Zeppy,2020-02-05,300,Junior
3,Alina,2020-03-10,500,Senior
4,Jerry,2020-04-16,600,Senior
5,Kevin,2020-05-01,300,Junior


##### **`>` Numpy where( )**

In [16]:
list_of_dates = [
    "2019-11-20",
    "2020-01-02",
    "2020-02-05",
    "2020-03-10",
    "2020-04-16",
    "2020-05-01",
]
employees = ["Hisila", "Shristi", "Zeppy", "Alina", "Jerry", "Kevin"]
salary = [200, 400, 300, 500, 600, 300]
df = pd.DataFrame(
    {"Name": employees, "Joined date": pd.to_datetime(list_of_dates), "Salary": salary}
)

df

Unnamed: 0,Name,Joined date,Salary
0,Hisila,2019-11-20,200
1,Shristi,2020-01-02,400
2,Zeppy,2020-02-05,300
3,Alina,2020-03-10,500
4,Jerry,2020-04-16,600
5,Kevin,2020-05-01,300


In [25]:
df["Status_b"] = np.where(df["Salary"] >= 400, "Senior", "Junior")
df

Unnamed: 0,Name,Joined date,Salary,Status,Status_b
0,Hisila,2019-11-20,200,Junior,Junior
1,Shristi,2020-01-02,400,Senior,Senior
2,Zeppy,2020-02-05,300,Junior,Junior
3,Alina,2020-03-10,500,Senior,Senior
4,Jerry,2020-04-16,600,Senior,Senior
5,Kevin,2020-05-01,300,Junior,Junior


##### **`>` Pandas where( )**

In [21]:
list_of_dates = [
    "2019-11-20",
    "2020-01-02",
    "2020-02-05",
    "2020-03-10",
    "2020-04-16",
    "2020-05-01",
]
employees = ["Hisila", "Shristi", "Zeppy", "Alina", "Jerry", "Kevin"]
salary = [200, 400, 300, 500, 600, 300]
df = pd.DataFrame(
    {"Name": employees, "Joined date": pd.to_datetime(list_of_dates), "Salary": salary}
)

df

Unnamed: 0,Name,Joined date,Salary
0,Hisila,2019-11-20,200
1,Shristi,2020-01-02,400
2,Zeppy,2020-02-05,300
3,Alina,2020-03-10,500
4,Jerry,2020-04-16,600
5,Kevin,2020-05-01,300


In [22]:
# Manten todos los valores iguales o mayor a 400 y lo demas dale el valor de 'Junior'
df["Salary_c"] = df["Salary"].where(lambda x: x >= 400, 'Junior')
df.head()

Unnamed: 0,Name,Joined date,Salary,Salary_c
0,Hisila,2019-11-20,200,Junior
1,Shristi,2020-01-02,400,400
2,Zeppy,2020-02-05,300,Junior
3,Alina,2020-03-10,500,500
4,Jerry,2020-04-16,600,600


##### **`>` select( )** [Deprecated]

In [26]:
conditionlist = [
    (df["Salary"] >= 500),
    (df["Salary"] >= 300) & (df["Salary"] < 300),
    (df["Salary"] <= 300),
]
choicelist = ["High", "Mid", "Low"]
df["Salary_c"] = np.select(conditionlist, choicelist, default="Not Specified")

df

Unnamed: 0,Name,Joined date,Salary,Status,Status_b,Salary_c
0,Hisila,2019-11-20,200,Junior,Junior,Low
1,Shristi,2020-01-02,400,Senior,Senior,Not Specified
2,Zeppy,2020-02-05,300,Junior,Junior,Low
3,Alina,2020-03-10,500,Senior,Senior,High
4,Jerry,2020-04-16,600,Senior,Senior,High
5,Kevin,2020-05-01,300,Junior,Junior,Low


##### **`>` replace( )**

In [None]:
df = pd.read_csv('qog_std_cs_jan18.csv')
df.head(5)

Unnamed: 0,ccode,cname,ccodealp,ccodecow,ccodewb,version,aid_cpnc,aid_cpsc,aid_crnc,aid_crnio,...,wvs_relacc,wvs_relsch,wvs_relsci,wvs_satfin,wvs_satlif,wvs_screl,wvs_subh,wvs_survself,wvs_tradrat,wvs_trust
0,4,Afghanistan,AFG,700.0,4.0,QoGStdCSJan18,,,29.0,13.0,...,,,,,,,,,,
1,8,Albania,ALB,339.0,8.0,QoGStdCSJan18,,,26.0,13.0,...,,,,,,,,,,
2,12,Algeria,DZA,615.0,12.0,QoGStdCSJan18,,,21.0,6.0,...,3.710616,1.704692,3.681347,5.988125,6.301029,5.339371,3.805579,-0.409807,-0.498164,0.179286
3,20,Andorra,AND,232.0,20.0,QoGStdCSJan18,,,,,...,,,,,,,,,,
4,24,Angola,AGO,540.0,24.0,QoGStdCSJan18,,,22.0,13.0,...,,,,,,,,,,


In [None]:
hdi_group = df.loc[:, ['cname', 'ccode', 'undp_hdi', 'ht_region']]
hdi_group.shape

(194, 4)

Creo la nueva columna **`region_recod`** reemplazando los valores de la columna **`ht_region`** por otros valores

In [None]:
hdi_group['region_recod'] = hdi_group['ht_region'].replace([1,2,3,4,5,6,7,8,9,10],['EastEurope', 'LatAm', 'NorthAfrica','SubSaharian', 'WesternDem', 'EastAsia','SouthEastAsia','SouthAsia','Pacific','Caribbean'])
hdi_group

Unnamed: 0,cname,ccode,undp_hdi,ht_region,region_recod
0,Afghanistan,4,0.479,8,SouthAsia
1,Albania,8,0.762,1,EastEurope
2,Algeria,12,0.743,3,NorthAfrica
3,Andorra,20,0.857,5,WesternDem
4,Angola,24,0.531,4,SubSaharian
...,...,...,...,...,...
189,Uzbekistan,860,0.697,1,EastEurope
190,Venezuela,862,0.769,2,LatAm
191,Samoa,882,0.702,9,Pacific
192,Yemen,887,0.499,3,NorthAfrica


In [None]:
group_mean = round(hdi_group.groupby('region_recod')['undp_hdi'].mean(),2)
group_mean

region_recod
Caribbean        0.74
EastAsia         0.82
EastEurope       0.78
LatAm            0.72
NorthAfrica      0.75
Pacific          0.64
SouthAsia        0.61
SouthEastAsia    0.70
SubSaharian      0.51
WesternDem       0.90
Name: undp_hdi, dtype: float64