# Documentación de Pandas

https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset straight from the zip archive; the first row is the header.
df = pd.read_csv(
    'archive.zip',
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
)

In [None]:
# Reproducible random sample of 20 rows (fixed seed).
df.sample(20, random_state=1)

In [None]:
# Price-weighted random sample of 20 rows (fixed seed).
# Sampling weights must be numeric, but at this point 'price' still holds
# currency text, so passing weights='price' fails. Build numeric weights first.
# NOTE(review): assumes prices look like "$1,234" - confirm against the data.
_price_weights = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[$,\s]', '', regex=True),
    errors='coerce',
).fillna(0)  # missing or unparseable prices get zero weight
df.sample(n=20, random_state=1, weights=_price_weights)

In [None]:
# Only the last expression of a cell is rendered automatically, so print the
# preview explicitly; otherwise the head() result is silently discarded.
print(df.head())
# Column dtypes plus memory usage; "deep" also measures the contents of object columns.
df.info(memory_usage="deep")

In [None]:
# Rows that actually carry a license value.
df.loc[df['license'].notna()]

In [None]:
# low_memory is only honoured by the C parser engine, which is the pandas default.
# The pyarrow engine is opt-in, and the pure-Python engine is more flexible but slower.
# read_csv does not turn text into dates by itself, so the date columns must be
# listed explicitly through parse_dates.
parse_dates = ['last review']
# Read 'license' as text instead of letting pandas infer its type.
dtypes = {'license': str}
df = pd.read_csv(
    'archive.zip',
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
    dtype=dtypes,
    parse_dates=parse_dates,
)
df.head()

In [None]:
# Re-check dtypes and deep memory usage after the typed reload.
df.info(memory_usage='deep')

In [None]:
# Null count per column, highest first.
df_nulls = df.isnull().sum().sort_values(ascending=False).to_frame('count')
# Total number of rows, repeated on every line so the ratio can be read off the table.
df_nulls['total'] = len(df)
# Percentage of missing values per column. Scaled to 0-100 so the values match
# the '%' label; the unscaled ratio is only a 0-1 fraction.
df_nulls['%'] = df_nulls['count'] / df_nulls['total'] * 100
df_nulls

In [None]:
# Discard the 'license' column in place.
df.drop(columns=['license'], inplace=True)

In [None]:
# Dtypes and deep memory usage now that 'license' has been removed.
df.info(memory_usage='deep')

In [None]:
# Quick outlier check: minimum and maximum of every column.
# Use the string aliases rather than the Python builtins: passing the builtin
# min/max to agg is deprecated in pandas >= 2.1 and emits a FutureWarning.
df.agg(['min', 'max'])

In [None]:
# Convert 'price' from currency text to a number.
# Strip the dollar sign, thousands separators and whitespace before parsing:
# with errors='coerce', a value that still contains a comma (e.g. "$1,200")
# would silently become NaN instead of 1200.
# astype(str) keeps the cell re-runnable once the column is already numeric.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[$,\s]', '', regex=True),
    errors='coerce',
)

In [None]:
# Convert 'service fee' from currency text to a number.
# Strip the dollar sign, thousands separators and whitespace before parsing:
# with errors='coerce', a value that still contains a comma (e.g. "$1,200")
# would silently become NaN instead of 1200.
# astype(str) keeps the cell re-runnable once the column is already numeric.
df['service fee'] = pd.to_numeric(
    df['service fee'].astype(str).str.replace(r'[$,\s]', '', regex=True),
    errors='coerce',
)

In [None]:
# Re-check the per-column minimum and maximum after the currency conversion.
# String aliases are used because passing the builtin min/max to agg is
# deprecated in pandas >= 2.1 and emits a FutureWarning.
df.agg(['min', 'max'])

In [None]:
# Rows whose last review is dated after 2022.
df[df['last review'].dt.year > 2022]

In [None]:
# Remove only the rows whose last review is dated after 2022.
# Negating the "> 2022" test (instead of testing "<= 2022") keeps rows with a
# missing review date; a missing year compares False either way, so the
# "<= 2022" form would discard those rows as well.
# NOTE(review): confirm that rows without a review date should be kept.
# .copy() detaches the result from the original frame so the in-place edits in
# later cells do not trigger SettingWithCopyWarning.
df = df[~(df['last review'].dt.year > 2022)].copy()
df

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Number of rows repeating an earlier (host name, lat, long) combination.
df.duplicated(['host name', 'lat', 'long']).sum()

In [None]:
# Keep the first row of each (host name, lat, long) combination, dropping the rest in place.
df.drop_duplicates(
    subset=['host name', 'lat', 'long'],
    keep='first',
    inplace=True,
)

In [None]:
# Dtypes and deep memory usage after removing duplicates.
df.info(memory_usage='deep')

In [None]:
# Distinct values present in 'country' (missing values included).
pd.unique(df['country'])

In [None]:
# Distinct values present in 'country code' (missing values included).
pd.unique(df['country code'])

In [None]:
# Distinct values present in 'neighbourhood group' (missing values included).
pd.unique(df['neighbourhood group'])

In [None]:
# Fill the missing country name and country code with the United States values.
df['country'] = df['country'].fillna('United States')
df['country code'] = df['country code'].fillna('US')

In [None]:
# Load the country reference table; the first row is the header.
df_countries = pd.read_csv(
    'countries.csv',
    header=0,
    sep=',',
    quotechar='"',
)
df_countries

In [None]:
# Remove the lower-case short-name column from the reference table in place.
df_countries.drop(columns=['English short name lower case'], inplace=True)
df_countries

In [None]:
# Join the listings with the country reference table on the two-letter country code.
# An inner join keeps only the rows whose code exists in both tables.
merged_df = df.merge(
    df_countries,
    how="inner",
    left_on='country code',
    right_on='Alpha-2 code',
)
merged_df

In [None]:
# Mean of every numeric column per country.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# text columns silently and raises TypeError instead.
df.groupby(by=['country']).mean(numeric_only=True)

In [None]:
# Mean of every numeric column per neighbourhood group.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# text columns silently and raises TypeError instead.
df.groupby(by=['neighbourhood group']).mean(numeric_only=True)

In [None]:
# Average service fee for each neighbourhood group.
df.groupby('neighbourhood group')['service fee'].mean()

In [None]:
# Correct the misspelled neighbourhood group names, then recompute the average service fee.
df['neighbourhood group'] = df['neighbourhood group'].replace(
    {'brookln': 'Brooklyn', 'manhatan': 'Manhattan'}
)
df.groupby(by=['neighbourhood group'])['service fee'].mean()

In [None]:
# Mean, minimum and maximum service fee per neighbourhood group.
# The aggregations are given as a list rather than a set: a set has no defined
# order, so the order of the result columns could change from run to run.
df.groupby(by=['neighbourhood group']).agg({'service fee': ['mean', 'min', 'max']})