# Class 4: Let's do some work with Pandas

![Panda](https://c.tenor.com/HjWiWdQbvd0AAAAM/cute-bear.gif)


In [None]:
import pandas as pd
import numpy as np

### Series

In [None]:
# Raw GDP observations, stored under the label they belong to.
gdp = dict(GDP=[5974.7, 10031.0, 14681.1])


In [None]:
# Pass the list of values, not the enclosing dict. pd.Series(gdp) would use
# the dict key as the index and produce a single row whose value is the whole
# list, instead of one row per GDP observation.
gdp_s = pd.Series(gdp["GDP"], name='GDP')
gdp_s

In [None]:
# A Series can be built from a NumPy array just as well as from a list.

cpi = np.array([127.5, 169.3, 217.488])
cpi_s = pd.Series(data=cpi, name='CPI')
cpi_s

### From series to dataframes

In [None]:
# Plain Python lists work as Series input too. These two Series are the
# building blocks for the first DataFrame.
year = [1990, 2000, 2010]
country = ["US"] * 3
year_s = pd.Series(data=year, name='Year')
country_s = pd.Series(data=country, name='Country')


In [None]:
# axis=1 places the Series side by side, one column per Series (labelled with
# each Series' name), which gives a DataFrame.
Series_Df = pd.concat([year_s,country_s],axis=1)
Series_Df

In [None]:
# axis=0 stacks the Series end to end instead, so the result is one long
# Series rather than a two-column table. Note that this rebinds Series_Df,
# replacing the DataFrame built with axis=1.
Series_Df = pd.concat([year_s,country_s],axis=0)
Series_Df

In [None]:
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
data = dict(
    GDP=[5974.7, 10031.0, 14681.1],
    CPI=[127.5, 169.3, 217.488],
    Year=[1990, 2000, 2010],
    Country=["US", "US", "US"],
)

df = pd.DataFrame(data=data)

In [None]:
df

## Breaking up a dataframe

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels, returned as a pandas Index object.
df.columns

In [None]:
# The same labels as a plain Python list.
df.columns.tolist()

In [None]:
# Row labels. No index was supplied when df was built, so this is the default
# 0..n-1 range.
df.index

In [None]:
# Row labels as a plain Python list.
df.index.tolist()

In [None]:
# Small country-year panel: three years (1990, 2000, 2010) each for CHN and FRA.
pwt_data = dict(
    countrycode=['CHN'] * 3 + ['FRA'] * 3,
    pop=[1124.8, 1246.8, 1318.2, 58.2, 60.8, 64.7],
    rgdpe=[2.611, 4.951, 11.106, 1.294, 1.753, 2.032],
    year=[1990, 2000, 2010] * 2,
)

pwt = pd.DataFrame(data=pwt_data)

a) What are the dimensions of pwt?

b) What dtypes are the variables? What do they mean?




In [None]:
# Answer to (a): the dimensions as (rows, columns).
pwt.shape

In [None]:
pwt

In [None]:
# Answer to (b): the storage type pandas inferred for each column.
pwt.dtypes

## Columns

In [None]:
df

In [None]:
# Bracket selection with a single label returns that column as a Series.
df['CPI']

In [None]:
# Attribute access is shorthand for df['CPI']. It only works when the label is
# a valid Python identifier that does not clash with a DataFrame attribute.
df.CPI

In [None]:
# Positional selection: all rows, column at position 1 (the second column, CPI).
df.iloc[:,1]

In [None]:
# A list of labels selects several columns and returns a DataFrame.
df[["CPI","Country"]]

In [None]:
# The same two columns picked by position (1 is CPI, 3 is Country).
df.iloc[:, [1,3]]

In [None]:
# Assigning to .columns relabels every column at once. The list needs one
# entry per existing column, in the same order. This modifies df itself.
df.columns = ["gdp", "cpi", "year", "country"]
df

In [None]:
# Derive the new labels from the current ones: upper-case each column name.
df.columns = [var.upper() for var in df.columns]
df

In [None]:
# rename() changes only the labels listed in the mapping and returns a new
# frame, so the result is assigned back to df.
df = df.rename(columns = {"GDP":"NGDP"})

df

In [None]:
# The list of labels to select can be kept in a variable and reused.
namelist = ["NGDP","CPI"]

df[namelist]

## Playing with rows

In [None]:
pwt

In [None]:
# Row at position 1 (the second row), returned as a Series indexed by column
# name. The trailing comma is optional: pwt.iloc[1] selects the same row.
pwt.iloc[1,]

In [None]:
# Positional slice: rows 0 and 1 (the end position 2 is excluded).
pwt.iloc[0:2,]

In [None]:
# A boolean mask inside .loc keeps the rows where the condition is True.
pwt.loc[pwt['year']==2000]

In [None]:
# Shorthand for the .loc form: a boolean Series inside [] also filters rows.
pwt[pwt['year']==2000]

In [None]:
# The mask on its own: a boolean Series with one True/False per row.
pwt['year']==2000

## Using values as indices

In [None]:
# Returns a new frame with 'year' moved from the columns into the index.
# The result is not assigned, so pwt itself is unchanged.
pwt.set_index(["year"])

In [None]:
# Still the original layout: set_index returned a new frame and left pwt alone.
pwt

In [None]:
# With year as the index, .loc[2000] looks rows up by year value. Both rows
# labelled 2000 (one per country) are returned.
pwt.set_index(["year"]).loc[2000]


In [None]:
# inplace=False is the default: the call returns a new frame and pwt is untouched.
pwt.set_index(["year"], inplace = False)
# inplace=True would instead modify pwt itself and return None.


In [None]:
# Unchanged again: with inplace=False the original frame is not modified.
pwt

In [None]:
# inplace=True modifies pwt directly (the call returns None), so 'year' is now
# the index. Re-running this cell without resetting the index first raises a
# KeyError, because 'year' is no longer a column.
pwt.set_index(["year"], inplace = True)
pwt

In [None]:
# Undo the set_index: reset_index() moves 'year' back into the columns and
# restores the default integer index. It returns a new frame, hence the
# reassignment.

pwt=pwt.reset_index()

In [None]:
pwt

**Exercise.** How would you extract all rows after 1990?

In [None]:
# The years in this data are 1990, 2000 and 2010, so "after 1990" is the same
# as "2000 or later".
pwt.loc[pwt['year']>=2000]

In [None]:
# The most direct translation of "after 1990".
pwt.loc[pwt['year']>1990]

In [None]:
# Gives the same rows here, but only because 1990 is the earliest year in the
# data; if earlier years were present, != 1990 would keep them too.
pwt.loc[pwt['year']!=1990]

In [None]:
# Build the row positions with a list comprehension, then hand them to iloc.
# The positions come from the data (rows whose year is after 1990) instead of
# being typed in by hand, so the cell keeps working if rows are added or
# reordered. For the current data this yields positions [1, 2, 4, 5].


pwt.iloc[[pos for pos, yr in enumerate(pwt['year']) if yr > 1990]]


In [None]:
# range(1, 3) yields 1 and 2: the stop value is excluded.
list(range(1,3))

In [None]:
# Likewise 4 and 5.
list(range(4,6))

In [None]:
# + on lists concatenates them: [1, 2, 4, 5].
list(range(1,3))+list(range(4,6))

In [None]:
# iloc accepts a plain list of row positions, so the concatenated list can be
# passed straight in.
pwt.iloc[list(range(1,3))+list(range(4,6)),]

In [None]:
# Alternative using concat: slice out each block of rows by position, then
# stack the pieces vertically (axis=0).

pd.concat([pwt.iloc[1:3,], pwt.iloc[4:6,]], axis=0)

In [None]:
# First piece: positions 1 and 2 (CHN in 2000 and 2010).
pwt.iloc[1:3,]

In [None]:
# Second piece: positions 4 and 5 (FRA in 2000 and 2010).
pwt.iloc[4:6,]

In [None]:
# The two pieces stacked back together; the original row labels are kept.
pd.concat([pwt.iloc[1:3,], pwt.iloc[4:6,]], axis=0)

### Remove Stuff by Column or Row


In [None]:
# Rebuild df from the original dict, which undoes the column renames made in
# the "Columns" section.
df=pd.DataFrame(data)
df

In [None]:
# axis=1 drops a column. drop() returns a new frame; df is not modified.
df.drop("CPI", axis = 1)

In [None]:
# CPI is still present: the result of drop() was not assigned back to df.
df

In [None]:
# axis=0 drops a row by its index label (here label 0, the 1990 row).
df.drop(0, axis = 0)

In [None]:
# Row 0 is still present: the result of drop() was not assigned back to df.
df

**Exercise.** How would you drop one year from the data set?

In [None]:
# Make Year the index so drop() can target the label 2000, then move Year back
# into the columns. Year ends up as the first column of the result.
df.set_index(['Year']).drop(2000, axis=0).reset_index()

In [None]:
# Step 1 of the chain: Year becomes the index.
df.set_index(['Year'])

In [None]:
# Step 2: drop the row whose index label is 2000.
df.set_index(['Year']).drop(2000, axis=0)

In [None]:
# After reset_index() the two remaining rows are relabelled 0 (1990) and
# 1 (2010), so the extra drop(1) removes the 2010 row and only 1990 is left.
df.set_index(['Year']).drop(2000, axis=0).reset_index().drop(1,axis=0)

## Conditional Selection


In [None]:
df

In [None]:
# Keep only the rows whose CPI exceeds 170; df.CPI>170 is the boolean mask.
df[df.CPI>170]

In [None]:
# The same filter, written with bracket column access.
df[df['CPI']>170]

In [None]:
# .loc[rows, columns] filters rows with the mask and picks columns by label
# in a single step.
df.loc[df.CPI>170,['GDP','Year']]

## Calculations on a Dataframe


In [None]:
# Storage type of a single column.
df["GDP"].dtypes


In [None]:
# Arithmetic between two Series is element-wise, matched up by index label.
df["GDP"] + df["GDP"]

In [None]:
df["GDP"] / df["CPI"] # nominal GDP deflated by CPI, i.e. real GDP

In [None]:
# Express GDP as an index with the first observation equal to 100.
# .iloc[0] takes the first row by position, so the calculation does not depend
# on a row labelled 0 existing (for example after rows were dropped or the
# index was changed).
100*df["GDP"] / df["GDP"].iloc[0]

In [None]:
df

In [None]:
# Assigning to a label that does not exist yet adds a new column to df.
df['RGDP'] = df['GDP']/df['CPI']

In [None]:
df

In [None]:
# A Series combined with a scalar applies the operation to every element.
df['GDP_div_1000'] = df['GDP'] / 1000

In [None]:
df

### Operations across rows/columns


In [None]:
# axis=0 collapses the rows: one total per column.
# NOTE: Country holds strings, so its "sum" is the strings joined together,
# not a number.
df.sum(axis=0)

In [None]:
# axis=1 collapses the columns: one total per row.
# numeric_only=True leaves out the string column Country; without it, recent
# pandas versions raise TypeError when trying to add numbers and strings.
df.sum(axis=1, numeric_only=True)

In [None]:
# Variance of each column. numeric_only=True skips the string column Country,
# for which a variance is undefined (pandas >= 2.0 raises TypeError otherwise).
df.var(axis=0, numeric_only=True)


In [None]:
# Variance across the numeric columns of each row. numeric_only=True skips the
# string column Country (pandas >= 2.0 raises TypeError otherwise).
df.var(axis=1, numeric_only=True)

**Exercise.** Can you select the year 2010 and compute the row sum of df?

In [None]:
# Filter to the 2010 row first, then add up its numeric columns.
# numeric_only=True leaves out the string column Country, which cannot be
# added to numbers (recent pandas versions raise TypeError otherwise).
df[df['Year']==2010].sum(axis=1, numeric_only=True)

## Simple Statistics


In [None]:
# Mean of each column. numeric_only=True skips the string column Country,
# which has no mean (pandas >= 2.0 raises TypeError otherwise).
df.mean(axis=0, numeric_only=True)

In [None]:
# Wrap the Series of column means in a one-column DataFrame; its rows are
# labelled with the original column names. numeric_only=True skips the string
# column Country, which has no mean (pandas >= 2.0 raises TypeError otherwise).
test = pd.DataFrame(df.mean(axis=0, numeric_only=True))

test

In [None]:
# The rows of test are labelled with column names, so .loc["CPI"] picks out
# the mean CPI.
test.loc["CPI"]

In [None]:
# describe() builds a table of summary statistics (count, mean, std, min,
# quartiles, max) for the numeric columns.
sumstate = df.describe()
sumstate

In [None]:
# Check what kind of object describe() returned.
type(sumstate)


## Output and save

In [None]:
# Save pwt to disk as CSV and as an Excel workbook. index=False leaves out the
# row index, which is only the default 0..n-1 counter after the earlier
# reset_index() and would otherwise be written as an extra unnamed column.
pwt.to_csv("pwt.csv", index=False)

# NOTE: writing .xlsx requires an Excel writer engine (e.g. openpyxl) to be
# installed alongside pandas.
pwt.to_excel("pwt.xlsx", index=False)