## Data Summarization
#### Using Excel & Python

In [1]:
import pandas as pd
# Read the course CSV into the working DataFrame used by the cells below.
DATA_FILE = 'webautomation_coursera.csv'
df = pd.read_csv(DATA_FILE)

In [2]:
df.shape  # (number of rows, number of columns) of the loaded frame

(242, 16)

In [3]:
# Show every column label as a plain Python list.
list(df.columns)

['url',
 'title',
 'associated-university-institution-company',
 'type',
 'image',
 'category-subject-area',
 'certificate-is-available',
 'description',
 'duration',
 'language',
 'level',
 'prerequisites',
 'price',
 'rating',
 'syllabus',
 'timestamp']

#### Basic Statistics w/ Pandas

1. get basic descriptive statistics of a numeric feature

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for one numeric column.
# The key was previously misspelled as 'life_sp'; the column is called 'life_sq'
# everywhere else in this notebook (see the "Querying Data" cell).
# NOTE(review): df_nooutlier is only created in the "Querying Data" cell further
# down, so on a fresh kernel that cell must run before this one — consider
# moving it above this section.
df_nooutlier['life_sq'].describe()

2. get another descriptive value of the feature

In [None]:
# Print the median with a label; the label previously read "Media" although the
# value printed is the median.
# NOTE(review): `sunspots` is not defined anywhere in the visible notebook —
# presumably a numeric Series from another exercise; confirm, or substitute a
# numeric column of `df`.
print("Median", sunspots.median(), "\n") #choose a col w/ numeric values
df['Total_Revenue'].describe() #will output stats such as count, mean, std, min, etc.

#### Data Aggregation: grouping data by the values in one or more columns

In [None]:
# Split the rows into groups keyed by their 'ItemType' value.
df_byitemtype = df.groupby(by='ItemType')
# Sum of 'Total_Revenue' within each item-type group.
df_byitemtype['Total_Revenue'].sum()
# Passing a list of keys groups on every (Team, Position) combination.
df.groupby(by=['Team', 'Position'])

#### Pivot Tables: a table of grouped values that aggregates the individual items of a more extensive table within one or more discrete categories

In [None]:
# Create a pivot table to aggregate data: total revenue per item type (rows),
# broken out by region (columns).
# pd.pivot_table expects the plural keywords `values=` and `columns=`; the
# singular spellings used before raise a TypeError.
pivot1 = pd.pivot_table(
    df,
    values=['Total_Revenue'],
    index=['ItemType'],
    columns=['Region'],
    aggfunc='sum',
    sort=True,
)
print(pivot1) #shows total revenue for each region by item type

# The aggregations below are given by name ('sum', 'mean') rather than as
# np.sum / np.mean, so this cell does not depend on numpy being imported.

#Pivot table with both columns and indexes
table2 = pd.pivot_table(df, index=['Weather'], columns=['Food'], values='Number', aggfunc='sum')

#Pivot table with two values
table3 = pd.pivot_table(df, index=['Weather'], values=['Number', 'Price'], aggfunc={'Price': 'mean', 'Number': 'sum'})

#Pivot table with totals (margins=True appends an "All" row with the overall aggregates)
table4 = pd.pivot_table(df, index=['Weather'], values=['Number', 'Price'], aggfunc={'Price': 'mean', 'Number': 'sum'}, margins=True)

#### Joining DataFrames

In [None]:
#inner join selects rows from two tables, if and only if values match, for cols specified in the join condition
address = pd.DataFrame({'EmpNr': [5, 3, 9], 'Dest': ['Carrollton', 'Bremen', 'Villa Rica']})
salary = pd.DataFrame({'EmpNr': [5, 9, 7], 'Amount': [10, 5, 2.5]})
print(address)
print(salary)

# Name the join key explicitly: without `on=`, merge joins on every column name
# the two frames share, so the result would silently change if another shared
# column were added later.
dest_tips = pd.merge(address, salary, how='inner', on='EmpNr')
print(dest_tips)

#another ex.
inner_join = pd.merge(address, salary, how='inner', on='EmpNr')
print(inner_join) #should return two rows (EmpNr 5 and 9 appear in both frames)

In [None]:
#outer joins do not require a match and can potentially return more rows
# The join key is named explicitly so the merge does not depend on which column
# names the two frames happen to share.
outer_join = pd.merge(address, salary, how='outer', on='EmpNr')
print(outer_join) #should show 4 rows; can include null/NaN values

#### Querying Data

In [None]:
# Boolean indexing: build a True/False mask, then keep only the rows where it is True.
# NOTE(review): df_deldup is not defined in the visible notebook — presumably a
# de-duplicated frame from an earlier step; confirm.
life_sq_below_1000 = df_deldup['life_sq'] < 1000
df_nooutlier = df_deldup[life_sq_below_1000]

# .loc selects by index label (.iloc would select by integer position instead).
df_nooutlier.loc[8135:8949, 'life_sq']
# .loc also accepts a boolean mask as the row selector.
id_in_window = (df_nooutlier['id'] > 8135) & (df_nooutlier['id'] < 8391)
df_nooutlier.loc[id_in_window, 'life_sq']

# .query() takes the filter condition as a string expression.
df_nooutlier.query('id ==8059')
df_nooutlier.query('life_sq < 30')