# my pandas cheatsheet

Easy access codes

### Import the necessary libraries

In [2]:
import pandas as pd

### Import the data

In [40]:
# dataset based on European soccer teams
url_euro_teams = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/02_Filtering_%26_Sorting/Euro12/Euro_2012_stats_TEAM.csv'
euro_teams = pd.read_csv(url_euro_teams, sep = ',')

# dataset based on Chipotle orders
url_chipo = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
chipo = pd.read_csv(url_chipo, sep = '\t')


### Get a feel for what dataset looks like.

In [26]:
# assign which dataset we're looking at
test_data = euro_teams

# See first 10 entries
test_data.head()

# Row count
rows = test_data.shape[0]
print("Number of rows: ", str(rows))

# Column count
columns = test_data.shape[1]
print("Number of columns: ", str(columns))

# Column names
print("Column names: ", str(test_data.columns))

# Indexing method
print("Index method: ", str(chipo.index))

# Data types of all columns
print("Data types for entire dataframe: ")
test_data.info()

# Data type of particular column
print("Data type of Goals column specifically is: ", str(euro_teams.Goals.dtype))


Number of rows:  16
Number of columns:  35
Column names:  Index(['Team', 'Goals', 'Shots on target', 'Shots off target',
       'Shooting Accuracy', '% Goals-to-shots', 'Total shots (inc. Blocked)',
       'Hit Woodwork', 'Penalty goals', 'Penalties not scored', 'Headed goals',
       'Passes', 'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',
       'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',
       'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',
       'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',
       'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',
       'Players Used'],
      dtype='object')
Index method:  RangeIndex(start=0, stop=4622, step=1)
Data types for entire dataframe: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 35 columns):
Team                          16 non-null object
Goals                         16 non-null int64
Shots on 

### Get summaries on certain columns of interest

In [64]:
chipo.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,item_price2
0,1,1,Chips and Fresh Tomato Salsa,,$2.39,2.39
1,1,1,Izze,[Clementine],$3.39,3.39
2,1,1,Nantucket Nectar,[Apple],$3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,16.98


In [67]:
# The most frequently ordered item in general
c = chipo.groupby('item_name')
c = c.sum()
c = c.sort_values(['quantity'], ascending=False)
print("The most frequently order item was: ")
print (c.head(6))

# The most frequently ordered subchoice:
c = chipo.groupby('choice_description').sum()
c = c.sort_values(['quantity'], ascending=False)
print("\nThe most frequently ordered subchoice was: ")
print (c.head(6))

# Can summarize one column this way
total_items_orders = chipo.quantity.sum()
print("\nThe sum of the quanitity column is: ", str(total_items_orders))

The most frequently order item was: 
                     order_id  quantity  item_price2
item_name                                           
Chicken Bowl           713926       761      7342.73
Chicken Burrito        497303       591      5575.82
Chips and Guacamole    449959       506      2201.04
Steak Burrito          328437       386      3851.43
Canned Soft Drink      304753       351       438.75
Chips                  208004       230       494.34

The most frequently ordered subchoice was: 
                                                    order_id  quantity  \
choice_description                                                       
[Diet Coke]                                           123455       159   
[Coke]                                                122752       143   
[Sprite]                                               80426        89   
[Fresh Tomato Salsa, [Rice, Black Beans, Cheese...     43088        49   
[Fresh Tomato Salsa, [Rice, Black Beans, Cheese...

### Lambda functions

In [43]:
# Tiny function that gets rid of '$' and turns str into a float
dollarizer = lambda x: float(x[1:-1])

print ("Item price before being dollarized: \n", str(chipo.item_price[:4]))
chipo = chipo.assign(item_price2 = chipo.item_price.apply(dollarizer))
print ("\n Item price after being dollarized: \n", str(chipo.item_price2[:4]))

Item price before being dollarized: 
 0    $2.39 
1    $3.39 
2    $3.39 
3    $2.39 
Name: item_price, dtype: object

 Item price after being dollarized: 
 0    2.39
1    3.39
2    3.39
3    2.39
Name: item_price2, dtype: float64


### Unique values in a column

In [53]:
# How many different teams are there?
euro_teams['Team'].nunique()

# same thing
print("Number of teams: ", str(euro_teams.Team.nunique()))

Number of teams:  16


### View only certain columns and rows

In [97]:
# filter only giving the column names
discipline = euro_teams[['Team', 'Yellow Cards', 'Red Cards', 'Goals']]
print("Selected columns only: ")
print(discipline)

# filter to certain rows
discipline_shortlist = discipline[3:7]
print("\nSubset of rows: ")
print(discipline_shortlist)


Selected columns only: 
                   Team  Yellow Cards  Red Cards  Goals
0               Croatia             9          0      4
1        Czech Republic             7          0      4
2               Denmark             4          0      4
3               England             5          0      5
4                France             6          0      3
5               Germany             4          0     10
6                Greece             9          1      5
7                 Italy            16          0      6
8           Netherlands             5          0      2
9                Poland             7          1      2
10             Portugal            12          0      6
11  Republic of Ireland             6          1      1
12               Russia             6          0      5
13                Spain            11          0     12
14               Sweden             7          0      5
15              Ukraine             5          0      2

Subset of rows: 
      

### Drop duplicates

### Filter and sort

In [None]:
# Sort based on multiple values (first red cards and then by yellow cards)
discipline.sort_values(['Red Cards', 'Yellow Cards'], ascending = False)

### Some miscellaneous tricks

In [99]:
# Round and take the mean
print("The rounded mean # of yellow cards is: ", str(round(discipline['Yellow Cards'].mean())))

# Sort based on multiple values (first red cards and then by yellow cards)
print('\nThe red card - yellow card - goals sorted set is:')
print(discipline.sort_values(['Red Cards', 'Yellow Cards', 'Goals'], ascending = False))

# Filter based on number of goals scored
print('\nA subset of only the teams with a minimum 7 goals: ')
print(discipline[discipline.Goals > 6])
                    

The rounded mean # of yellow cards is:  7

The red card - yellow card - goals sorted set is:
                   Team  Yellow Cards  Red Cards  Goals
6                Greece             9          1      5
9                Poland             7          1      2
11  Republic of Ireland             6          1      1
7                 Italy            16          0      6
10             Portugal            12          0      6
13                Spain            11          0     12
0               Croatia             9          0      4
14               Sweden             7          0      5
1        Czech Republic             7          0      4
12               Russia             6          0      5
4                France             6          0      3
3               England             5          0      5
8           Netherlands             5          0      2
15              Ukraine             5          0      2
5               Germany             4          0     10
2          