# Level UP

### Get up and running

In [17]:
import pandas as pd
import numpy as np

# Student drinking dataset
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv'
s_etoh = pd.read_csv(url, sep = ',').loc[: , "school":"guardian"]
s_etoh.head()

# crime dataset
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/US_Crime_Rates/US_Crime_Rates_1960_2014.csv'
crime = pd.read_csv(url, sep = ',')
crime.head()

Unnamed: 0,Year,Population,Total,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft
0,1960,179323175,3384200,288460,3095700,9110,17190,107840,154320,912100,1855400,328200
1,1961,182992000,3488000,289390,3198600,8740,17220,106670,156760,949600,1913000,336000
2,1962,185771000,3752200,301510,3450700,8530,17550,110860,164570,994300,2089600,366800
3,1963,188483000,4109500,316970,3792500,8640,17650,116470,174210,1086400,2297800,408300
4,1964,191141000,4564600,364220,4200400,9360,21420,130390,203050,1213200,2514400,472800


### Simple Lambda Functions

In [None]:
capitalizer = lambda x: x.capitalize()

stud_alcoh['Mjob'] = stud_alcoh['Mjob'].apply(capitalizer)
stud_alcoh['Fjob'] = stud_alcoh['Fjob'].apply(capitalizer)
stud_alcoh.tail()

### Slightly bigger function to create a new column based on another column

In [3]:
# Function to help create a new column to say whether or not legal drinking age
def majority(age):
    if age > 17:
        return True
    else:
        return False
    
s_etoh['legal_drinker'] = s_etoh.age.apply(majority)
s_etoh.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,legal_drinker
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,True
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,False
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,False
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,False
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,False


### applymap to run function over entire dataset

In [4]:
# nonsensical but an exercise worth trying
def times10(x):
    if type(x) is int:
        return x * 10
    else: 
        return x
s_etoh.applymap(times10).head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,legal_drinker
0,GP,F,180,U,GT3,A,40,40,at_home,teacher,course,mother,True
1,GP,F,170,U,GT3,T,10,10,at_home,other,course,father,False
2,GP,F,150,U,LE3,T,10,10,at_home,other,other,mother,False
3,GP,F,150,U,GT3,T,40,20,health,services,home,mother,False
4,GP,F,160,U,GT3,T,30,30,other,other,home,father,False


### datetime function

In [18]:
# converting Year from integer to datetime format andt
crime.Year = pd.to_datetime(crime.Year, format = '%Y' )

# setting it as the index of the datase
crime = crime.set_index('Year', drop = True)

# deleting total column
del crime['Total']
crime.head()

Unnamed: 0_level_0,Population,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960-01-01,179323175,288460,3095700,9110,17190,107840,154320,912100,1855400,328200
1961-01-01,182992000,289390,3198600,8740,17220,106670,156760,949600,1913000,336000
1962-01-01,185771000,301510,3450700,8530,17550,110860,164570,994300,2089600,366800
1963-01-01,188483000,316970,3792500,8640,17650,116470,174210,1086400,2297800,408300
1964-01-01,191141000,364220,4200400,9360,21420,130390,203050,1213200,2514400,472800


### The amazing resample function for time series datasets!

In [20]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html
crimes = crime.resample('10AS').sum()

# take just the maximum population value for any given year instead of summing it up
population = crime['Population'].resample('10AS').max()
crimes['population'] = population
crimes


Unnamed: 0_level_0,Population,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft,population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1960-01-01,1915053175,4134930,45160900,106180,236720,1633510,2158520,13321100,26547700,5292100,201385000.0
1970-01-01,2121193298,9607930,91383800,192230,554570,4159020,4702120,28486000,53157800,9739900,220099000.0
1980-01-01,2371370069,14074328,117048900,206439,865639,5383109,7619130,33073494,72040253,11935411,248239000.0
1990-01-01,2612825258,17527048,119053499,211664,998827,5748930,10568963,26750015,77679366,14624418,272690813.0
2000-01-01,2947969117,13968056,100944369,163068,922499,4230366,8652124,21565176,67970291,11412834,307006550.0
2010-01-01,1570146307,6072017,44095950,72867,421059,1749809,3764142,10125170,30401698,3569080,318857056.0
2020-01-01,0,0,0,0,0,0,0,0,0,0,


### Getting the maximum number in a time series

In [22]:
# apparently the 90s was a pretty dangerous time in the US
crimes.idxmax(0)

Population           2000-01-01
Violent              1990-01-01
Property             1990-01-01
Murder               1990-01-01
Forcible_Rape        1990-01-01
Robbery              1990-01-01
Aggravated_assault   1990-01-01
Burglary             1980-01-01
Larceny_Theft        1990-01-01
Vehicle_Theft        1990-01-01
population           2010-01-01
dtype: datetime64[ns]