# Level UP

### Get up and running

In [4]:
import pandas as pd
import numpy as np
import datetime

# Student alcohol-consumption dataset: keep only the columns from "school"
# through "guardian" (label slices with .loc include both end points).
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv'
s_etoh = pd.read_csv(url).loc[:, "school":"guardian"]

# US crime-rates dataset, one row per year.
# Only the last expression of a cell is rendered, so the preview is done once, here.
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/US_Crime_Rates/US_Crime_Rates_1960_2014.csv'
crime = pd.read_csv(url)
crime.head()

Unnamed: 0,Year,Population,Total,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft
0,1960,179323175,3384200,288460,3095700,9110,17190,107840,154320,912100,1855400,328200
1,1961,182992000,3488000,289390,3198600,8740,17220,106670,156760,949600,1913000,336000
2,1962,185771000,3752200,301510,3450700,8530,17550,110860,164570,994300,2089600,366800
3,1963,188483000,4109500,316970,3792500,8640,17650,116470,174210,1086400,2297800,408300
4,1964,191141000,4564600,364220,4200400,9360,21420,130390,203050,1213200,2514400,472800


### Simple Lambda Functions

In [None]:
# A lambda is enough for a one-expression transformation of a single value.
capitalizer = lambda x: x.capitalize()

# Build a new frame from s_etoh instead of modifying it in place, so that
# s_etoh keeps its original job labels and this cell can be re-run safely.
stud_alcoh = s_etoh.assign(
    Mjob=s_etoh['Mjob'].apply(capitalizer),
    Fjob=s_etoh['Fjob'].apply(capitalizer),
)
stud_alcoh.tail()

### Slightly bigger function to create a new column based on another column

In [3]:
# Helper for building a boolean "legal drinking age" column from an age column.
def majority(age):
    """Return True when ``age`` is strictly greater than 17, otherwise False."""
    return bool(age > 17)
    
# Run majority() on every age and store the result as a new column.
s_etoh['legal_drinker'] = s_etoh['age'].apply(majority)
s_etoh.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,legal_drinker
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,True
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,False
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,False
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,False
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,False


### applymap to run function over entire dataset

In [4]:
# nonsensical but an exercise worth trying
def times10(x):
    """Multiply integer values by 10 and return anything else unchanged.

    Both plain Python ints and NumPy integer scalars count as integers.
    Booleans are passed through untouched even though ``bool`` is a
    subclass of ``int``, so True/False columns keep their values.
    """
    # bool must be ruled out explicitly because isinstance(True, int) is True.
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        return x * 10
    return x
# Element-wise application: times10 is called once per cell of the frame.
s_etoh.applymap(func=times10).head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,legal_drinker
0,GP,F,180,U,GT3,A,40,40,at_home,teacher,course,mother,True
1,GP,F,170,U,GT3,T,10,10,at_home,other,course,father,False
2,GP,F,150,U,LE3,T,10,10,at_home,other,other,mother,False
3,GP,F,150,U,GT3,T,40,20,health,services,home,mother,False
4,GP,F,160,U,GT3,T,30,30,other,other,home,father,False


# Datetime

### All the datetime codes ([date offset objects](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html))

In [18]:
# Three steps in one chain:
#   1. parse the integer Year column into datetimes (format '%Y'),
#   2. make Year the index of the dataset,
#   3. drop the Total column.
crime = (
    crime
    .assign(Year=pd.to_datetime(crime['Year'], format='%Y'))
    .set_index('Year')
    .drop(columns='Total')
)
crime.head()

Unnamed: 0_level_0,Population,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960-01-01,179323175,288460,3095700,9110,17190,107840,154320,912100,1855400,328200
1961-01-01,182992000,289390,3198600,8740,17220,106670,156760,949600,1913000,336000
1962-01-01,185771000,301510,3450700,8530,17550,110860,164570,994300,2089600,366800
1963-01-01,188483000,316970,3792500,8640,17650,116470,174210,1086400,2297800,408300
1964-01-01,191141000,364220,4200400,9360,21420,130390,203050,1213200,2514400,472800


### The amazing resample function for time series datasets!

In [20]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html
# Group the yearly rows into 10-year bins anchored at the start of a year.
# 'YS' is the year-start alias; the older 'AS' spelling is deprecated in
# recent pandas releases.
decade_freq = '10YS'
crimes = crime.resample(decade_freq).sum()

# Summing a population over a decade is not meaningful, so also record the
# largest yearly population seen in each decade.
# NOTE(review): the summed 'Population' column is still present next to the
# lowercase 'population' column added here; confirm whether it should be
# overwritten instead.
population = crime['Population'].resample(decade_freq).max()
crimes['population'] = population
crimes


Unnamed: 0_level_0,Population,Violent,Property,Murder,Forcible_Rape,Robbery,Aggravated_assault,Burglary,Larceny_Theft,Vehicle_Theft,population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1960-01-01,1915053175,4134930,45160900,106180,236720,1633510,2158520,13321100,26547700,5292100,201385000.0
1970-01-01,2121193298,9607930,91383800,192230,554570,4159020,4702120,28486000,53157800,9739900,220099000.0
1980-01-01,2371370069,14074328,117048900,206439,865639,5383109,7619130,33073494,72040253,11935411,248239000.0
1990-01-01,2612825258,17527048,119053499,211664,998827,5748930,10568963,26750015,77679366,14624418,272690813.0
2000-01-01,2947969117,13968056,100944369,163068,922499,4230366,8652124,21565176,67970291,11412834,307006550.0
2010-01-01,1570146307,6072017,44095950,72867,421059,1749809,3764142,10125170,30401698,3569080,318857056.0
2020-01-01,0,0,0,0,0,0,0,0,0,0,


### Getting the maximum number in a time series

In [22]:
# For each column, the index label (decade start) at which the column peaks.
# Judging by the result, the 1990s were a pretty dangerous time in the US.
crimes.idxmax(axis=0)

Population           2000-01-01
Violent              1990-01-01
Property             1990-01-01
Murder               1990-01-01
Forcible_Rape        1990-01-01
Robbery              1990-01-01
Aggravated_assault   1990-01-01
Burglary             1980-01-01
Larceny_Theft        1990-01-01
Vehicle_Theft        1990-01-01
population           2010-01-01
dtype: datetime64[ns]

### Downsampling (creating aggregated datasets based on year, month or whatever)

In [7]:
# see pandas exercises 06_wind stats

# Merging (need to update with append, concat, join, and merge to show differences)

In [23]:
# Appending just adds rows to the bottom;
# concat does all that but with more flexibility (rows or columns).
# Merge matches rows based on key values.
# DataFrame.append no longer exists in pandas 2.x, so the row-wise append
# below is written with pd.concat, which behaves the same way here.

# Import and clean up files
cars1 = pd.read_csv("https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv")
cars2 = pd.read_csv("https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv")
# Keep only the columns from "mpg" through "car" (both end points included).
cars1 = cars1.loc[:, "mpg":"car"]

# ADD ROWS: cars2 goes underneath cars1; each frame keeps its own index labels.
cars = pd.concat([cars1, cars2])

# Check
assert len(cars) == len(cars1) + len(cars2), "row count changed during concat"
print("cars1 number of rows: ", cars1.shape[0])
print("cars2 number of rows: ", cars2.shape[0])
print("cars number of rows: ", cars.shape[0])


cars1 number of rows:  198
cars2 number of rows:  200
cars number of rows:  398


### Create random number series and add it as a column

In [24]:
# One random owner count per row of `cars`.
# A seeded generator makes the column reproducible on Restart & Run All, and
# taking the size from the frame avoids a hard-coded row count that would
# break as soon as the data changes.
rng = np.random.default_rng(42)
nr_owners = rng.integers(15000,           # lowest number (inclusive)
                         high=73001,      # upper bound (exclusive), so the max is 73000
                         size=len(cars))  # one value per row
cars['owners'] = nr_owners
cars.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,owners
195,27.0,4,140,86,2790,15.6,82,1,ford mustang gl,28074
196,44.0,4,97,52,2130,24.6,82,2,vw pickup,71107
197,32.0,4,135,84,2295,11.6,82,1,dodge rampage,53564
198,28.0,4,120,79,2625,18.6,82,1,ford ranger,25502
199,31.0,4,119,82,2720,19.4,82,1,chevy s-10,72595


### Concat

In [25]:
# Three small example tables that share a "subject_id" key.
raw_data_1 = dict(
    subject_id=['1', '2', '3', '4', '5'],
    first_name=['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    last_name=['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
)

raw_data_2 = dict(
    subject_id=['4', '5', '6', '7', '8'],
    first_name=['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    last_name=['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
)

raw_data_3 = dict(
    subject_id=['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    test_id=[51, 15, 15, 61, 16, 14, 15, 1, 61, 16],
)

# One DataFrame per dict, with the columns in the dict's key order.
data1, data2, data3 = (
    pd.DataFrame(raw, columns=raw.keys())
    for raw in (raw_data_1, raw_data_2, raw_data_3)
)

In [27]:
# Along axis 0, concat puts data2's rows underneath data1's, much like append.
all_data = pd.concat([data1, data2], axis=0)
print("Concating by just appending rows:")
print(all_data)

# The extra flexibility: along the column axis the two frames are placed
# next to each other instead.
all_data_cols = pd.concat([data1, data2], axis='columns')
print("\nConcating by adding columns")
print(all_data_cols)



Concating by just appending rows:
  subject_id first_name last_name
0          1       Alex  Anderson
1          2        Amy  Ackerman
2          3      Allen       Ali
3          4      Alice      Aoni
4          5     Ayoung   Atiches
0          4      Billy    Bonder
1          5      Brian     Black
2          6       Bran   Balwner
3          7      Bryce     Brice
4          8      Betty    Btisan

Concating by adding columns
  subject_id first_name last_name subject_id first_name last_name
0          1       Alex  Anderson          4      Billy    Bonder
1          2        Amy  Ackerman          5      Brian     Black
2          3      Allen       Ali          6       Bran   Balwner
3          4      Alice      Aoni          7      Bryce     Brice
4          5     Ayoung   Atiches          8      Betty    Btisan


### Merge

In [28]:
# Default merge: rows are matched on equal "subject_id" values.
print('basic merge:')
print(all_data.merge(data3, on="subject_id"))

# Inner join: keep only the subject_ids that appear in both frames.
print('\nmerge only when data from both dataframes (inner):')
print(data1.merge(data2, how='inner', on="subject_id"))

# Outer join: keep every subject_id from either frame, NaN where one side is missing.
print('\nmerge everything (outer):')
print(data1.merge(data2, how='outer', on="subject_id"))

basic merge:
  subject_id first_name last_name  test_id
0          1       Alex  Anderson       51
1          2        Amy  Ackerman       15
2          3      Allen       Ali       15
3          4      Alice      Aoni       61
4          4      Billy    Bonder       61
5          5     Ayoung   Atiches       16
6          5      Brian     Black       16
7          7      Bryce     Brice       14
8          8      Betty    Btisan       15

merge only when data from both dataframes (inner):
  subject_id first_name_x last_name_x first_name_y last_name_y
0          4        Alice        Aoni        Billy      Bonder
1          5       Ayoung     Atiches        Brian       Black

merge everything (outer):
  subject_id first_name_x last_name_x first_name_y last_name_y
0          1         Alex    Anderson          NaN         NaN
1          2          Amy    Ackerman          NaN         NaN
2          3        Allen         Ali          NaN         NaN
3          4        Alice        Aoni