In [None]:
# Two-Dimensional NumPy Arrays

In [1]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [2]:
# Accessing elements of a 2D array with [row, column] indexing.
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(ridership[1, 3])        # one element: row 1, column 3
print(ridership[1:3, 3:5])    # sub-array: rows 1-2, columns 3-4
print(ridership[1, :])        # all of row 1
    


2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


In [4]:
# Vectorized operations on rows or columns: the selected 1D slices are
# added element by element.
print(ridership[0, :] + ridership[1, :])    # row 0 + row 1
print(ridership[:, 0] + ridership[:, 1])    # column 0 + column 1

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [5]:
# Vectorized operations on entire arrays: two arrays of the same shape are
# added element by element.
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print(a + b)

[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [7]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station with the overall average.

    `ridership` is a 2D NumPy array. The first row is searched for its
    largest value (NumPy's argmax()), and the column holding that value is
    treated as the chosen station.

    Returns the tuple (overall_mean, mean_for_max):
        overall_mean -- mean of every value in the array
        mean_for_max -- mean of the chosen station's column
    '''
    first_day = ridership[0, :]
    busiest_column = first_day.argmax()

    # Every row of the busiest station's column
    station_riders = ridership[:, busiest_column]

    # With no axis argument, mean() averages over the whole 2D array
    return (ridership.mean(), station_riders.mean())

In [8]:
mean_riders_for_max_station(ridership)
# Note: called without an axis argument, mean() on a 2D NumPy array averages over every element

(2342.5999999999999, 3239.9000000000001)

In [9]:
# NumPy Axis

In [13]:
import numpy as np

# NumPy axis argument
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

print(a.sum())          # no axis: sum of every element
print(a.sum(axis=0))    # axis=0: one sum per column
print(a.sum(axis=1))    # axis=1: one sum per row
    



45
[12 15 18]
[ 6 15 24]


In [23]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])


In [38]:
def min_and_max_riders_per_day(ridership):
    '''
    Return the highest and the lowest per-station average ridership.

    The mean is taken down each column of `ridership` (axis=0), which gives
    one mean-ridership-per-day value per station. The largest and smallest
    of those values are returned as the tuple
    (max_daily_ridership, min_daily_ridership).
    '''
    station_means = ridership.mean(axis=0)
    return (station_means.max(), station_means.min())

In [39]:
min_and_max_riders_per_day(ridership)

(3239.9000000000001, 1071.2)

In [42]:
# NumPy and Pandas Data Types
'''
Advantages of a DataFrame over a 2D NumPy array:
    - each column of a DataFrame can have its own data type
    - a DataFrame has labelled indexes
'''

import pandas as pd
import numpy as np
enrollments_df = pd.DataFrame({
    'account_key' : [488,488,488,488,488],
    'status': ['canceld','canceld','canceld','canceld','canceld'],
    'join_date' : ['2014-01-05','2014-01-05','2014-01-05','2014-01-05','2014-01-05'],
    'days_to_cancel': [65,0,0,6,np.nan]
})

enrollments_df

Unnamed: 0,account_key,days_to_cancel,join_date,status
0,488,65.0,2014-01-05,canceld
1,488,0.0,2014-01-05,canceld
2,488,0.0,2014-01-05,canceld
3,488,6.0,2014-01-05,canceld
4,488,,2014-01-05,canceld


In [43]:
# Mean of each numeric column. The DataFrame created above is named
# enrollments_df; numeric_only=True skips the string columns explicitly.
# (A row-wise mean, axis=1, is not useful here because the columns hold
# different kinds of data.)
enrollments_df.mean(numeric_only=True)

account_key       488.00
days_to_cancel     17.75
dtype: float64

In [44]:
# Accessing Elements of a DataFrame

In [53]:
# DataFrame creation
# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)

# Two blank lines between the outputs
print('')
print('')

# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)

   A  B
0  0  3
1  1  4
2  2  5


   A  B  C
0  0  1  2
1  3  4  5


In [54]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
ridership_df

# Change False to True for each block of code to see what it does

Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [55]:
# Accessing elements of a DataFrame
print('accessing throug index position')
print(ridership_df.iloc[0])             # row at integer position 0
print('')
print('accessing throug index')
print(ridership_df.loc['05-05-11'])     # row with the index label '05-05-11'
print('')
print('accesssing using column name')
print(ridership_df['R003'])             # column named 'R003'
print('')
print('accessing using 2 indexes postions')
print(ridership_df.iloc[1, 3])          # single value: row position 1, column position 3

accessing throug index position
R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64

accessing throug index
R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64

accesssing using column name
05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64

accessing using 2 indexes postions
2328


In [60]:
    
# Accessing multiple rows by position: the upper bound of an iloc slice is
# excluded, so iloc[1:4] selects positions 1, 2 and 3.
print(ridership_df.iloc[1:4])

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613


In [62]:
    
# Accessing multiple columns: pass a list of column names
print(ridership_df[['R003', 'R005']])

          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


In [70]:
# Pandas axis
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df)
print('')
print(df.sum())             # default axis: one sum per column
print('')
print(df.sum(axis=1))       # axis=1: one sum per row
print('')
print(df.values.sum())      # via the underlying NumPy array: sum of every value

   A  B
0  0  3
1  1  4
2  2  5

A     3
B    12
dtype: int64

0    3
1    5
2    7
dtype: int64

15


In [101]:
    
def mean_riders_for_max_station(ridership_df):
    '''
    Find the station with the maximum riders on the first day, then compare
    its mean riders per day with the overall mean ridership.

    This is the same as a previous exercise, but this time the input is a
    Pandas DataFrame rather than a 2D NumPy array.

    Returns the tuple (max_station, mean_for_max, overall_mean):
        max_station  -- column label holding the largest value in the first row
        mean_for_max -- mean of that column
        overall_mean -- mean of every value in the DataFrame
    '''
    # idxmax() returns the column *label* of the maximum. Series.argmax()
    # only did that in old pandas releases; it now returns an integer
    # position, which cannot be used to look the column up by name below.
    max_station = ridership_df.iloc[0].idxmax()
    mean_for_max = ridership_df[max_station].mean()

    # ridership_df.mean() would give a Series with one mean per column, so
    # go through the underlying NumPy array to average every value at once.
    overall_mean = ridership_df.values.mean()

    return (max_station, mean_for_max, overall_mean)

In [102]:
mean_riders_for_max_station(ridership_df)

('R006', 3239.9, 2342.5999999999999)

In [103]:
# Loading Data into a DataFrame

In [104]:
import pandas as pd
import os

# Directory that holds the lesson data. Set the SUBWAY_DATA_DIR environment
# variable to point somewhere else; the original location stays the default.
path = os.environ.get(
    'SUBWAY_DATA_DIR',
    '/home/matar/GitHub/Online_Courses/Udacity-ND/P02_Investigate_A_Dataset/Lessons/L03_NumPy_And_Pandas_For_2D_Data/'
)
# os.path.join() works whether or not `path` ends with a separator
subway_df = pd.read_csv(os.path.join(path, 'nyc_subway_weather.csv'))


In [105]:
subway_df.head()

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0.0,0.0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0.0,0.0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0.0,0.0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.0,30.258,55.98,7.86,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0.0,0.0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.0,30.258,55.98,7.86,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0.0,0.0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.0,30.258,55.98,7.86,40.700348,-73.887177


In [106]:
subway_df.describe()

Unnamed: 0,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,hour,day_week,weekday,latitude,longitude,fog,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
count,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,...,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0
mean,28124860.0,19869930.0,1886.589955,1361.487866,10.046754,2.905719,0.714436,40.724647,-73.940364,0.009824,...,29.971096,0.224741,63.10378,6.927872,0.004618,29.971096,63.10378,6.927872,40.728555,-73.938693
std,30436070.0,20289860.0,2952.385585,2183.845409,6.938928,2.079231,0.451688,0.07165,0.059713,0.098631,...,0.137942,0.417417,8.455597,4.510178,0.016344,0.131158,6.939011,3.179832,0.06542,0.059582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.576152,-74.073622,0.0,...,29.55,0.0,46.9,0.0,0.0,29.59,49.4,0.0,40.600204,-74.01487
25%,10397620.0,7613712.0,274.0,237.0,4.0,1.0,0.0,40.677107,-73.987342,0.0,...,29.89,0.0,57.0,4.6,0.0,29.913333,58.283333,4.816667,40.688591,-73.98513
50%,18183890.0,13316090.0,905.0,664.0,12.0,3.0,1.0,40.717241,-73.953459,0.0,...,29.96,0.0,61.0,6.9,0.0,29.958,60.95,6.166667,40.72057,-73.94915
75%,32630490.0,23937710.0,2255.0,1537.0,16.0,5.0,1.0,40.759123,-73.907733,0.0,...,30.06,0.0,69.1,9.2,0.0,30.06,67.466667,8.85,40.755226,-73.912033
max,235774600.0,149378200.0,32814.0,34828.0,20.0,6.0,1.0,40.889185,-73.755383,1.0,...,30.32,1.0,86.0,23.0,0.1575,30.293333,79.8,17.083333,40.862064,-73.694176


In [107]:
# Calculating Correlation
'''Understanding and Interpreting Correlations
This page contains some scatterplots of variables with different values of correlation.
This page lets you use a slider to change the correlation and see how the data might look.
Pearson's r only measures linear correlation! This image shows some different linear and non-linear relationships and what Pearson's r will be for those relationships.
Corrected vs. Uncorrected Standard Deviation
By default, Pandas' std() function computes the standard deviation using Bessel's correction. Calling std(ddof=0) ensures that Bessel's correction will not be used.

Previous Exercise
The exercise where you used a simple heuristic to estimate correlation was the "Pandas Series" exercise in Lesson 2.

Pearson's r in NumPy
NumPy's corrcoef() function can be used to calculate Pearson's r, also known as the correlation coefficient.'''

import pandas as pd
import os

# Directory that holds the lesson data. Set the SUBWAY_DATA_DIR environment
# variable to point somewhere else; the original location stays the default.
path = os.environ.get(
    'SUBWAY_DATA_DIR',
    '/home/matar/GitHub/Online_Courses/Udacity-ND/P02_Investigate_A_Dataset/Lessons/L03_NumPy_And_Pandas_For_2D_Data/'
)
# os.path.join() works whether or not `path` ends with a separator
subway_df = pd.read_csv(os.path.join(path, 'nyc_subway_weather.csv'))

def correlation(x, y):
    '''
    Compute the correlation (Pearson's r) between the two input variables.
    Each input is either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    The standard deviation is always taken with ddof=0, i.e. without
    Bessel's correction.
    '''
    # Distance of every value from its own mean
    x_deviation = x - x.mean()
    y_deviation = y - y.mean()

    # Convert to standard units using the uncorrected standard deviation
    x_standard_units = x_deviation / x.std(ddof=0)
    y_standard_units = y_deviation / y.std(ddof=0)

    # Average of the element-wise products
    return (x_standard_units * y_standard_units).mean()

# Columns of the subway data to compare
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))

print(correlation(entries, cum_entries))

0.0356485157722
-0.0266933483216
-0.229034323408
0.585895470766


In [108]:
# Pandas Axis Names
'''
axis = 0 or axis = 1

The axis can also be given by name:
axis = 'index'   (same as axis = 0) ==> top to bottom
axis = 'columns' (same as axis = 1) ==> left to right

'''

In [111]:
#DataFrame Vectorized Operations
import pandas as pd

# Examples of vectorized operations on DataFrames.
# Values are matched by row index and column name, not by position; a
# label present in only one of the two DataFrames produces NaN.

# Adding DataFrames with the same column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print(df1 + df2)
print('')

# Adding DataFrames with overlapping column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print(df1 + df2)
print('')

# Adding DataFrames with overlapping row indexes
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row4', 'row3', 'row2'])
print(df1 + df2)

    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99

    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN

         a     b     c
row1   NaN   NaN   NaN
row2  32.0  65.0  98.0
row3  23.0  56.0  89.0
row4   NaN   NaN   NaN


In [116]:
# Cumulative entries and exits for one station for a few hours.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

print(entries_and_exits)
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Turn a DataFrame of cumulative entries and exits into a DataFrame of
    per-row differences (hourly entries and exits).

    Each row of the result is the input row minus the row directly above
    it. The first row has no predecessor, so it is NaN. The columns keep
    their original order.
    '''
    # Same data moved down by one row, so row i holds the reading of row i-1
    previous_readings = entries_and_exits.shift(1)
    return entries_and_exits - previous_readings

   ENTRIESn   EXITSn
0   3144312  1088151
1   3144335  1088159
2   3144353  1088177
3   3144424  1088231
4   3144594  1088275
5   3144808  1088317
6   3144895  1088328
7   3144905  1088331
8   3144941  1088420
9   3145094  1088753


In [115]:
get_hourly_entries_and_exits(entries_and_exits)

Unnamed: 0,ENTRIESn,EXITSn
0,,
1,23.0,8.0
2,18.0,18.0
3,71.0,54.0
4,170.0,44.0
5,214.0,42.0
6,87.0,11.0
7,10.0,3.0
8,36.0,89.0
9,153.0,333.0


In [117]:
# DataFrame applymap() ==> to perform non-built in functions
import pandas as pd

# DataFrame applymap(): call a function on every single element
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})

def add_one(x):
    '''Return x increased by one.'''
    return x + 1

# applymap() was renamed to DataFrame.map() in pandas 2.1 and the old name
# is deprecated, so use whichever method this pandas version provides.
if hasattr(pd.DataFrame, 'map'):
    print(df.map(add_one))
else:
    print(df.applymap(add_one))

   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16


In [125]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)


# single grade
def convert_grade(grade):
    '''
    Convert a single numerical grade to a letter grade.

    The first cutoff that the grade reaches decides the letter:
    90 -> 'A', 80 -> 'B', 70 -> 'C', 60 -> 'D'; anything lower is 'F'.
    '''
    # Cutoffs are checked from highest to lowest
    for cutoff, letter in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):
        if grade >= cutoff:
            return letter
    return 'F'
        
def convert_grades(grades):
    '''
    Convert the given DataFrame of numerical grades to letter grades.
    Return a new DataFrame with the converted grades.

    Every cell is passed through convert_grade(), so the conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    # DataFrame.applymap() was renamed to DataFrame.map() in pandas 2.1 and
    # the old name is deprecated. The check is made on the class so that a
    # column that happens to be called 'map' cannot be mistaken for it.
    if hasattr(type(grades), 'map'):
        return grades.map(convert_grade)
    return grades.applymap(convert_grade)

In [126]:
# Show the numerical grades, then display the converted letter grades
print(grades_df)
convert_grades(grades_df)

         exam1  exam2
Andre       43     24
Barry       81     63
Chris       78     56
Dan         75     56
Emilio      89     67
Fred        70     51
Greta       91     79
Humbert     65     46
Ivan        98     72
James       87     60


Unnamed: 0,exam1,exam2
Andre,F,F
Barry,B,D
Chris,C,F
Dan,C,F
Emilio,B,D
Fred,C,F
Greta,A,C
Humbert,D,F
Ivan,A,C
James,B,D


In [132]:
# DataFrame apply() - Use case 1: the function takes one column and returns one column (apply() takes a df and returns a new, modified df)

import pandas as pd

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

# DataFrame apply()
def convert_grades_curve(exam_grades):
    '''Grade one exam on a curve, returning a letter for every student.'''
    # Pandas has a built-in function that will perform this calculation
    # This will give the bottom 0% to 10% of students the grade 'F',
    # 10% to 20% the grade 'D', and so on. You can read more about
    # the qcut() function here:
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    return pd.qcut(exam_grades,
                   [0, 0.1, 0.2, 0.5, 0.8, 1],
                   labels=['F', 'D', 'C', 'B', 'A'])

# qcut() operates on a list, array, or Series. This is the
# result of running the function on a single column of the
# DataFrame.
print(grades_df)
print(convert_grades_curve(grades_df['exam1']))

# qcut() does not work on DataFrames, but we can use apply()
# to call the function on each column separately
print(grades_df.apply(convert_grades_curve))

         exam1  exam2
Andre       43     24
Barry       81     63
Chris       78     56
Dan         75     56
Emilio      89     67
Fred        70     51
Greta       91     79
Humbert     65     46
Ivan        98     72
James       87     60
Andre      F
Barry      B
Chris      C
Dan        C
Emilio     B
Fred       C
Greta      A
Humbert    D
Ivan       A
James      B
Name: exam1, dtype: category
Categories (5, object): [F < D < C < B < A]
        exam1 exam2
Andre       F     F
Barry       B     B
Chris       C     C
Dan         C     C
Emilio      B     B
Fred        C     C
Greta       A     A
Humbert     D     D
Ivan        A     A
James       B     B


In [133]:
def standardize_colum(column):
    '''
    Standardize one column: express each value as its distance from the
    column mean, measured in units of column.std().
    '''
    centered = column - column.mean()
    return centered / column.std()

def standardize(df):
    '''
    Standardize each column of the given DataFrame and return the result
    as a new DataFrame.

    To standardize a variable, each value is converted to the number of
    standard deviations it is above or below the mean. apply() hands the
    columns to standardize_colum() one at a time.
    '''
    standardized_df = df.apply(standardize_colum)
    return standardized_df

In [134]:
standardize(grades_df)

Unnamed: 0,exam1,exam2
Andre,-2.196525,-2.186335
Barry,0.208891,0.366571
Chris,0.01899,-0.091643
Dan,-0.170911,-0.091643
Emilio,0.715295,0.628408
Fred,-0.487413,-0.418938
Greta,0.841896,1.413917
Humbert,-0.803916,-0.746234
Ivan,1.284999,0.955703
James,0.588694,0.170194


In [138]:
# DataFrame apply() - Use case 2: the function takes one column and returns a single value (apply() then returns a Series in which each column of the df is reduced to one value)
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# DataFrame apply() - use case 2: each column is reduced to a single value,
# so the result is a Series indexed by column name
print(df.apply(np.mean))
print(df.apply(np.max))

a     3.0
b    30.0
c    15.0
dtype: float64
a     5
b    50
c    25
dtype: int64


In [139]:
def second_largest_column(column):
    '''
    Return the second-largest value of a single column: the column is
    sorted in descending order and the value at position 1 is taken.
    '''
    return column.sort_values(ascending=False).iloc[1]

def second_largest(df):
    '''
    Return the second-largest value of each column of the input DataFrame.

    apply() hands the columns to second_largest_column() one at a time and
    collects the single value returned for each of them.
    '''
    runner_up_per_column = df.apply(second_largest_column)
    return runner_up_per_column

In [140]:
second_largest(df)

a     4
b    40
c    20
dtype: int64

In [143]:
# adding a DataFrame to a Series
import pandas as pd

# Change False to True for each block of code to see what it does


In [147]:

# Adding a Series to a square DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
# The Series index is matched to the column names, so s[i] is added to
# every value of column i
print(df + s)
    


    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164


In [148]:
# Adding a Series to a one-row DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})

print(df)
print('')  # Create a blank line between outputs
# Each Series value is added to the column whose name equals its index label
print(df + s)



    0   1   2   3
0  10  20  30  40

    0   1   2   3
0  11  22  33  44


In [149]:
# Adding a Series to a one-column DataFrame
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})

print(df)
print('')  # Create a blank line between outputs
# Only index label 0 of the Series has a matching column; labels 1-3
# become new columns filled with NaN
print(df + s)
    

    


    0
0  10
1  20
2  30
3  40

    0   1   2   3
0  11 NaN NaN NaN
1  21 NaN NaN NaN
2  31 NaN NaN NaN
3  41 NaN NaN NaN


In [150]:
# Adding when DataFrame column names match Series index
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
print(df + s)
    


    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    a   b    c    d
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164


In [151]:
# Adding when DataFrame column names don't match Series index
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
# No Series label equals a column name, so every value of the result is NaN
print(df + s)

    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1   2   3   a   b   c   d
0 NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN


In [152]:
# Standardizing Each Column Again --- using vectorize
import pandas as pd

In [153]:
# Adding using +
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
print(df + s)
    

    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164


In [154]:
# Adding with axis='index'
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
# axis='index' matches the Series against the row index, so s[i] is added
# to every value of row i
print(df.add(s, axis='index'))
# The functions sub(), mul(), and div() work similarly to add()

    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  51   91  131
1  22  62  102  142
2  33  73  113  153
3  44  84  124  164


In [156]:
# Adding with axis='columns'
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(df)
print('')  # Create a blank line between outputs
# axis='columns' matches the Series against the column names, which gives
# the same result as df + s
print(df.add(s, axis='columns'))
# The functions sub(), mul(), and div() work similarly to add()
    


    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160

    0   1    2    3
0  11  52   93  134
1  21  62  103  144
2  31  72  113  154
3  41  82  123  164


In [157]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

In [None]:
def standardize(df):
    '''
    Standardize each column of the given DataFrame and return the result
    as a new DataFrame. To standardize a variable, convert each value to
    the number of standard deviations it is above or below the mean.

    This version uses vectorized operations instead of apply(). It uses
    the default std(), like the apply()-based version, so both give the
    same results.
    '''
    # df.mean() and df.std() are Series indexed by column name. Arithmetic
    # between a DataFrame and a Series matches the Series index against the
    # column names, so every column is shifted and scaled by its own values.
    column_means = df.mean()
    column_stds = df.std()
    return (df - column_means) / column_stds

def standardize_rows(df):
    '''
    Standardize each row of the given DataFrame and return the result as a
    new DataFrame: each value becomes the number of standard deviations it
    is above or below the mean of its own row.

    Uses vectorized operations only (no apply()).
    '''
    # One mean and one standard deviation per row, indexed like the rows
    row_means = df.mean(axis='columns')
    row_stds = df.std(axis='columns')

    # A plain `df - row_means` would match the Series against the column
    # names. axis='index' matches it against the row index instead, so each
    # row is shifted and scaled by its own statistics.
    centered = df.sub(row_means, axis='index')
    return centered.div(row_stds, axis='index')