# Data Analysis
## Lesson 3 : NumPy and Pandas for 2D Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

### Related Questions about NYC subway and weather
> 1. Is there a correlation between subway usage and rainy days?<br>
2. How many people check the weather before the start of their day?<br>
3. What is the seasonal impact on sunrise and sunset times?<br>
4. What is the average temperature at each station during the day?<br>
5. What are the minimum and maximum temperatures at each station?<br>

Questions I thought of:
    - What variables are related to subway ridership?
        - Which stations have the most riders?
        - What are the ridership patterns over time?
        - How does the weather affect ridership?
    - What patterns can I find in the weather?
        - Is the temperature rising throughout the month?
        - How does the weather vary across the city?

In [2]:
# Read the subway/weather CSV; the path is relative, so it is resolved
# against the kernel's working directory.
df_newyork = pd.read_csv(filepath_or_buffer='./csv/nyc_subway_weather.csv')

In [3]:
# Preview only the first rows; rendering the whole frame bloats the notebook
# output without adding information.
df_newyork.head(10)

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0.0,0.0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0.0,0.0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0.0,0.0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0.0,0.0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0.0,0.0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
5,R003,05-02-11,00:00:00,4388348,2911036,15.0,34.0,2011-05-02 00:00:00,0,0,...,30.31,0,50.0,6.9,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
6,R003,05-02-11,04:00:00,5818689,3874767,19.0,40.0,2011-05-02 04:00:00,4,0,...,30.27,0,50.0,4.6,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
7,R003,05-02-11,08:00:00,4388855,2911194,488.0,118.0,2011-05-02 08:00:00,8,0,...,30.30,0,53.1,10.4,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
8,R003,05-02-11,12:00:00,4389345,2911326,490.0,132.0,2011-05-02 12:00:00,12,0,...,30.24,0,57.0,11.5,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
9,R003,05-02-11,16:00:00,4389576,2911558,231.0,232.0,2011-05-02 16:00:00,16,0,...,30.16,0,59.0,11.5,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177


### Two-Dimensional NumPy Arrays

In [11]:
import numpy as np

# Subway ridership for 5 stations (columns) on 10 different days (rows)
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# Each demo below is gated by a literal flag; set it to True to run that demo.

# Indexing: a single cell, a rectangular sub-array, and one full row
if False:
    print(ridership[1, 3])
    print(ridership[1:3, 3:5])
    print(ridership[1, :])

# Element-wise addition of two rows, then of two columns
if True:
    day_0, day_1 = ridership[0, :], ridership[1, :]
    print(day_0 + day_1)

    station_0, station_1 = ridership[:, 0], ridership[:, 1]
    print(station_0 + station_1)

# Element-wise addition of two whole 2D arrays
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print(a + b)

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [17]:
# Column index of the station with the most riders on the first day (row 0)
max_station = np.argmax(ridership[0, :])

# Average ridership per day at that station (mean down its column)
mean_for_max = np.mean(ridership[:, max_station])

# Average over every station and every day, for comparison
overall_mean = np.mean(ridership)

In [18]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station against the whole array.

    Selects the column (station) holding the largest value in the first row
    (first day) of a 2D NumPy array, then averages that column over all rows.

    Returns a tuple (overall_mean, mean_for_max): the mean of every entry in
    the array, followed by the mean of the selected station's column.

    NumPy's argmax() reference:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    first_day = ridership[0, :]
    busiest_column = first_day.argmax()  # position of the first-day maximum

    station_history = ridership[:, busiest_column]
    return (ridership.mean(), station_history.mean())

In [19]:
# Evaluate on the sample array; the displayed tuple is
# (overall mean, mean for the station with the most first-day riders).
mean_riders_for_max_station(ridership)

(2342.5999999999999, 3239.9000000000001)

### NumPy Axis
![title](./picture/numpy_axis.png)

In [40]:
# The flag below gates the axis demonstration; set it to False to skip it.

# How the `axis` argument changes what sum() collapses
if True:
    a = np.array([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

    print(np.sum(a))            # no axis: every element -> a single total
    print(np.sum(a, axis=0))    # axis=0: one total per column
    print(np.sum(a, axis=1))    # axis=1: one total per row

# Subway ridership for 5 stations (columns) on 10 different days (rows)
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Report the largest and smallest per-station average ridership.

    Averages each column (station) of the 2D array over its rows (days),
    prints those averages followed by a formatted max/min summary line, and
    returns the tuple (max_daily_ridership, min_daily_ridership).
    '''
    per_station_avg = ridership.mean(axis=0)
    print(per_station_avg)

    highest, lowest = per_station_avg.max(), per_station_avg.min()
    print('max : %8.4f, min : %8.4f' % (highest, lowest))

    return (highest, lowest)

45
[12 15 18]
[ 6 15 24]


In [41]:
# Prints the per-station means and a max/min summary, then displays the
# returned (max, min) tuple.
min_and_max_riders_per_day(ridership)

[ 1071.2  2814.9  2718.8  3239.9  1868.2]
max : 3239.9000, min : 1071.2000


(3239.9000000000001, 1071.2)

### NumPy and Pandas Data Types
- NumPy는 자료형이 변환되지만, Pandas는 고유의 자료형이 컬럼별로 유지된다.

In [43]:
# Five enrollment records for one account. Each column holds a different kind
# of value: int, str, date-like str, float with one missing entry, and bool.
enrollments_df = pd.DataFrame({
    'account_key': [448] * 5,
    'status': ['canceled'] * 5,
    'join_date': ['2014-11-10', '2014-11-05', '2015-01-27', '2014-11-10', '2015-03-10'],
    'days_to_cancel': [65, 5, 0, 0, np.nan],
    'is_udacity': [True] * 5,
})

enrollments_df

Unnamed: 0,account_key,days_to_cancel,is_udacity,join_date,status
0,448,65.0,True,2014-11-10,canceled
1,448,5.0,True,2014-11-05,canceled
2,448,0.0,True,2015-01-27,canceled
3,448,0.0,True,2014-11-10,canceled
4,448,,True,2015-03-10,canceled


In [44]:
# Column-wise mean over the numeric (int, float, bool) columns only.
# numeric_only=True is required on pandas >= 2.0, where the default raises a
# TypeError on the string columns instead of silently skipping them.
enrollments_df.mean(numeric_only=True)

account_key       448.0
days_to_cancel     17.5
is_udacity          1.0
dtype: float64

In [45]:
# Row-wise mean across the numeric (int, float, bool) columns only; missing
# values are skipped. numeric_only=True keeps the string columns out, which
# would otherwise raise a TypeError on pandas >= 2.0.
enrollments_df.mean(axis=1, numeric_only=True)

0    171.333333
1    151.333333
2    149.666667
3    149.666667
4    224.500000
dtype: float64

### Accessing Elements of a DataFrame

In [58]:
# Subway ridership for 5 stations on 10 different days:
# rows are labelled by date, columns by station id.
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Change False to True for each block of code to see what it does

# DataFrame creation
if False:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)


# Accessing elements
if False:
    print(ridership_df.iloc[0])            # first row, selected by position
    print(ridership_df.loc['05-05-11'])    # one row, selected by index label
    print(ridership_df['R003'])            # one column, selected by name
    print(ridership_df.iloc[1, 3])         # single cell, selected by position

# Accessing multiple rows
if False:
    print(ridership_df.iloc[1:4])

# Accessing multiple columns
if False:
    print(ridership_df[['R003', 'R005']])

# Pandas axis
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df.sum())             # one total per column
    print(df.sum(axis=1))       # one total per row
    print(df.values.sum())      # total of every value in the frame

# Station with the most riders on the first day. idxmax() yields the column
# *label*; argmax() yields an integer position on current pandas, which is
# not a valid key for the label-based column lookup below.
max_station = ridership_df.loc['05-01-11'].idxmax()

# Mean daily ridership at that station
mean_for_max = ridership_df[max_station].mean()

# Mean over every station and every day, for comparison
overall_mean = ridership_df.values.mean()

2342.5999999999999

In [64]:
def mean_riders_for_max_station(ridership):
    '''
    Find the station with the maximum riders on the first day, then return
    the mean riders per day for that station, together with the mean
    ridership overall for comparison.

    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.

    ridership: DataFrame with one row per day and one column per station.

    Returns a tuple (overall_mean, mean_for_max).
    '''
    # idxmax() returns the column *label* of the first-row maximum. argmax()
    # returns an integer position on current pandas, which fails as a key
    # for the label-based column lookup below.
    max_station = ridership.iloc[0].idxmax()

    overall_mean = ridership.values.mean()
    mean_for_max = ridership[max_station].mean()

    return (overall_mean, mean_for_max)

In [65]:
# Evaluate on the DataFrame; the displayed tuple is
# (overall mean, mean for the station with the most first-day riders).
mean_riders_for_max_station(ridership_df)

(2342.5999999999999, 3239.9)