In [None]:
#this file computes the number of people using the metro between timestamps
#this file computes the total number of people using the metro per station per day
#this file computes the stations with the biggest difference in mean passengers on weekdays and weekends!

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime,timedelta
# Load the pre-built turnstile frame.
# NOTE(review): unpickling can execute arbitrary code; only load 'mta_data.pickle' from a trusted source.
mta_data = pd.read_pickle('mta_data.pickle')

# Replace the stored index with a fresh one (the old labels land in an 'index' column, which is
# dropped) and strip surrounding whitespace from the column names.
mta_data = mta_data.reset_index().drop('index',axis=1)
mta_data.columns = [col.strip() for col in mta_data.columns]

# drop_duplicates() returns a new frame rather than modifying in place, so the result must be
# assigned; renumber afterwards so the row labels stay unique and contiguous.
mta_data = mta_data.drop_duplicates().reset_index(drop=True)

# Derive calendar features from the DATE strings (parsed as month/day/year):
#   weekday    -> the parsed timestamp
#   weekday_n  -> day-of-week number (Monday=0 ... Sunday=6)
#   weekday_tf -> True for Monday-Friday, False for Saturday/Sunday
parsed_dates = pd.to_datetime(mta_data['DATE'], format='%m/%d/%Y')
day_numbers = parsed_dates.dt.dayofweek
mta_data['weekday'] = parsed_dates
mta_data['weekday_n'] = day_numbers
mta_data['weekday_tf'] = day_numbers.lt(5)

# Largest per-interval count accepted as real; anything above it (or below zero) is zeroed out.
# presumably these come from counter resets/glitches -- threshold carried over unchanged.
MAX_INTERVAL_COUNT = 4000

# People in/out since the previous reading: difference consecutive ENTRIES/EXITS rows within each
# (station, date, SCP) group. STATION is part of the key so that rows from different stations that
# share an SCP label are never differenced against each other.
# NOTE(review): a complete turnstile identifier may need further columns -- confirm against the data schema.
turnstile_key = ['STATION','weekday','SCP']
mta_data['peop_in'] = mta_data.groupby(turnstile_key)['ENTRIES'].diff(periods = 1)
mta_data['peop_out'] = mta_data.groupby(turnstile_key)['EXITS'].diff(periods = 1)

# The first reading of every group has no predecessor, so its diff is NaN; treat it as zero.
mta_data = mta_data.fillna(0)

# Zero out implausible values. .loc is used instead of chained indexing (df.col[mask] = ...),
# which raises SettingWithCopyWarning and does not write through under pandas Copy-on-Write.
for count_col in ('peop_in','peop_out'):
    implausible = (mta_data[count_col] < 0) | (mta_data[count_col] > MAX_INTERVAL_COUNT)
    mta_data.loc[implausible, count_col] = 0

# Keep only rows that recorded traffic in both directions, so later steps handle fewer rows.
has_traffic = (mta_data['peop_in'] != 0) & (mta_data['peop_out'] != 0)
mta_data = mta_data[has_traffic].copy()

# Daily totals per station. weekday_tf is fully determined by the date, so adding it to the key
# leaves the grouping unchanged while carrying the flag through as a boolean instead of a sum.
# numeric_only=True keeps text columns out of the aggregation (newer pandas no longer drops them
# silently: sum would concatenate strings and mean would raise).
sum_d = mta_data.groupby(['STATION','weekday','weekday_tf'],as_index=False).sum(numeric_only=True)
sum_d['through'] = sum_d.peop_in + sum_d.peop_out

# Mean daily traffic per station, split into weekday (weekend_tf=False) and weekend (True) rows.
sum_d['weekend_tf'] = ~sum_d.weekday_tf.astype(bool)
sum_d_m = sum_d.groupby(['STATION','weekend_tf'],as_index=False).mean(numeric_only=True)

# Within each station the weekday row sorts before the weekend row, so the diff on the weekend
# row is (weekend mean - weekday mean); the weekday row gets NaN.
sum_d_m['mean_dif'] = sum_d_m.groupby('STATION')['through'].diff(periods=1)

def _plot_station_diffs(stations, title):
    """Draw a bar chart of `mean_dif` per station.

    stations: frame with STATION and mean_dif columns (one row per bar).
    title: text shown above the chart.
    """
    fig, ax = plt.subplots(figsize=(10,10))
    ax.bar(x=stations.STATION, height=stations.mean_dif)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(title)
    ax.set_xlabel('Station Name')
    ax.set_ylabel('Difference in People on Weekends/Weekdays')
    plt.show()


# mean_dif is (weekend mean - weekday mean); rows without a value (the weekday rows) are removed.
station_diffs = sum_d_m.dropna()

# 10 worst stations (more traffic on weekends = touristy): these have the LARGEST mean_dif,
# so sort descending.
top10b = station_diffs.sort_values(by='mean_dif',ascending=False).head(10)
_plot_station_diffs(top10b, 'Most Touristy Stations')

# 10 best stations (more traffic on weekdays = commuter): these have the SMALLEST (most negative)
# mean_dif, so sort ascending.
top10g = station_diffs.sort_values(by='mean_dif').head(10)
_plot_station_diffs(top10g, 'Least Touristy Stations')